# Imports
import os
import sys
from datetime import datetime

import findspark
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession, functions

findspark.init() # findspark is supposed to locate the local Spark installation automatically, but it did not work well in my testing
# Set the Spark environment variable manually
os.environ['SPARK_HOME'] = "/Library/Hadoop/spark-3.0.0-bin-hadoop3.2"
# Append pyspark (py4j) to the Python path
sys.path.append("/Library/Hadoop/spark-3.0.0-bin-hadoop3.2/python/lib/py4j-0.10.9-src.zip")
spark = SparkSession.builder.appName('text').getOrCreate() # the SparkSession entry point mentioned earlier
sp = [
    {'a': 1, 'b': 'b', 'c': 1.1}, {'a': 1, 'b': 'b', 'c': 1.1}
]
df = spark.createDataFrame(sp) # create a Spark DataFrame from a list of dicts
# df = pd.DataFrame(sp) # create a pandas DataFrame instead
# print(df)
# Create a pandas DataFrame, then convert it to a Spark DataFrame
xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
'c': ['java', 2, 3, 4, '5', '6', str(datetime.now())]}
#
xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
spark.createDataFrame(xin).show() # convert the pandas DataFrame to a Spark DataFrame
df.createOrReplaceTempView('table1') # register the DataFrame as a temporary view
spark.sql('select * from table1').show() # run SQL against it and print the result
- filter: filters rows by a condition, similar to SQL's WHERE
xin = {'a': ['ai', 0, 5, 2, '5', '6', datetime.now()],
'b': ['ai', 3, 3, 4, '5', '6', datetime.now()],
'c': ['ai', np.NAN, 4, 4, '5', '6', datetime.now()]}
#
xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(xin)
df.filter(df.d1>2).show()
>>>
+---+---+---+---+---+---+--------------------+
| a1| b1| c1| d1| e1| f1| time|
+---+---+---+---+---+---+--------------------+
| ai|3.0| 3| 4| 5| 6|2022-08-05 19:00:...|
| ai|NaN| 4| 4| 5| 6|2022-08-05 19:00:...|
+---+---+---+---+---+---+--------------------+
- collect: returns the rows as a list; strongly discouraged, since it pulls the entire dataset onto the driver
xin = {'a': ['ai', 0, 5, 2, '5', '6', datetime.now()],
'b': ['ai', 3, 3, 4, '5', '6', datetime.now()],
'c': ['ai', np.NAN, 4, 4, '5', '6', datetime.now()]}
#
xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(xin)
for i in df.collect():
print(i)
print(list(i))
>>>
Row(a1='ai', b1=0.0, c1=5, d1=2, e1='5', f1='6', time=datetime.datetime(2022, 8, 5, 19, 2, 44, 680251))
['ai', 0.0, 5, 2, '5', '6', datetime.datetime(2022, 8, 5, 19, 2, 44, 680251)]
Row(a1='ai', b1=3.0, c1=3, d1=4, e1='5', f1='6', time=datetime.datetime(2022, 8, 5, 19, 2, 44, 680251))
['ai', 3.0, 3, 4, '5', '6', datetime.datetime(2022, 8, 5, 19, 2, 44, 680251)]
Row(a1='ai', b1=nan, c1=4, d1=4, e1='5', f1='6', time=datetime.datetime(2022, 8, 5, 19, 2, 44, 680251))
['ai', nan, 4, 4, '5', '6', datetime.datetime(2022, 8, 5, 19, 2, 44, 680251)]
- toLocalIterator: returns an iterator over the rows; the contents are the same as collect(), but at most one RDD partition is held in memory at a time, so it is the recommended choice.
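A minimal sketch of toLocalIterator on the same kind of sample DataFrame used throughout this section (the data here is just for illustration):
xin = {'a': ['ai', 0, 5, 2, '5', '6', datetime.now()],
       'b': ['ai', 3, 3, 4, '5', '6', datetime.now()]}
xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
df = spark.createDataFrame(xin)
for row in df.toLocalIterator():  # pulls one partition at a time instead of the whole dataset
    print(row)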
- count: returns the number of rows
xin = {'a': ['ai', 0, 5, 2, '5', '6', datetime.now()],
'b': ['ai', 3, 3, 4, '5', '6', datetime.now()],
'c': ['ai', np.NAN, 4, 4, '5', '6', datetime.now()]}
#
xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(xin)
print(df.count())
>>>
3
- columns: returns the column names as a list
xin = {'a': ['ai', 0, 5, 2, '5', '6', datetime.now()],
'b': ['ai', 3, 3, 4, '5', '6', datetime.now()],
'c': ['ai', np.NAN, 4, 4, '5', '6', datetime.now()]}
#
xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(xin)
print(df.columns)
>>>
['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time']
- dtypes: returns the column names and their types as a list of tuples
xin = {'a': ['ai', 0, 5, 2, '5', '6', datetime.now()],
'b': ['ai', 3, 3, 4, '5', '6', datetime.now()],
'c': ['ai', np.NAN, 4, 4, '5', '6', datetime.now()]}
#
xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(xin)
print(df.dtypes)
>>>
[('a1', 'string'), ('b1', 'double'), ('c1', 'bigint'), ('d1', 'bigint'), ('e1', 'string'), ('f1', 'string'), ('time', 'timestamp')]
- describe: returns basic statistics for the columns
xin = {'a': ['ai', 0, 5, 2, '5', '6', datetime.now()],
'b': ['ai', 3, 3, 4, '5', '6', datetime.now()],
'c': ['ai', np.NAN, 4, 4, '5', '6', datetime.now()]}
#
xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(xin)
df.describe().show()
>>>
+-------+----+---+---+------------------+---+---+
|summary| a1| b1| c1| d1| e1| f1|
+-------+----+---+---+------------------+---+---+
| count| 3| 3| 3| 3| 3| 3|
| mean|null|NaN|4.0|3.3333333333333335|5.0|6.0|
| stddev|null|NaN|1.0|1.1547005383792515|0.0|0.0|
| min| ai|0.0| 3| 2| 5| 6|
| max| ai|NaN| 5| 4| 5| 6|
+-------+----+---+---+------------------+---+---+
df = spark.createDataFrame(xin)
df.describe(['a1']).show()
>>>
+-------+----+
|summary| a1|
+-------+----+
| count| 3|
| mean|null|
| stddev|null|
| min| ai|
| max| ai|
+-------+----+
- select: selects the specified columns
- sort: returns the rows in the specified order (a short sort sketch follows the select example below)
xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
'c': ['java', 2, 3, 4, '5', '6', str(datetime.now())]}
#
xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
spark = SparkSession.builder.appName('text').getOrCreate()
df = spark.createDataFrame(xin)
df.select('b1').show()
>>>
+---+
| b1|
+---+
| 0|
| 3|
| 2|
+---+
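And the sort sketch promised above, reusing the df from this select example (sort accepts column names or Column expressions; ascending can be a single flag or a list):
df.sort(['b1', 'c1'], ascending=[False, True]).show()  # b1 descending, then c1 ascending
df.sort(df.b1.desc()).show()                           # equivalent Column-expression form for b1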
- first(): returns the first row as a Row object
- head(n): returns the first n rows
- freqItems: returns the frequent values of the specified columns
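A quick sketch of these three, reusing the df built in the previous example:
print(df.first())            # a single Row object
print(df.head(2))            # a list with the first 2 Row objects
df.freqItems(['a1']).show()  # approximate frequent values of column a1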
- summary: like describe, but additionally shows fixed percentiles (25%, 50%, 75%)
xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
'c': ['java', 2, 3, 4, '5', '6', str(datetime.now())]}
#
xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
spark = SparkSession.builder.appName('text').getOrCreate()
df = spark.createDataFrame(xin)
df.summary().show()
>>>
+-------+----+------------------+------------------+---+---+---+--------------------+
|summary| a1| b1| c1| d1| e1| f1| time|
+-------+----+------------------+------------------+---+---+---+--------------------+
| count| 3| 3| 3| 3| 3| 3| 3|
| mean|null|1.6666666666666667|3.6666666666666665|4.0|5.0|6.0| null|
| stddev|null|1.5275252316519468|1.1547005383792515|0.0|0.0|0.0| null|
| min| ai| 0| 3| 4| 5| 6|2022-08-06 16:37:...|
| 25%|null| 0| 3| 4|5.0|6.0| null|
| 50%|null| 2| 3| 4|5.0|6.0| null|
| 75%|null| 3| 5| 4|5.0|6.0| null|
| max|java| 3| 5| 4| 5| 6|2022-08-06 16:37:...|
+-------+----+------------------+------------------+---+---+---+--------------------+
- distinct: removes fully duplicate rows (a short sketch follows the dropDuplicates example below)
- dropDuplicates: removes duplicates based on the specified columns
xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
'c': ['java', 2, 3, 4, '5', '6', str(datetime.now())]}
#
xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
spark = SparkSession.builder.appName('text').getOrCreate()
df = spark.createDataFrame(xin)
df.dropDuplicates(['a1']).show()
>>>
+----+---+---+---+---+---+--------------------+
| a1| b1| c1| d1| e1| f1| time|
+----+---+---+---+---+---+--------------------+
| ai| 0| 5| 4| 5| 6|2022-08-06 16:40:...|
|java| 2| 3| 4| 5| 6|2022-08-06 16:40:...|
+----+---+---+---+---+---+--------------------+
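And the distinct sketch promised above; it deduplicates on all columns, so it only helps when entire rows repeat (data made up for illustration):
df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])
df1.distinct().show()  # keeps one ("a", 1) row and one ("b", 3) row; output order may vary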
- exceptAll: returns the rows of df1 that are not in df2 while preserving duplicates; each matching row in df2 removes only one occurrence from df1 (SQL's EXCEPT ALL), which is what the output below shows
df1 = spark.createDataFrame(
[("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])
df3 = df1.exceptAll(df2)
df1.show()
df2.show()
df3.show()
>>>
df1
+---+---+
| C1| C2|
+---+---+
| a| 1|
| a| 1|
| b| 3|
| c| 4|
+---+---+
df2
+---+---+
| C1| C2|
+---+---+
| a| 1|
| b| 3|
+---+---+
df3
+---+---+
| C1| C2|
+---+---+
| a| 1|
| c| 4|
+---+---+
- subtract: removes from df1 every row that also appears in df2, keeping only rows that never occur in df2 (like SQL's EXCEPT DISTINCT)
df1 = spark.createDataFrame(
[("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])
df3 = df1.subtract(df2)
df1.show()
df2.show()
df3.show()
>>>
+---+---+
| C1| C2|
+---+---+
| a| 1|
| a| 1|
| b| 3|
| c| 4|
+---+---+
+---+---+
| C1| C2|
+---+---+
| a| 1|
| b| 3|
+---+---+
+---+---+
| C1| C2|
+---+---+
| c| 4|
+---+---+
- intersectAll: returns the intersection of the two DataFrames, preserving duplicates
df1 = spark.createDataFrame(
[("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])
df3 = df1.intersectAll(df2)
df1.show()
df2.show()
df3.show()
>>>
+---+---+
| C1| C2|
+---+---+
| a| 1|
| a| 1|
| b| 3|
| c| 4|
+---+---+
+---+---+
| C1| C2|
+---+---+
| a| 1|
| b| 3|
+---+---+
+---+---+
| C1| C2|
+---+---+
| b| 3|
| a| 1|
+---+---+
- drop: drops the specified column(s)
xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
'c': ['java', 2, 3, 4, '5', '6', str(datetime.now())]}
#
xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
spark = SparkSession.builder.appName('text').getOrCreate()
df = spark.createDataFrame(xin)
df.drop('a1').show()
>>>
+---+---+---+---+---+--------------------+
| b1| c1| d1| e1| f1| time|
+---+---+---+---+---+--------------------+
| 0| 5| 4| 5| 6|2022-08-06 16:54:...|
| 3| 3| 4| 5| 6|2022-08-06 16:54:...|
| 2| 3| 4| 5| 6|2022-08-06 16:54:...|
+---+---+---+---+---+--------------------+
- withColumn: adds a new column
xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
'c': ['java', 2, 3, 4, '5', '6', str(datetime.now())]}
#
xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
spark = SparkSession.builder.appName('text').getOrCreate()
df = spark.createDataFrame(xin)
df.withColumn('g1', df.b1 * 10).show()
>>>
+----+---+---+---+---+---+--------------------+---+
| a1| b1| c1| d1| e1| f1| time| g1|
+----+---+---+---+---+---+--------------------+---+
| ai| 0| 5| 4| 5| 6|2022-08-06 16:56:...| 0|
| ai| 3| 3| 4| 5| 6|2022-08-06 16:56:...| 30|
|java| 2| 3| 4| 5| 6|2022-08-06 16:56:...| 20|
+----+---+---+---+---+---+--------------------+---+
- withColumnRenamed: renames a column
xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
'c': ['java', 2, 3, 4, '5', '6', str(datetime.now())]}
#
xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
spark = SparkSession.builder.appName('text').getOrCreate()
df = spark.createDataFrame(xin)
df.withColumnRenamed('a1','A1').show()
>>>
+----+---+---+---+---+---+--------------------+
| A1| b1| c1| d1| e1| f1| time|
+----+---+---+---+---+---+--------------------+
| ai| 0| 5| 4| 5| 6|2022-08-06 16:57:...|
| ai| 3| 3| 4| 5| 6|2022-08-06 16:57:...|
|java| 2| 3| 4| 5| 6|2022-08-06 16:57:...|
+----+---+---+---+---+---+--------------------+
- dropna: drops rows with null values, similar to pandas' dropna; it even recognizes np.NAN, which makes the pandas interop quite smooth
xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
'c': ['java', np.NAN, 3, 4, '5', '6', str(datetime.now())]}
#
xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
spark = SparkSession.builder.appName('text').getOrCreate()
df = spark.createDataFrame(xin)
df.dropna(how='all', subset=['b1']).show()
>>>
+---+---+---+---+---+---+--------------------+
| a1| b1| c1| d1| e1| f1| time|
+---+---+---+---+---+---+--------------------+
| ai|0.0| 5| 4| 5| 6|2022-08-06 17:00:...|
| ai|3.0| 3| 4| 5| 6|2022-08-06 17:00:...|
+---+---+---+---+---+---+--------------------+
- fillna: fills null values, similar to pandas' fillna
xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
'c': ['java', np.NAN, 3, 4, '5', '6', str(datetime.now())]}
#
xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
spark = SparkSession.builder.appName('text').getOrCreate()
df = spark.createDataFrame(xin)
df.fillna({'b1':2}).show()
>>>
+----+---+---+---+---+---+--------------------+
| a1| b1| c1| d1| e1| f1| time|
+----+---+---+---+---+---+--------------------+
| ai|0.0| 5| 4| 5| 6|2022-08-06 17:03:...|
| ai|3.0| 3| 4| 5| 6|2022-08-06 17:03:...|
|java|2.0| 3| 4| 5| 6|2022-08-06 17:03:...|
+----+---+---+---+---+---+--------------------+
- join: similar to pandas' merge, so no need to elaborate
df1 = spark.createDataFrame(
[("a", 1), ("d", 1), ("b", 3), ("c", 4)], ["id", "num1"])
df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["id", "num2"])
df1.join(df2, df1.id == df2.id, 'left') \
    .select(df1.id.alias("df1_id"),
            df1.num1.alias("df1_num"),
            df2.num2.alias("df2_num")) \
    .sort(["df1_id"], ascending=False) \
    .show()
>>>
+------+-------+-------+
|df1_id|df1_num|df2_num|
+------+-------+-------+
| d| 1| null|
| c| 4| null|
| b| 3| 3|
| a| 1| 1|
+------+-------+-------+
- agg(*exprs): aggregation; multiple aggregate expressions can be passed, usually together with groupBy
- alias: sets an alias for a column or DataFrame
- groupBy: groups rows by the given columns
xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
'c': ['java', np.NAN, 3, 4, '5', '6', str(datetime.now())]}
#
xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
spark = SparkSession.builder.appName('text').getOrCreate()
df = spark.createDataFrame(xin)
df.groupBy(['a1']).agg(functions.min(df.c1).alias('three'),functions.expr('avg(d1)').alias('avg')).show()
>>>
+----+-----+---+
| a1|three|avg|
+----+-----+---+
| ai| 3|4.0|
|java| 3|4.0|
+----+-----+---+
- foreach: similar to pandas' apply in that it runs a function on every row, but less flexible in practice: apply works on rows or columns while foreach only works on rows, and foreach cannot modify the data; it runs purely for side effects
xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
'c': ['java', np.NAN, 3, 4, '5', '6', str(datetime.now())]}
#
xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
spark = SparkSession.builder.appName('text').getOrCreate()
df = spark.createDataFrame(xin)
df.foreach(lambda x: print(x.a1))
>>>
java
ai
ai
- replace: replaces values; pass a dict like {'old': 'new'}
xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
'c': ['java', np.NAN, 3, 4, '5', '6', str(datetime.now())]}
#
xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
spark = SparkSession.builder.appName('text').getOrCreate()
df = spark.createDataFrame(xin)
df.replace({'ai':'AI', 'java':'php'}).show()
>>>
+---+---+---+---+---+---+--------------------+
| a1| b1| c1| d1| e1| f1| time|
+---+---+---+---+---+---+--------------------+
| AI|0.0| 5| 4| 5| 6|2022-08-06 18:01:...|
| AI|3.0| 3| 4| 5| 6|2022-08-06 18:01:...|
|php|NaN| 3| 4| 5| 6|2022-08-06 18:01:...|
+---+---+---+---+---+---+--------------------+
- union: equivalent to SQL's UNION ALL; columns are matched by position (a contrast sketch follows the unionByName example below)
- unionByName: unions by column name, similar to pandas' concat
df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
df1.unionByName(df2).show()
# +----+----+----+
# |col0|col1|col2|
# +----+----+----+
# | 1| 2| 3|
# | 6| 4| 5|
# +----+----+----+
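For contrast, a union sketch on the same df1/df2: union matches columns purely by position, so df2's row should come back as 4, 5, 6 under df1's column names, roughly like this:
df1.union(df2).show()
# +----+----+----+
# |col0|col1|col2|
# +----+----+----+
# |   1|   2|   3|
# |   4|   5|   6|
# +----+----+----+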
Here are some operators for working on DataFrame columns. Rather than a separate example for each, the syntax listed below should be enough; a combined sketch follows the list.
Column.alias(*alias, **kwargs) # rename the column
Column.asc() # ascending sort expression for this column
Column.desc() # descending sort expression for this column
Column.astype(dataType) # type conversion (alias for cast)
Column.cast(dataType) # cast to the given type
Column.between(lowerBound, upperBound) # boolean: whether the value falls within the given range
Column.contains(other) # whether the value contains the given substring
Column.endswith(other) # value ends with the given suffix, e.g. df.filter(df.name.endswith('ice')).collect()
Column.isNotNull() # keep rows where the value is not null
Column.isNull() # keep rows where the value is null
Column.isin(*cols) # rows whose value is in the given set, e.g. df[df.name.isin("Bob", "Mike")].collect()
Column.like(other) # SQL LIKE match on the value
Column.when(condition, value) # value assigned where the condition is True
Column.otherwise(value) # used together with when, e.g. df.select(df.name, F.when(df.age > 3, 1).otherwise(0)).show()
Column.rlike(other) # regular-expression match, e.g. df.filter(df.name.rlike('ice$')).collect()
Column.startswith(other) # e.g. df.filter(df.name.startswith('Al')).collect()
Column.substr(startPos, length) # e.g. df.select(df.name.substr(1, 3).alias("col")).collect()
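The combined sketch mentioned above, exercising a few of these operators on a made-up DataFrame (when/otherwise come from pyspark.sql.functions, imported as F here):
from pyspark.sql import functions as F

people = spark.createDataFrame([("Alice", 2), ("Bob", 5), ("Mike", 8)], ["name", "age"])
people.select(
    people.name.alias("who"),                              # rename in the output
    people.age.between(3, 9).alias("school_age"),          # boolean: 3 <= age <= 9
    F.when(people.age > 3, 1).otherwise(0).alias("flag"),  # 1 where age > 3, else 0
).show()
people.filter(people.name.isin("Bob", "Mike")).show()  # rows whose name is in the set
people.filter(people.name.rlike('^Al')).show()         # regex match on name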
That covers the commonly used DataFrame operators; I will add more here as I come across them.
- Create an empty DataFrame
query_list = spark.createDataFrame(spark.sparkContext.emptyRDD(), query.schema)  # query is an existing DataFrame whose schema is reused
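If there is no existing DataFrame whose schema can be reused, an explicit StructType works as well (field names here are just for illustration):
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
    StructField("id", StringType(), True),
    StructField("num", IntegerType(), True),
])
empty_df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)
empty_df.printSchema()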
- Submit a job
/opt/spark-3.2.2-bin-hadoop3.2/bin/spark-submit \
    --master yarn --deploy-mode cluster --name ccc_halls \
    --conf spark.pyspark.python=python3 \
    --files /home/bi-2.0/bi-datahouse/conf/conf.int \
    --py-files /home/bi-2.0/bi-datahouse/common/untils_conf.py,/home/bi-2.0/bi-datahouse/common/untils_logging.py \
    --num-executors 10 --executor-memory 4g --executor-cores 4 --driver-memory 2g \
    --conf spark.sql.execution.arrow.enabled=true \
    --conf spark.kryoserializer.buffer.max=2008 \
    /home/bi-2.0/bi-datahouse/ods/ods_arch2018_ccc_halls_2ha.py