Bootstrap

PySpark learning notes

# Imports (pandas, numpy, datetime and pyspark.sql.functions are used by the later examples)
import os
import sys
from datetime import datetime

import findspark
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession, functions

findspark.init()  # supposedly locates the machine's Spark installation automatically, but it did not work well in my tests
# Add the Spark environment variable
os.environ['SPARK_HOME'] = "/Library/Hadoop/spark-3.0.0-bin-hadoop3.2"
# Append pyspark's py4j to the Python path
sys.path.append("/Library/Hadoop/spark-3.0.0-bin-hadoop3.2/python/lib/py4j-0.10.9-src.zip")

spark = SparkSession.builder.appName('text').getOrCreate()  # the SparkSession entry point mentioned earlier
sp = [
    {'a': 1, 'b': 'b', 'c': 1.1}, {'a': 1, 'b': 'b', 'c': 1.1}
]
df = spark.createDataFrame(sp)  # create a Spark DataFrame
# df = pd.DataFrame(sp)  # create a pandas DataFrame instead
# print(df)

# Create a pandas DataFrame
xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
       'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
       'c': ['java', 2, 3, 4, '5', '6', str(datetime.now())]}

xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])


spark.createDataFrame(xin).show()  # convert the pandas DataFrame into a Spark DataFrame
df.createOrReplaceTempView('table1')  # register the DataFrame as a temporary view
spark.sql('select * from table1').show()  # run the SQL and print the result
  • filter — filter rows by a condition, similar to SQL's WHERE
    xin = {'a': ['ai', 0, 5, 2, '5', '6', datetime.now()],
           'b': ['ai', 3, 3, 4, '5', '6', datetime.now()],
           'c': ['ai', np.NAN, 4, 4, '5', '6', datetime.now()]}
    #
    xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(xin)
    df.filter(df.d1>2).show()
>>>
+---+---+---+---+---+---+--------------------+
| a1| b1| c1| d1| e1| f1|                time|
+---+---+---+---+---+---+--------------------+
| ai|3.0|  3|  4|  5|  6|2022-08-05 19:00:...|
| ai|NaN|  4|  4|  5|  6|2022-08-05 19:00:...|
+---+---+---+---+---+---+--------------------+

  • collect — returns the rows as a list; strongly discouraged, since it pulls the whole dataset onto the driver
    xin = {'a': ['ai', 0, 5, 2, '5', '6', datetime.now()],
           'b': ['ai', 3, 3, 4, '5', '6', datetime.now()],
           'c': ['ai', np.NAN, 4, 4, '5', '6', datetime.now()]}
    #
    xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(xin)
    for i in df.collect():
        print(i)
        print(list(i))
>>>
Row(a1='ai', b1=0.0, c1=5, d1=2, e1='5', f1='6', time=datetime.datetime(2022, 8, 5, 19, 2, 44, 680251))
['ai', 0.0, 5, 2, '5', '6', datetime.datetime(2022, 8, 5, 19, 2, 44, 680251)]
Row(a1='ai', b1=3.0, c1=3, d1=4, e1='5', f1='6', time=datetime.datetime(2022, 8, 5, 19, 2, 44, 680251))
['ai', 3.0, 3, 4, '5', '6', datetime.datetime(2022, 8, 5, 19, 2, 44, 680251)]
Row(a1='ai', b1=nan, c1=4, d1=4, e1='5', f1='6', time=datetime.datetime(2022, 8, 5, 19, 2, 44, 680251))
['ai', nan, 4, 4, '5', '6', datetime.datetime(2022, 8, 5, 19, 2, 44, 680251)]
  • toLocalIterator — returns an iterator over the same rows that collect gives, but holds at most one RDD partition's worth of data at a time; recommended over collect (a small sketch follows)
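A minimal sketch of toLocalIterator (my addition, reusing the df from the collect example above; output not rerun):

    for row in df.toLocalIterator():
        print(row)        # yields the same Row objects as collect(), one partition at a time
        print(list(row))  # a Row can be turned into a plain list, just as with collect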

  • count — returns the number of rows; nothing more to say

    xin = {'a': ['ai', 0, 5, 2, '5', '6', datetime.now()],
           'b': ['ai', 3, 3, 4, '5', '6', datetime.now()],
           'c': ['ai', np.NAN, 4, 4, '5', '6', datetime.now()]}
    #
    xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(xin)
    print(df.count())
>>>
3
  • columns — returns the column names as a list
    xin = {'a': ['ai', 0, 5, 2, '5', '6', datetime.now()],
           'b': ['ai', 3, 3, 4, '5', '6', datetime.now()],
           'c': ['ai', np.NAN, 4, 4, '5', '6', datetime.now()]}
    #
    xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(xin)
    print(df.columns)
>>>
['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time']
  • dtypes — returns the column names and their types as a list of tuples
    xin = {'a': ['ai', 0, 5, 2, '5', '6', datetime.now()],
           'b': ['ai', 3, 3, 4, '5', '6', datetime.now()],
           'c': ['ai', np.NAN, 4, 4, '5', '6', datetime.now()]}
    #
    xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(xin)
    print(df.dtypes)
>>>
[('a1', 'string'), ('b1', 'double'), ('c1', 'bigint'), ('d1', 'bigint'), ('e1', 'string'), ('f1', 'string'), ('time', 'timestamp')]
  • describe — returns basic statistics for the columns
    xin = {'a': ['ai', 0, 5, 2, '5', '6', datetime.now()],
           'b': ['ai', 3, 3, 4, '5', '6', datetime.now()],
           'c': ['ai', np.NAN, 4, 4, '5', '6', datetime.now()]}
    #
    xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(xin)
    df.describe().show()
>>>
+-------+----+---+---+------------------+---+---+
|summary|  a1| b1| c1|                d1| e1| f1|
+-------+----+---+---+------------------+---+---+
|  count|   3|  3|  3|                 3|  3|  3|
|   mean|null|NaN|4.0|3.3333333333333335|5.0|6.0|
| stddev|null|NaN|1.0|1.1547005383792515|0.0|0.0|
|    min|  ai|0.0|  3|                 2|  5|  6|
|    max|  ai|NaN|  5|                 4|  5|  6|
+-------+----+---+---+------------------+---+---+
    df = spark.createDataFrame(xin)
    df.describe(['a1']).show()
>>>
+-------+----+
|summary|  a1|
+-------+----+
|  count|   3|
|   mean|null|
| stddev|null|
|    min|  ai|
|    max|  ai|
+-------+----+
  • select — select the specified columns
  • sort — return the rows in a given order (the example below only exercises select; a sort sketch follows its output)
    xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
           'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
           'c': ['java', 2, 3, 4, '5', '6', str(datetime.now())]}
    #
    xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
    spark = SparkSession.builder.appName('text').getOrCreate()
    df = spark.createDataFrame(xin)
    df.select('b1').show()
>>>
+---+
| b1|
+---+
|  0|
|  3|
|  2|
+---+
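The example above only exercises select; a minimal sort sketch on the same df (my addition, output not captured):

    df.sort(df.b1.desc()).show()                   # sort by b1 descending
    df.sort(['b1', 'c1'], ascending=True).show()   # or by several columns at once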
  • first() — returns the first row as a Row object (not an RDD)
  • head(n) — returns the first n rows
  • freqItems — returns frequent items for the specified columns (a combined sketch for these three follows)
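A combined sketch for the three operators above (my addition, reusing the df from the select example; outputs omitted):

    print(df.first())                          # a single Row object
    print(df.head(2))                          # a list containing the first 2 Row objects
    df.freqItems(['a1'], support=0.5).show()   # frequent items of column a1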
  • summary — like describe, but additionally shows the fixed percentiles 25%, 50% and 75%
    xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
           'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
           'c': ['java', 2, 3, 4, '5', '6', str(datetime.now())]}
    #
    xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
    spark = SparkSession.builder.appName('text').getOrCreate()
    df = spark.createDataFrame(xin)
    df.summary().show()
>>>
+-------+----+------------------+------------------+---+---+---+--------------------+
|summary|  a1|                b1|                c1| d1| e1| f1|                time|
+-------+----+------------------+------------------+---+---+---+--------------------+
|  count|   3|                 3|                 3|  3|  3|  3|                   3|
|   mean|null|1.6666666666666667|3.6666666666666665|4.0|5.0|6.0|                null|
| stddev|null|1.5275252316519468|1.1547005383792515|0.0|0.0|0.0|                null|
|    min|  ai|                 0|                 3|  4|  5|  6|2022-08-06 16:37:...|
|    25%|null|                 0|                 3|  4|5.0|6.0|                null|
|    50%|null|                 2|                 3|  4|5.0|6.0|                null|
|    75%|null|                 3|                 5|  4|5.0|6.0|                null|
|    max|java|                 3|                 5|  4|  5|  6|2022-08-06 16:37:...|
+-------+----+------------------+------------------+---+---+---+--------------------+
  • distinct — removes fully duplicated rows (a small sketch follows)
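A minimal distinct sketch (my addition; rows that are identical in every column collapse into one):

    df_dup = spark.createDataFrame([('a', 1), ('a', 1), ('b', 2)], ['c1', 'c2'])
    df_dup.distinct().show()   # one ('a', 1) row plus the ('b', 2) row remain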
  • dropDuplicates — removes duplicates based on the specified columns
    xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
           'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
           'c': ['java', 2, 3, 4, '5', '6', str(datetime.now())]}
    #
    xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
    spark = SparkSession.builder.appName('text').getOrCreate()
    df = spark.createDataFrame(xin)
    df.dropDuplicates(['a1']).show()
>>>
+----+---+---+---+---+---+--------------------+
|  a1| b1| c1| d1| e1| f1|                time|
+----+---+---+---+---+---+--------------------+
|  ai|  0|  5|  4|  5|  6|2022-08-06 16:40:...|
|java|  2|  3|  4|  5|  6|2022-08-06 16:40:...|
+----+---+---+---+---+---+--------------------+
  • exceptAll — returns the rows of df1 that are not in df2, preserving duplicates (a multiset difference, like SQL's EXCEPT ALL); note in the output below how one of the duplicate ("a", 1) rows survives because df1 has it twice and df2 only once
    df1 = spark.createDataFrame(
        [("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
    df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])
    df3 = df1.exceptAll(df2)
    df1.show()
    df2.show()
    df3.show()
>>>
df1
+---+---+
| C1| C2|
+---+---+
|  a|  1|
|  a|  1|
|  b|  3|
|  c|  4|
+---+---+
df2
+---+---+
| C1| C2|
+---+---+
|  a|  1|
|  b|  3|
+---+---+
df3
+---+---+
| C1| C2|
+---+---+
|  a|  1|
|  c|  4|
+---+---+
  • subtract — removes from df1 every row that also appears in df2 (a set difference, like SQL's EXCEPT DISTINCT); only rows that never appear in df2 remain
    df1 = spark.createDataFrame(
        [("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
    df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])
    df3 = df1.subtract(df2)
    df1.show()
    df2.show()
    df3.show()
>>>
+---+---+
| C1| C2|
+---+---+
|  a|  1|
|  a|  1|
|  b|  3|
|  c|  4|
+---+---+

+---+---+
| C1| C2|
+---+---+
|  a|  1|
|  b|  3|
+---+---+

+---+---+
| C1| C2|
+---+---+
|  c|  4|
+---+---+
  • intersectAll — returns the intersection of the two DataFrames, preserving duplicates
    df1 = spark.createDataFrame(
        [("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
    df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])
    df3 = df1.intersectAll(df2)
    df1.show()
    df2.show()
    df3.show()
>>>
+---+---+
| C1| C2|
+---+---+
|  a|  1|
|  a|  1|
|  b|  3|
|  c|  4|
+---+---+

+---+---+
| C1| C2|
+---+---+
|  a|  1|
|  b|  3|
+---+---+

+---+---+
| C1| C2|
+---+---+
|  b|  3|
|  a|  1|
+---+---+
  • drop — drop the specified columns
    xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
           'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
           'c': ['java', 2, 3, 4, '5', '6', str(datetime.now())]}
    #
    xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
    spark = SparkSession.builder.appName('text').getOrCreate()
    df = spark.createDataFrame(xin)
    df.drop('a1').show()
>>>
+---+---+---+---+---+--------------------+
| b1| c1| d1| e1| f1|                time|
+---+---+---+---+---+--------------------+
|  0|  5|  4|  5|  6|2022-08-06 16:54:...|
|  3|  3|  4|  5|  6|2022-08-06 16:54:...|
|  2|  3|  4|  5|  6|2022-08-06 16:54:...|
+---+---+---+---+---+--------------------+
  • withColumn — add a new column (or replace an existing one with the same name)
    xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
           'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
           'c': ['java', 2, 3, 4, '5', '6', str(datetime.now())]}
    #
    xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
    spark = SparkSession.builder.appName('text').getOrCreate()
    df = spark.createDataFrame(xin)
    df.withColumn('g1', df.b1 * 10).show()
>>>
+----+---+---+---+---+---+--------------------+---+
|  a1| b1| c1| d1| e1| f1|                time| g1|
+----+---+---+---+---+---+--------------------+---+
|  ai|  0|  5|  4|  5|  6|2022-08-06 16:56:...|  0|
|  ai|  3|  3|  4|  5|  6|2022-08-06 16:56:...| 30|
|java|  2|  3|  4|  5|  6|2022-08-06 16:56:...| 20|
+----+---+---+---+---+---+--------------------+---+
  • withColumnRenamed — rename a column
    xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
           'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
           'c': ['java', 2, 3, 4, '5', '6', str(datetime.now())]}
    #
    xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
    spark = SparkSession.builder.appName('text').getOrCreate()
    df = spark.createDataFrame(xin)
    df.withColumnRenamed('a1','A1').show()
>>>
+----+---+---+---+---+---+--------------------+
|  A1| b1| c1| d1| e1| f1|                time|
+----+---+---+---+---+---+--------------------+
|  ai|  0|  5|  4|  5|  6|2022-08-06 16:57:...|
|  ai|  3|  3|  4|  5|  6|2022-08-06 16:57:...|
|java|  2|  3|  4|  5|  6|2022-08-06 16:57:...|
+----+---+---+---+---+---+--------------------+
  • dropna — drop rows containing null values, similar to pandas' dropna; it even recognizes np.NaN, which makes for pleasant interoperability
    xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
           'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
           'c': ['java', np.NAN, 3, 4, '5', '6', str(datetime.now())]}
    #
    xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
    spark = SparkSession.builder.appName('text').getOrCreate()
    df = spark.createDataFrame(xin)
    df.dropna(how='all', subset=['b1']).show()
>>>
+---+---+---+---+---+---+--------------------+
| a1| b1| c1| d1| e1| f1|                time|
+---+---+---+---+---+---+--------------------+
| ai|0.0|  5|  4|  5|  6|2022-08-06 17:00:...|
| ai|3.0|  3|  4|  5|  6|2022-08-06 17:00:...|
+---+---+---+---+---+---+--------------------+
  • fillna — fill null values, similar to pandas' fillna
    xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
           'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
           'c': ['java', np.NAN, 3, 4, '5', '6', str(datetime.now())]}
    #
    xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
    spark = SparkSession.builder.appName('text').getOrCreate()
    df = spark.createDataFrame(xin)
    df.fillna({'b1':2}).show()
>>>
+----+---+---+---+---+---+--------------------+
|  a1| b1| c1| d1| e1| f1|                time|
+----+---+---+---+---+---+--------------------+
|  ai|0.0|  5|  4|  5|  6|2022-08-06 17:03:...|
|  ai|3.0|  3|  4|  5|  6|2022-08-06 17:03:...|
|java|2.0|  3|  4|  5|  6|2022-08-06 17:03:...|
+----+---+---+---+---+---+--------------------+
  • join — similar to pandas' merge, so no need to elaborate
    df1 = spark.createDataFrame(
        [("a", 1), ("d", 1), ("b", 3), ("c", 4)], ["id", "num1"])
    df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["id", "num2"])
    df1.join(df2, df1.id == df2.id, 'left').select(df1.id.alias("df1_id"),
                                                   df1.num1.alias("df1_num"),
                                                   df2.num2.alias("df2_num")
                                                   ).sort(["df1_id"], ascending=False) \
        .show()
>>>
+------+-------+-------+
|df1_id|df1_num|df2_num|
+------+-------+-------+
|     d|      1|   null|
|     c|      4|   null|
|     b|      3|      3|
|     a|      1|      1|
+------+-------+-------+
  • agg(*exprs) — aggregation; several aggregate expressions can be passed at once, usually together with groupBy
  • alias — set an alias for a column or DataFrame
  • groupBy — group rows by the given columns
    xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
           'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
           'c': ['java', np.NAN, 3, 4, '5', '6', str(datetime.now())]}
    #
    xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
    spark = SparkSession.builder.appName('text').getOrCreate()
    df = spark.createDataFrame(xin)
    df.groupBy(['a1']).agg(functions.min(df.c1).alias('three'),functions.expr('avg(d1)').alias('avg')).show()
>>>
+----+-----+---+
|  a1|three|avg|
+----+-----+---+
|  ai|    3|4.0|
|java|    3|4.0|
+----+-----+---+
  • foreach — similar to pandas' apply in that it runs a function on every row, but in practice less powerful: apply can work on rows or columns while foreach only works on rows, and foreach cannot modify the data (it runs purely for side effects); a sketch of the usual alternative, a UDF via withColumn, follows the output below
    xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
           'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
           'c': ['java', np.NAN, 3, 4, '5', '6', str(datetime.now())]}
    #
    xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
    spark = SparkSession.builder.appName('text').getOrCreate()
    df = spark.createDataFrame(xin)
    df.foreach(lambda x: print(x.a1))
>>>
java
ai
ai
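To actually transform values row by row, the usual route is a UDF combined with withColumn rather than foreach; a hedged sketch on the same df (my addition, output not captured):

    from pyspark.sql import functions as F

    upper_udf = F.udf(lambda s: s.upper())              # returns StringType by default
    df.withColumn('a1_upper', upper_udf(df.a1)).show()  # adds an upper-cased copy of a1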
  • replace — replace values; pass a dict of {'old': 'new'}
    xin = {'a': ['ai', 0, 5, 4, '5', '6', str(datetime.now())],
           'b': ['ai', 3, 3, 4, '5', '6', str(datetime.now())],
           'c': ['java', np.NAN, 3, 4, '5', '6', str(datetime.now())]}
    #
    xin = pd.DataFrame.from_dict(xin, orient='index', columns=['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'time'])
    spark = SparkSession.builder.appName('text').getOrCreate()
    df = spark.createDataFrame(xin)
    df.replace({'ai':'AI', 'java':'php'}).show()
>>>
+---+---+---+---+---+---+--------------------+
| a1| b1| c1| d1| e1| f1|                time|
+---+---+---+---+---+---+--------------------+
| AI|0.0|  5|  4|  5|  6|2022-08-06 18:01:...|
| AI|3.0|  3|  4|  5|  6|2022-08-06 18:01:...|
|php|NaN|  3|  4|  5|  6|2022-08-06 18:01:...|
+---+---+---+---+---+---+--------------------+
  • union — equivalent to SQL's UNION ALL; it matches columns by position, not by name (a contrast sketch follows the unionByName output below)
  • unionByName — union by column name, similar to pandas' concat
df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
df1.unionByName(df2).show()
# +----+----+----+
# |col0|col1|col2|
# +----+----+----+
# |   1|   2|   3|
# |   6|   4|   5|
# +----+----+----+
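For contrast, union on the same df1 and df2 matches columns purely by position (my addition, output not rerun): the second row would come out as 4, 5, 6 under col0, col1, col2, with no reordering by name.

df1.union(df2).show()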

Some Column-level operators for DataFrames follow. I won't give an example for each; the syntax is easy to pick up (a combined sketch comes after the list).

Column.alias(*alias, **kwargs)  # rename the column
Column.asc()  # ascending sort expression for the column (use inside sort/orderBy)
Column.desc()  # descending sort expression for the column
Column.astype(dataType)  # type conversion (alias of cast)
Column.cast(dataType)  # cast the column to the given type
Column.between(lowerBound, upperBound)  # boolean: whether the value falls inside the given range
Column.contains(other)  # whether the value contains the given substring
Column.endswith(other)  # whether the value ends with the given string, e.g. df.filter(df.name.endswith('ice')).collect()
Column.isNotNull()  # keep rows where the value is not null
Column.isNull()  # keep rows where the value is null
Column.isin(*cols)  # rows whose value is in the given set, e.g. df[df.name.isin("Bob", "Mike")].collect()
Column.like(other)  # SQL LIKE match on the value
Column.when(condition, value)  # assign value where the condition is True (usually started via F.when)
Column.otherwise(value)  # used together with when, e.g. df.select(df.name, F.when(df.age > 3, 1).otherwise(0)).show()
Column.rlike(other)  # regular-expression match, e.g. df.filter(df.name.rlike('ice$')).collect()
Column.startswith(other)  # e.g. df.filter(df.name.startswith('Al')).collect()
Column.substr(startPos, length)  # e.g. df.select(df.name.substr(1, 3).alias("col")).collect()
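A combined sketch of a few of these (my addition; the a1/b1/c1 column names assume the df used in the earlier examples):

from pyspark.sql import functions as F

df.filter(df.a1.isin('ai', 'java') & df.b1.between(0, 3)) \
  .select(df.a1,
          F.when(df.c1 > 3, 1).otherwise(0).alias('c1_gt_3'),   # flag rows where c1 > 3
          df.a1.substr(1, 2).alias('a1_prefix')) \
  .sort(df.c1.desc()) \
  .show()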

That covers the common DataFrame operators. I'll keep adding more as I run into them.

  • Create an empty DataFrame
query_list = spark.createDataFrame(spark.sparkContext.emptyRDD(), query.schema)  # reuses the schema of an existing DataFrame (query)
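If no existing DataFrame is at hand to borrow a schema from, the schema can be spelled out explicitly; a sketch with made-up field names:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([StructField('id', StringType(), True),
                     StructField('num', IntegerType(), True)])
empty_df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)
empty_df.printSchema()   # the columns exist, but the DataFrame holds zero rows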
  • Submit a job
/opt/spark-3.2.2-bin-hadoop3.2/bin/spark-submit \
    --master yarn --deploy-mode cluster --name ccc_halls \
    --conf spark.pyspark.python=python3 \
    --files /home/bi-2.0/bi-datahouse/conf/conf.int \
    --py-files /home/bi-2.0/bi-datahouse/common/untils_conf.py,/home/bi-2.0/bi-datahouse/common/untils_logging.py \
    --num-executors 10 --executor-memory 4g --executor-cores 4 --driver-memory 2g \
    --conf spark.sql.execution.arrow.enabled=true \
    --conf spark.kryoserializer.buffer.max=2008 \
    /home/bi-2.0/bi-datahouse/ods/ods_arch2018_ccc_halls_2ha.py
