import pyspark.sql.functions as fn  # needed for the column aggregates below
# Assumes an active SparkSession bound to `spark`, as in the PySpark shell.

df_miss = spark.createDataFrame([
    (1, 143.5, 5.6, 28, 'M', 100000),
    (2, 167.2, 5.4, 45, 'M', None),
    (3, None, 5.2, None, None, None),
    (4, 144.5, 5.9, 33, 'M', None),
    (5, 133.2, 5.7, 54, 'F', None),
    (6, 124.1, 5.2, None, 'F', None),
    (7, 129.2, 5.3, 42, 'M', 76000),
], ['id', 'weight', 'height', 'age', 'gender', 'income'])
To find the number of missing observations per row, we can use the following snippet.
df_miss.rdd.map(
    lambda row: (row['id'], sum([c is None for c in row]))
).collect()

[(1, 0), (2, 1), (3, 4), (4, 1), (5, 1), (6, 2), (7, 0)]
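The same per-row tally can also be computed without leaving the DataFrame API, which avoids the cost of serializing every row out to the RDD. Below is a minimal sketch of that alternative; the output column name missing is my choice, not anything defined above:

df_miss.select(
    'id',
    # isNull() gives a boolean per column; cast each to int and let
    # Python's built-in sum fold the column expressions together with +.
    sum([fn.col(c).isNull().cast('int') for c in df_miss.columns]).alias('missing')
).show()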
Row 3 is missing four of its six values, so let's take a closer look at it. Seeing which values are missing will help us decide, once we count missing observations per column, whether to drop the observation altogether or impute some of the values.
df_miss.where('id == 3').show()

+---+------+------+----+------+------+
| id|weight|height| age|gender|income|
+---+------+------+----+------+------+
|  3|  null|   5.2|null|  null|  null|
+---+------+------+----+------+------+
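With four of its six values gone, row 3 carries little information and is a natural candidate for removal. One hedged option is DataFrame.dropna() with its thresh parameter, which keeps only rows holding at least that many non-null values; the threshold of 3 below is my choice for this data, not a general rule:

# Keep rows with at least three non-null values; here that drops only
# row 3 (which has just id and height) and leaves the rest intact.
df_miss.dropna(thresh=3).show()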
What percentage of observations is missing in each column?
# Fraction of missing values per column: fn.count(c) counts only non-null
# entries, while fn.count('*') counts every row.
df_miss.agg(
    *[(1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing')
      for c in df_miss.columns]
).show()

+----------+------------------+--------------+------------------+------------------+------------------+
|id_missing|    weight_missing|height_missing|       age_missing|    gender_missing|    income_missing|
+----------+------------------+--------------+------------------+------------------+------------------+
|       0.0|0.1428571428571429|           0.0|0.2857142857142857|0.1428571428571429|0.7142857142857143|
+----------+------------------+--------------+------------------+------------------+------------------+
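With roughly 71% of income missing, imputing that column would be mostly guesswork, so one reasonable follow-up, sketched below, is to drop it and fill the remaining gaps: per-column means for the numeric columns and a placeholder label for gender (a mean is undefined for a categorical column). Mean imputation and the 'missing' label are assumptions here, not the only defensible choices:

# Drop the mostly-empty income column.
df_miss_no_income = df_miss.select([
    c for c in df_miss.columns if c != 'income'
])

# Collect the mean of every numeric column into a dict of column -> value.
means = df_miss_no_income.agg(
    *[fn.mean(c).alias(c)
      for c in df_miss_no_income.columns if c != 'gender']
).toPandas().to_dict('records')[0]

# Use a placeholder category for the nulls in the string column.
means['gender'] = 'missing'

df_miss_no_income.fillna(means).show()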