使用箱线图(IQR)方法处理异常值
转载自:https://blog.csdn.net/zhuiqiuuuu/article/details/82721935
import pandas as pd
import numpy as np
from collections import Counter
def detect_outliers(df, n, features):
    """Return index labels of rows that are IQR outliers in more than ``n`` columns.

    Each column in ``features`` is checked with the box-plot (Tukey) rule: a
    value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where ``Q1``/``Q3`` are the 25th/75th percentiles of
    that column and ``IQR = Q3 - Q1``. Missing values (NaN) are ignored when
    computing the quartiles and are never flagged themselves.

    Args:
        df: pandas DataFrame containing the numeric columns to inspect.
        n: Threshold on the number of outlying columns; a row is returned
            only when it is an outlier in strictly more than ``n`` of the
            checked columns (``n=0`` means one outlying column is enough).
        features: Iterable of column names of ``df`` to check.

    Returns:
        A list of index labels of ``df``, ordered by first detection.
    """
    print("开始处理异常值")
    outlier_hits = Counter()
    for col in features:
        values = df[col]
        # nanpercentile instead of percentile: a single NaN would otherwise
        # make both quartiles NaN, every comparison below False, and the
        # column would silently report no outliers at all.
        q1, q3 = np.nanpercentile(values, [25, 75])
        outlier_step = 1.5 * (q3 - q1)
        is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)
        # Count one hit per row label for every column in which it is outlying.
        outlier_hits.update(values[is_outlier].index)
    return [label for label, hits in outlier_hits.items() if hits > n]
# Collect the labels of rows whose 'target_Kwh' value is an IQR outlier
# (n=0: being an outlier in a single checked column is enough) ...
Outliers_to_drop = detect_outliers(df=data, n=0, features=['target_Kwh'])
# ... remove those rows, then renumber the index from 0 without keeping the
# old labels as a column.
data = data.drop(index=Outliers_to_drop)
data = data.reset_index(drop=True)
代码中的 n 是一个阈值:对每一行统计它在所检查的各列(features)中被判定为异常值的列数,只有当该列数大于 n 时,这一行才会被选中。例如 n=0 表示只要有一列出现异常值,该行就会被选中并删除。