Summary of sklearn syntax and modules
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.ensemble import RandomForestRegressor
[Ensemble learning] XGBClassifier from the xgboost package (scikit-learn compatible API); a usage sketch follows this parameter list.
booster: 'gbtree' uses tree models as the base learners (default); 'gblinear' uses linear models as the base learners.
silent: silent=1 suppresses the intermediate output; silent=0 prints it (replaced by verbosity in newer versions).
nthread: nthread=-1 uses all CPU cores (default); nthread=1 uses a single core.
scale_pos_weight: weight of the positive class. In binary classification with imbalanced classes, setting it improves the model; e.g. for a positive:negative ratio of 1:10, set scale_pos_weight=10.
n_estimators: total number of boosting rounds, i.e. the number of trees.
early_stopping_rounds: stop training early when the validation score has not improved for n consecutive rounds. Tuning: guards against overfitting.
max_depth: tree depth, default 6, typical values 3-10. Tuning: larger values overfit more easily; smaller values underfit more easily.
min_child_weight: default 1. Tuning: larger values underfit more easily; smaller values overfit more easily (a larger value keeps the model from learning locally peculiar samples).
subsample: fraction of the training data used for each tree. Default 1, typical values 0.5-1. Tuning: guards against overfitting.
colsample_bytree: fraction of the features used for each tree. Default 1, typical values 0.5-1. Tuning: guards against overfitting.
gamma: penalty term, the minimum loss reduction required for a node to split.
alpha: L1 regularization coefficient, default 0.
lambda: L2 regularization coefficient, default 1.
learning_rate: step size applied to the weight updates at each boosting round, default 0.3. Tuning: smaller values train more slowly; typical values 0.01-0.2, around 0.1 is usually a good choice.
objective: objective function
Regression: reg:linear (default; renamed reg:squarederror in newer versions), reg:logistic
Binary classification: binary:logistic returns probabilities; binary:logitraw returns the raw (pre-sigmoid) score
Multi-class: multi:softmax with num_class=n returns the predicted class; multi:softprob with num_class=n returns class probabilities
Ranking: rank:pairwise
eval_metric: evaluation metric
Regression (default rmse): rmse = root mean squared error, mae = mean absolute error
Classification (default error): auc = area under the ROC curve, error = error rate (binary), merror = error rate (multi-class)
logloss = negative log-likelihood (binary), mlogloss = negative log-likelihood (multi-class)
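A minimal usage sketch of these parameters; the breast_cancer toy dataset and the specific parameter values are illustrative assumptions, and on newer xgboost versions (>= 2.0) eval_metric and early_stopping_rounds move into the constructor instead of fit():
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
X, y = datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
model = XGBClassifier(booster='gbtree', n_estimators=200, max_depth=5, min_child_weight=1,
                      learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, gamma=0,
                      scale_pos_weight=1,        # raise this when positives are rare, e.g. 10 for a 1:10 ratio
                      objective='binary:logistic', n_jobs=-1)
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='auc',
          early_stopping_rounds=20, verbose=False)   # stop if the validation AUC stalls for 20 rounds
print('accuracy: %.4f' % accuracy_score(y_test, model.predict(X_test)))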
data = pd.DataFrame({'label': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
'pred': [0.5, 0.6, 0.7, 0.6, 0.6, 0.8, 0.4, 0.2, 0.1, 0.4, 0.3, 0.9]})
import pandas as pd
from psi import psi
on = ['name', 'idcard_md5', 'phone_md5', 'loan_dt']
on = ['name', 'idcard', 'phone', 'loan_dt']
Decimals: print('percent: %.2f' % (42/50))    print('percent: {:.2f}'.format(42/50))
Percentages: print('percent: {:.2%}'.format(42/50))    print('percent: {:.2f}%'.format(42/50*100))
dfpsi = pd.DataFrame([np.nan], columns=['psi'])
df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
df = pd.DataFrame(data=['Apple','Banana','Cherry','Dates', 'Eggfruit'], index = [1,2,3,4,5],columns=['Fruits'])
pd.DataFrame(fruits_list)
pd.DataFrame({ 'Fruits':['Apple','Banana','Cherry','Dates','Eggfruit'],'Quantity': [5, 10, 8, 3, 7], 'Color': ['Red', 'Yellow', 'Red', 'Brown', 'Yellow']})
import pandas as pd
df = pd.read_csv('zhongxin_1206.txt', sep='\t')
df = pd.read_csv('zhongxin_1206.txt', sep=',')
fruits = pd.read_excel('fruits.xlsx', sheet_name='Sheet1')
df = pd.read_csv(path, sep=',', names=columns_name)
ranking_name = ['name', 'idcard_md5', 'phone_md5', 'loan_dt', 'score', 'resource_id', 'customer', 'feature_new']
df = pd.read_csv('rong360v1_20210815.txt', header=None, names=ranking_name, sep='\t')
df = pd.read_csv('rong360_fq1_10w.txt', sep='\t',usecols=['name', 'idcard_md5', 'phone_md5', 'loan_dt','n21_score'])
df = pd.read_csv("1217_1out.csv",usecols=[i for i in range(1,10000)])
data = pd.read_csv('data.csv',nrows =5)
df = df[['name', 'idcard_md5', 'phone_md5', 'loan_dt']]
pd.DataFrame(miss_data1, index=list_score1, columns=['shape']).round(4)
df.to_csv('zhongxin_1208.txt', index=False, sep='\t',encoding='utf-8')
df.rename(columns={'order': 'id'}, inplace=True)
df = df.rename(index={"TianJin": "tj", "ShangHai": "sh"}, columns={"a": "A"})
df = df.rename(columns={"n21_score": "n21_score_tmp", "n21_score_new_empty": "n21_score"})
df = df.rename(columns=lambda c: c + "_test")
df = df.rename(columns={"name_sha256": "name", "identity_id_sha256": "idcard","phone_sha256":"phone","apply_dt": "loan_dt"})
df.drop(['id', 'phone'], axis=1, inplace=True)
df = df.drop(columns='A')
df.drop(df.index[0], inplace=True)
df.drop(df.columns[0], axis=1, inplace=True)
df.drop(df.columns[0:3], axis=1, inplace=True)
df.drop(['label_x'], axis=1, inplace=True)
df = df_new.drop(df[df.label == -1].index)
df.drop(df.columns[[0, 4, 8]], axis=1, inplace=True)
del df['密度']
df.drop_duplicates(subset=['name','idcard','phone','loan_dt'], inplace=True)
df.drop_duplicates(subset=['name','idcard_md5','phone_md5','loan_dt'], inplace=True)
df.drop_duplicates(subset=['A','B','C'],keep=False,inplace=True)
print(df.dropna(axis='columns', thresh=5))
df.sort_values(by='sort_column', axis=0, ascending=True, inplace=False, na_position='last')
df.sort_values(by='loan_dt',inplace=True,ascending=False)
df = df.sort_values(by='y_pred')
df = df.reset_index(drop=True)
result = result.reset_index(drop=True)
df.dropna(axis=0, how='any', inplace=True)
df = pd.merge(df1,df2, how='inner', on=['name', 'idcard', 'phone', 'loan_dt'])
df=df1.merge(df2[['query_name','query_iden_num','query_mbl_num','etl_dt', 'credit_score']],left_on=['name-正常','iden_num-正常','phone_num-正常','etl_dt-正常'],right_on=['query_name','query_iden_num','query_mbl_num','etl_dt'],how='left')
df_inner[['score2','score']] = df_inner[['score2', 'score']].apply(pd.to_numeric, errors='ignore')
test_x = np.array(df_test_x)
df = pd.DataFrame(test_x)
train, test = res.iloc[:90131], res.iloc[90131:]
test = N_data.iloc[45000:]['n30_hj_score'].to_list()
df.loc[:, 'a']= df.loc[:, 'b']
df['sum'] = df.loc[df['x'] > 0,['x','y']].sum(axis=1)
df.loc[['a', 'f']]
df = df.iloc[:-1, :-2]
print('month:', df['loan_dt'].str[:7].unique())
df['label'].value_counts()
print(df_raw['v30_dz_huisuV'].isnull().sum(axis=0))
print('Vscore缺失率','{:.2%}'.format(df_raw['v30_dz_huisuV'].isnull().sum(axis=0) / df_raw.shape[0]))
df_train.drop(df_train[(df_train['OverallQual']<5) & (df_train['SalePrice']>200000)].index,inplace=True)
df_data = df_data[df_data['loanamount']<1000000]
df['sl']=df['sl'].mask(df['flag']==1,df['sl_new'])
df['flag']=1
df = df[df['flag'].notnull()]
df.select_dtypes(include=['float64'])
df_all = df[['name', 'idcard', 'phone','idcard_rsa','phone_rsa','loan_dt']].copy(deep=True)
df_all = df[['name', 'idcard', 'phone','idcard_rsa','phone_rsa','loan_dt']].copy()
df=pd.concat([df1, df2])
df=pd.concat([df1, df2], axis=1)
df_igno_idx = pd.concat([df_aa,df_zz], ignore_index=True)
V_test.insert(5, 'n30_hj_score_new', res, allow_duplicates=True)
V_N_data1 = df_raw.sample(n=df_raw.shape[0]-5000, random_state=1)
df['feature_new'].replace('\\N', np.nan,inplace=True)
df['feature_new'].replace("%","").astype("float"))
items_df["Cost"] = pd.to_numeric(items_df["Cost"].str.replace('$', ''))
result['prediction'].replace([0,1,2,3,4,5,6,7,8],[870,870,880,898,1300,13117,13298,13690,13691],inplace=True)
data['v'].replace(["\\N",0,'0','None','null'],np.nan,inplace=True)
s.replace(0, 5) df.replace([0, 1, 2, 3], 4) df.replace({0: 10, 1: 100}) df.replace({'A': {0: 100, 4: 400}})
df.replace({'A': r'^ba.$'}, {'A': 'new'}, regex=True)
df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'})
df.replace(regex=[r'^ba.$', 'foo'], value='new')
fq1 = fq1[fq1['phone_md5'].notnull()]
df = df[df['feature_new'] != '\\N']
df.drop(index=df[df['A'].isin([4])].index[0])
df.drop(index=df[df['A'] == 4].index[0])
df[df.isin({'D':[0,3],'E':['aa','cc']})]
import datetime
df['loan_dt'] = pd.to_datetime(df['loan_dt']).apply(lambda x: datetime.datetime.strftime(x,"%Y-%m-%d"))
print(df.Math_B.apply(lambda x: x * 2))
def x(a,b):
return a - b
df['d - a'] = df.apply(lambda f: x(f['d'],f['a']), axis=1)
employees_df["Age"]=employees_df["Age"].apply(str)
df['HIS_DW_DATE'] = df['HIS_DW_DATE'].apply(int)
df['Full Name'] = df[['First', 'Last']].apply(' '.join, axis=1)
df['Full Name'] = df['First'].str.cat(df['Last'],sep=" ")
df['Full Name'] = df[['First', 'Last']].agg(' '.join, axis=1)
df[df.label.str.startswith('199')]
df.apply(np.sum, axis=1)
df.apply(np.sqrt)
df.filter(regex='e$', axis=1)
df.filter(regex='^e', axis=1)
df.filter(like='201',axis=1)
df.filter(regex ='[aA]')
df = df[df.columns.drop(list(df.filter(regex='e$')))]
df=df[[c for c in df.columns if c.lower()[:1] != 'e']]
df.loc[:, [c for c in df.columns if not re.search(r'Test\d+', c)]]   # DataFrame.select() was removed from pandas
df.filter(like='bbi', axis=0)
data['result'] = data['result'].map(lambda x: x.lstrip('+-').rstrip('aAbBcC'))
Drop the last character: data['result'] = data['result'].map(lambda x: str(x)[:-1])
Drop the first two characters: data['result'] = data['result'].map(lambda x: str(x)[2:])
data['result'].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
print ('Salary: %s' % tinydict.get('Salary', 0.0))
print ('Salary: %s' % tinydict.get('Salary'))
temp['year'] = pd.to_datetime(temp['Date']).dt.year
print(len(data_te['v30_dz'].unique()), len(data_te['n30_hj_score'].unique()), len(data_te['n30_hj_score_new'].unique()))
df = df[~(df['y2'].isin([2]) | df['y1'].isin([2]) | df['y3'].isin([2]) | df['y4'].isin([2]))]
V_test = V_N_raw[~V_N_raw['phone'].isin(V_N_data['phone'].values.tolist())]
df[(df['label'] == 1) & (df['month'] == '2021-01')].shape
df[[x.startswith('张') for x in df['姓名']]]
altered_series=my_series.map(lambda x: str(x)+".00")
df = data[(data['a']<=2) | (data['b']>=5)]
tmp = tmp[tmp['year'].isin([1950, 1960, 1970, 1980, 1990, 2000, 2010])]
groups = df.groupby(by='month')
print(groups.get_group('2021-05'))
agg = groups['n21_score'].agg([np.sum, np.mean, np.std])
groups = df.groupby('Team')
score = lambda x: (x - x.mean()) / x.std()*10
print (groups.transform(score))
df= df.groupby('Team').filter(lambda x: len(x) >= 3)
for name, group in groups:
print(name,group)
print('{:.2%}'.format(group['n21_score'].isnull().sum(axis=0) /group.shape[0]))
target_mean_dict = df_train.groupby(col)[target_col].mean()
df_val[f'{col}_mean_target'] = df_val[col].map(target_mean_dict)
y_true, y_pred = np.array(y_true).reshape(-1), np.array(y_pred).reshape(-1)
df['data'].groupby(time_list).sum()
res = (np.where(np.isnan(df['n30_hj_score_new_empty']) & np.isnan(df['cy20_hj_score']), 1, 0))
print(df.columns[df.isnull().sum() > 0])
full.isnull().sum()[full.isnull().sum()>0]
for column in list(df.columns[df.isnull().sum() > 0]):
df[column].fillna(df[column].mean(), inplace=True)
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
pd.cut(df_f.积分,bins=3,labels=["低","中","高"])
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
cats1 = pd.cut(ages, bins, labels=group_names)
aa = pd.value_counts(cats)
s = pd.Series(np.random.randn(1000))
cats = pd.qcut(s, 4)
a = pd.value_counts(cats)
np.where(condition, x, y)
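np.where acts as a vectorized if/else; a tiny illustration (the column name and threshold are made up):
import numpy as np
import pandas as pd
df = pd.DataFrame({'n21_score': [550, 620, None, 710]})
df['flag'] = np.where(df['n21_score'] >= 600, 1, 0)   # condition True -> 1, otherwise 0 (NaN compares as False)
print(df)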
print(min(train['loan_dt']), max(train['loan_dt']))
N_new1["N_range1"] = pd.cut(x=N_new1["n21_score"], bins=[300, 400, 500, 600, 700, 800, 900])
sep_date = df1['loan_dt'][int(df1.shape[0] * 0.8)]
before = df1[df1['loan_dt'] < sep_date]
temp = temp.groupby(['year', 'Mean_TemperatureC']).agg({'Mean_TemperatureC': 'count'}).rename(columns={'Mean_TemperatureC': 'count'}).reset_index()
dftrain = pd.DataFrame(train_data, columns=['x']).dropna().sort_values('x')
dftrain.index = range(len(dftrain))
df_test.columns = df_test.columns.map(lambda x:x.decode('utf-8'))
features = df_test[[col for col in model.features if col in df_test.columns]]
df_test['diff'] = [round(x, 10) for x in df_test['diff']]
objectList=[]
classList=[]
numericalList=[]
for i in train_data.columns:
if train_data[i].dtype=='O':
objectList.append(i)
for i in list(train_data.select_dtypes(exclude=['object']).columns):
temp=train_data[i].unique()
if len(temp)<=10:
classList.append(i)
else:
numericalList.append(i)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['OPBAL_FLAG']=le.fit_transform(df['OPBAL_FLAG'])
df['CLSBAL_FLAG']=le.fit_transform(df['CLSBAL_FLAG'])
quantile_points = [dftrain['x'][int(np.ceil(float(indexmax * i) / parts))] for i in range(0, parts + 1)]
cut_points = list(pd.Series(quantile_points).drop_duplicates().values)
train_frequencies = __get_hist(list(dftrain['x'].values), cut_points)
psi_value = sum([(testf[i] - trainf[i]) * math.log(testf[i] / trainf[i]) for i in range(len(testf))])
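The lines above are excerpts from a PSI implementation (__get_hist, testf/trainf and parts come from that code). A self-contained sketch of the same idea, assuming quantile bins taken from the expected sample and a 1e-6 floor to avoid log(0); the psi()/psi_analysis() calls below still refer to the project's own implementation:
import numpy as np
import pandas as pd

def psi_simple(expected_scores, actual_scores, parts=10):
    expected = pd.Series(expected_scores).dropna()
    actual = pd.Series(actual_scores).dropna()
    # bin edges from the quantiles of the expected (baseline) sample
    edges = expected.quantile(np.linspace(0, 1, parts + 1)).drop_duplicates().values
    edges[0], edges[-1] = -np.inf, np.inf   # catch values outside the expected range
    expected_pct = (pd.cut(expected, edges).value_counts(sort=False) / len(expected)).clip(lower=1e-6)
    actual_pct = (pd.cut(actual, edges).value_counts(sort=False) / len(actual)).clip(lower=1e-6)
    return float(((actual_pct - expected_pct) * np.log(actual_pct / expected_pct)).sum())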
print('%.4f' % (psi(df_inner['score'], df_inner['score2'])))
result_psi2 = psi_analysis(n21_old_ls, n21_new_ls, parts=10)
print(result_psi2.columns)
print(result_psi2.values)
corr_te = data_te[['n21_score_new', 'n21_score', 'v30_dz_huisuV']].corr(method='spearman')
df_corr = pd.DataFrame(corr_te)
print(df_corr)
import re
print(float(re.split(r'[\"]?([0-9\.]*)[\"]?','1151226468812.22')[1]))
print(float(re.split(r'[\"]?([0-9\.]*)[\"]?','"1151226468812.22"')[1]))
for col in train.select_dtypes(include=['object']).columns:
print ("Column {} has {} unique instances".format( col, len(train[col].unique())) )
Plotting with matplotlib
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
fig1 = plt.figure(figsize=(5, 5), dpi=100)
plt.title('HBXJ产品V和N_new关系散点图', fontsize='xx-large')
plt.scatter(v30_dz_huisuV_ls, n21_new_ls, s=2)
plt.xlabel('V')
plt.ylabel('N_new')
corr_val = "%.6f" % (corr_te.loc['n21_score_new', 'v30_dz_huisuV'])
font = {'family': 'serif', 'style': 'italic', 'weight': 'normal', 'color': 'red', 'size': 20}
ax = plt.gca()
plt.text(0.25, 0.8, r'Pearson: ' + str(corr_val), fontdict=font, transform=ax.transAxes)
plt.show()
corr_df = employees_df.corr(method='pearson')
import seaborn as sns
plt.figure(figsize=(8, 6))
sns.heatmap(corr_df, annot=True)
plt.show()
corr_df = employees_df.corr(method='pearson')
corr_df.style.background_gradient(cmap='coolwarm')
plt.matshow(corr_df)
plt.show()
fig6 = plt.figure(figsize=(6, 5), dpi=100)
plt.title('HBXJ产品验证集N_old和N_new密度图', fontsize='xx-large')
p1 = sns.kdeplot(data_te['n21_score'], shade=True, color="r", label='n21_score')
p1 = sns.kdeplot(data_te['n21_score_new'], shade=True, color="b", label='n21_score_new')
plt.ylabel('density')
plt.xlabel('n21_score + n21_score_new')
plt.legend(loc=2)
psi = "%.6f" % (result_psi2.values[0][0])
font = {'family': 'serif','style': 'italic','weight': 'normal','color': 'red','size': 20}
ax = plt.gca()
plt.text(0.02, 0.8, r'PSI: ' + str(psi), fontdict=font, transform=ax.transAxes)
plt.show()
dis_cols=6
dist_rows=len(numericalList)
plt.figure(figsize=(4*dis_cols,4*dist_rows))
i=1
for col in numericalList:
ax=plt.subplot(dist_rows,dis_cols,i)
ax=sns.kdeplot(train_data[col],color='Red',shade=True)
ax=sns.kdeplot(test_data[col],color='Blue',shade=True)
ax.set_xlabel(col)
ax.set_ylabel("Frequency")
ax=ax.legend(["train","test"])
i+=1
plt.show()
numerical_fea = list(train.select_dtypes(exclude=['object']).columns)
category_fea = list(train.select_dtypes(include=['object']).columns)
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
missing = train.isnull().sum()
missing = missing[missing > 0]
missing_rate = missing/len(train)
missing_rate.plot.bar()
missingDf = data.isnull().sum().sort_values(ascending=False).reset_index()
missingDf.columns = ['feature', 'miss_num']
missingDf['miss_percentage'] = missingDf['miss_num'] / data.shape[0]
one_value_fea = [col for col in train.columns if train[col].nunique() <= 1]
one_value_fea_test = [col for col in testA.columns if testA[col].nunique() <= 1]
f = pd.melt(train, value_vars=numerical_fea)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")
def employmentLength_to_int(s):
if pd.isnull(s):
return s
else:
return np.int8(s.split()[0])
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
data['employmentLength'].value_counts()
column=data.columns.tolist()
fig=plt.figure(figsize=(80,60),dpi=75)
for i in range(len(column)):
plt.subplot(7,8,i+1)
sns.boxplot(data[column[i]].tolist(),orient="v",width=0.5)
plt.ylabel(column[i],fontsize=36)
plt.show()
def find_outliers_by_3sigma(data,fea):
data_std = np.std(data[fea])
data_mean = np.mean(data[fea])
outliers_cut_off = data_std * 3
lower_rule = data_mean - outliers_cut_off
upper_rule = data_mean + outliers_cut_off
data[fea+'_outliers'] = data[fea].apply(lambda x:str('异常值') if x > upper_rule or x < lower_rule else '正常值')
return data
for fea in column:
data_train = find_outliers_by_3sigma(train_data,fea)
print(train_data[fea+'_outliers'].value_counts())
print(train_data.groupby(fea+'_outliers')['isDefault'].sum())
print('*'*10)
train_data = train_data[train_data[fea+'_outliers']=='正常值']
train_data = train_data.reset_index(drop=True)
train_data.drop([fea+'_outliers'],axis=1,inplace=True)
train_data.info()
from sklearn.metrics import roc_curve
y_pred = [0, 1, 1, 0, 1, 1, 0, 1, 1, 1]
y_true = [0, 1, 1, 0, 1, 0, 1, 1, 0, 1]
FPR,TPR,thresholds=roc_curve(y_true, y_pred)
plt.title('ROC')
plt.plot(FPR, TPR,'b')
plt.plot([0,1],[0,1],'r--')
plt.ylabel('TPR')
plt.xlabel('FPR')
import pandas as pd
import numpy as np
import datetime
from CalPSI import *
import matplotlib.pyplot as plt
import seaborn as sns
def plot_kde_psi(f_online, f_offline, left_on, right_on, model_score_on, model_score_off):
df1 = pd.read_csv(f_online, sep='\t')
df2 = pd.read_csv(f_offline, sep='\t')
for i in ['loan_dt', 'apply_dt']:
if i in df1.columns:
df1[i] = pd.to_datetime(df1[i]).apply(lambda x: datetime.datetime.strftime(x, "%Y-%m-%d"))
if i in df2.columns:
df2[i] = pd.to_datetime(df2[i]).apply(lambda x: datetime.datetime.strftime(x, "%Y-%m-%d"))
print(df1.head(), df2.head(), df1.columns,df2.columns)
df = df1.merge(df2, how='inner', left_on=left_on, right_on=right_on)
print(df.shape, df, df.columns)
print('线上覆盖率: {:.2%}'.format(1 - df[model_score_on].isnull().sum() / df.shape[0]),
'线下覆盖率: {:.2%}'.format(1 - df[model_score_off].isnull().sum() / df.shape[0]))
print(df[model_score_on].dtype,df[model_score_off].dtype)
df[model_score_on].replace(["\\N",0,'0','None','null'],np.nan,inplace=True)
df[model_score_off].replace(["\\N",0,'0','None','null'],np.nan,inplace=True)
score_online_ls = df[model_score_on].values.tolist()
score_offline_ls = df[model_score_off].to_list()
print(df[model_score_on].describe,df[model_score_off].describe,)
print(score_offline_ls,'\n',score_online_ls)
result_psi2 = psi_analysis(score_online_ls, score_offline_ls, parts=10)
print('样本的score-psi:{:.6f}'.format(result_psi2.values[0][0]))
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
fig = plt.figure(figsize=(7, 5), dpi=100)
plt.title(str(model_score_off) + '_密度图', fontsize='xx-large')
sns.distplot(df[model_score_on], bins=30, rug=False, kde_kws={"label": "model_score_online"}, color="r")
sns.distplot(df[model_score_off], bins=30, rug=False, kde_kws={"label": "model_score_offline"},
color="b")
plt.ylabel('density')
plt.xlabel('线上线下分布对比')
plt.legend()
ax = plt.gca()
psi = '{:.6f}'.format(result_psi2.values[0][0])
font = {'family': 'serif', 'style': 'italic', 'weight': 'normal', 'color': 'red', 'size': 20}
plt.text(0.1, 0.7, r'PSI: ' + str(psi), fontdict=font, transform=ax.transAxes)
plt.show()
plot_kde_psi('fql_cy21xj_1k.txt', 'fql_cy21xj_10w.txt',
['name','phone','idcard','loan_dt'],
['name', 'phone', 'idcard', 'loan_dt'],
'cy21_xj_score_x', 'cy21_xj_score_y')
def drop_col(df, cutoff=0.1):
n = len(df)
cnt = df.count()
cnt = cnt / n
return df.loc[:, cnt[cnt >= cutoff].index]
df = drop_col(df)
print(df)
def sample_online_1k():
df=pd.read_csv('FQL_key_0111.txt',sep='\t')
df.drop_duplicates(subset=['name','idcard','phone','loan_dt'],inplace=True)
df.sort_values(by='loan_dt',inplace=True,ascending=False)
print(df,df.iloc[:1000,:])
df.iloc[:1000,:].to_csv('fql_new_1k.txt', sep='\t', index=False, encoding='utf-8')
sample_online_1k()
def func():
import os
print(os.getcwd())
files = os.listdir(path)
df1 = pd.read_csv(path + '/' + files[0],encoding='gbk')
for file in files[1:]:
df2 = pd.read_csv(path +'/'+file,encoding='gbk')
df1 = pd.concat([df1,df2],axis=0,ignore_index=True)
df1 = df1.drop_duplicates()
df1 = df1.reset_index(drop=True)
df1.to_csv(path + '/' + 'total.csv')
def func():
print(str(list(df1[['loan_dt']].count())[0]))
print(str(list(df1[['label']].sum())[0]))
a = df1[['label']].groupby(df1['loan_dt']).count()
b = df1[['label']].groupby(df1['loan_dt']).sum()
c = pd.concat([a, b], axis=1)
c.columns = ['cnt', 'ovd']
c['ovd_ratio'] = c['ovd'] * 1.0 / c['cnt']
def get_pri_key(path,new_path):
import pandas as pd
rankings_colname=['name','idcard_md5','phone_md5','loan_dt']
df = pd.read_csv(path, header=None, names=rankings_colname, sep='\t', usecols=[0, 1, 2, 3])
print(df.shape)
df.to_csv(new_path, sep='\t', index=False, encoding='utf-8')
import pandas as pd
import json
def dict_to_df(file, file_new):
rankings_colname = ['name', 'idcard_md5', 'phone_md5', 'loan_dt', 'score', 'resource_id', 'customer', 'feature_new', 'feature_old']
df = pd.read_csv(file, header=None, names=rankings_colname, sep='\t')
print(df.shape, '\n', df.head())
feature_new_list = df['feature_new'].apply(lambda x: eval(x)).tolist()
file_name1 = 'feature_new_'+str(file)+'.json'
with open(file_name1, 'w+') as f1:
json.dump(feature_new_list, f1)
feature_new = pd.read_json(file_name1, orient='records')
print(feature_new.shape, '\n', feature_new.head())
feature_old_list = df['feature_old'].apply(lambda x: eval(x)).tolist()
file_name2 = 'feature_old_' + str(file) + '.json'
with open(file_name2, 'w+') as f2:
json.dump(feature_old_list, f2)
feature_old = pd.read_json(file_name2, orient='records')
df2 = pd.concat([df.iloc[:, :-2], feature_new, feature_old], axis=1)
print(df2.shape, df2.head())
df.drop_duplicates(subset=['name', 'idcard_md5', 'phone_md5', 'loan_dt'], inplace=True)
print(df2.shape, df2.head())
df2.to_csv(file_new, sep='\t', index=False, encoding='utf-8')
return feature_new
import pandas as pd
import datetime
def get_month_miss(file):
df = pd.read_csv(file, sep='\t')
df.drop_duplicates(subset=['idcard_md5'], inplace=True)
print(df.shape)
print("n30_hj_score缺失率", '{:.2%}'.format(df['n30_hj_score'].isnull().sum(axis=0) / df.shape[0]),
"v30_dz缺失率",'{:.2%}'.format(df['v30_dz'].isnull().sum(axis=0) / df.shape[0]))
df['loan_dt'] = pd.to_datetime(df['loan_dt']).apply(lambda x: datetime.datetime.strftime(x, "%Y-%m-%d"))
df['month'] = df['loan_dt'].str[:7]
groups = df.groupby(by='month')
for name, group in groups:
print(name, '\t', group.shape[0], '\t',
'{:.2%}'.format(group['n30_hj_score'].isnull().sum(axis=0) / group.shape[0]), '\t',
'{:.2%}'.format(group['v30_dz'].isnull().sum(axis=0) / group.shape[0]), '\t',
)
def diff_cnt():
df = pd.read_csv(r"C:\Users\hongshaofeng\Desktop\XL_all.txt", sep='\t')
print(df.shape, df.head())
df["N_range1"] = pd.cut(x=df["Nscore"], bins=[300, 400, 500, 600, 700, 800, 900])
df["N_range2"] = pd.cut(x=df["Nscore_new"], bins=[300, 400, 500, 600, 700, 800, 900])
def function(a, b):
if a == b:
return 0
else:
return 1
df['bool'] = df.apply(lambda x: function(x['N_range1'], x['N_range2']), axis=1)
print('%.2f%%' % (df['bool'].sum()/df.shape[0]*100))
print('{:.4f}'.format(df['bool'].sum()/df.shape[0]))
df.to_csv('test.csv',encoding='utf-8',sep='\t',index=False)
def fea_map(file, model_lib_fea, fea_map, file_new):
df = pd.read_csv(file, sep='\t')
print(df.shape, df.head())
res = df.columns.to_list()
print(res)
res_lib = []
with open(model_lib_fea, 'r') as f:
for line in f:
res_lib.append(line.strip('\n'))
df2 = pd.read_csv(fea_map, header=None, names=['new_fea', 'fea'], sep='\t')
res_map1 = df2['new_fea'].to_list()
res_map2 = df2['fea'].to_list()
for i in res:
if i in res_map2:
res[res.index(i)] = res_map1[res_map2.index(i)]
print(len(res), res)
df.columns = res
print(df.columns)
df.to_csv(file_new, encoding='utf-8', sep='\t', index=False)
def jiemi():
df1 = pd.read_csv('wjwl_20210902_fea.txt', sep='\t')
print(df1.shape, df1.head())
df1.drop_duplicates(subset=['name', 'idcard_md5', 'phone_md5', 'loan_dt'], inplace=True)
print(df1.shape, df1.head())
df1.to_csv('wjwl_20210902_fea_new.txt', encoding='utf-8', sep='\t', index=False)
def rand_sample(file,file_new):
import pandas as pd
df_raw = pd.read_csv(file, sep='\t')
print(df_raw.shape)
df_raw.drop_duplicates(subset=['name','idcard_md5','phone_md5','loan_dt'], inplace=True)
df_sample = df_raw.sample(n=2000, random_state=1)
print(df_sample.head(), df_sample.shape)
df_sample.to_csv(file_new, encoding='utf-8', sep='\t', index=False)
print('V的空值率:{:.4f}'.format(1-data[V_columns].count()/data.shape[0]))
Switching the current working directory in Python
import os
print(os.getcwd())
if not os.getcwd().endswith('example9'):
os.chdir('./example9/')
print(os.getcwd())
os.path.join("just", "do", "python", "dot", "com")
os.path.exists(".")
Python interval labels
span_list = ['[%.3f,%.3f]' % (min(data[:, 1]), round(cut_list[1], 3))]
Using np.vstack(tup)
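A tiny np.vstack illustration (the arrays are made up):
import numpy as np
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
print(np.vstack((a, b)))   # stacks the arrays as rows -> shape (2, 3)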
assert 'label' in dftrain.columns, 'illegal input,there should be a "label" column in dftrain!'
assert set(dftrain['label']) == {0,1},'illegal label values,label can only be 0 or 1!'
from datetime import datetime, date
from datetime import timedelta
now_date = (date.today())
one_week_before = (date.today() - timedelta(days=7)).strftime("%Y-%m-%d")
data=data[(data['time']<=str(now_date)) &(data['time']>=str(one_week_before))]
from datetime import datetime, date, timedelta
yesterday = (date.today() + timedelta(days = -1)).strftime("%Y-%m-%d")
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import norm
def analyseAimVal(data):
sns.distplot(data , fit=norm)
(mu, sigma) = norm.fit(data)
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
fig = plt.figure()
res = stats.probplot(data,plot=plt)
plt.show()
analyseAimVal(train['SalePrice'])
import pandas as pd
train = pd.read_csv(r'C:\Users\hongshaofeng\Desktop\jiufu_out_data_0512.csv')
train["cy20_hj_score"] = np.log1p(train["cy20_hj_score"])
analyseAimVal(train["SalePrice"])
Quick ways to open the Windows environment-variables dialog:
Method 1: search for and select "Environment Variables".
Method 2: press Win+R, type "sysdm.cpl", then press Enter.
Method 3: press the Windows key, type "环境变量" (environment variables) or "huanjing", and open "Edit the system environment variables".
Method 4: from within Python:
import os
os.environ["path"] += os.pathsep + r"D:\Graphviz\bin"
dot_data=xgb.to_graphviz(bst, num_trees=0)
score_slice_arr = np.array_split(sort_score_ls, slice_num)   # np.array_split allows splitting into unequal-sized chunks!
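For example (a made-up array, just to show the unequal chunk sizes):
import numpy as np
print(np.array_split(np.arange(10), 3))   # -> [array([0, 1, 2, 3]), array([4, 5, 6]), array([7, 8, 9])]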
AUC (Area Under Curve) reflects the performance of a classification model on imbalanced samples. Computation: sklearn's metrics.roc_auc_score; TPR/FPR are computed across thresholds.
KS (Kolmogorov-Smirnov) measures how well a numeric or ordered categorical variable separates good users from bad users. KS = max{abs(Fn(x) - F0(x))}
IV (information value) measures how well a categorical variable separates good users from bad users. woe_i = ln( (N_bad_in_i / N_total_bad) / (N_good_in_i / N_total_good) ), i.e. woe_i = ln(bad_i / bad_T) - ln(good_i / good_T)
IV = sum( [ (N_bad_in_i / N_total_bad) - (N_good_in_i / N_total_good) ] * woe_i )
PSI (Population Stability Index) measures the stability of a variable or a model. PSI = sum( (actual share - expected share) * ln(actual share / expected share) )
WOE = ln(bad count in bin i / good count in bin i) - ln(total bad count / total good count). It can be read as how much the bad/good odds within a bin differ from the overall bad/good odds.
WOE (weight of evidence) is, in Bayesian terms, the increment by which the evidence updates the prior belief.
PSI measures the gap between an expected distribution and an actual distribution; IV instantiates those two distributions as the good-user distribution and the bad-user distribution.
The IV metric compares the good and bad distributions from an information-entropy point of view.
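A small numeric sketch of the KS and WOE/IV formulas above (the simulated scores, the bin edges, and the convention that label 1 = bad are assumptions):
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve

rng = np.random.RandomState(0)
df = pd.DataFrame({'score': rng.uniform(300, 900, 2000)})
df['label'] = (rng.rand(2000) < np.where(df['score'] < 600, 0.3, 0.1)).astype(int)   # 1 = bad (assumed)

# KS = max{abs(Fn(x) - F0(x))}, equivalently max|TPR - FPR| over all thresholds
fpr, tpr, _ = roc_curve(df['label'], df['score'])
ks = np.max(np.abs(tpr - fpr))

# WOE / IV over score bins
df['bin'] = pd.cut(df['score'], bins=[300, 400, 500, 600, 700, 800, 900])
grp = df.groupby('bin')['label'].agg(['sum', 'count'])
grp.columns = ['bad', 'total']
grp['good'] = grp['total'] - grp['bad']
bad_share = grp['bad'] / grp['bad'].sum()      # N_bad_in_i / N_total_bad
good_share = grp['good'] / grp['good'].sum()   # N_good_in_i / N_total_good
woe = np.log(bad_share / good_share)           # woe_i = ln(bad_i/bad_T) - ln(good_i/good_T)
iv = ((bad_share - good_share) * woe).sum()    # IV = sum((bad share - good share) * woe_i)
print('KS: %.4f  IV: %.4f' % (ks, iv))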
Outlier detection: in feature_analysis_tools.py, values outside the valid range (< 0) are all recorded as 1e-10; quantile-based detection is also used.
Missing-value imputation: handled inside xgb; when a value is missing, each branch is tried and the split direction with the larger gain is chosen.
Feature selection: 1) PSI filtering; 2) over 3 months, MaxPSI < 0.2 and avgPSI < 0.1; 3) IV filtering: IV > 0.01, or top-N by relative discriminative power; 4) xgb feature importance; 5) evaluation: KS, AUC. (A schematic of the filter is sketched below.)
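A schematic of the PSI/IV filter in steps 1-3 (the feature names and the psi/iv numbers are made up; in practice they come from computations like the ones above):
psi_values = {'f1': 0.03, 'f2': 0.25, 'f3': 0.08}   # hypothetical max 3-month PSI per feature
iv_values = {'f1': 0.15, 'f2': 0.02, 'f3': 0.004}   # hypothetical IV per feature
kept = [f for f in psi_values if psi_values[f] < 0.2 and iv_values[f] > 0.01]
print(kept)   # -> ['f1']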
import random
print(random.randint(1, 50))
print(random.uniform(1, 50))
def jiemi(file_raw,file_key):
df1 = pd.read_csv(file_raw, sep='\t')
print(df1.shape, df1.head())
df1.drop_duplicates(subset=['name', 'idcard_md5', 'phone_md5', 'loan_dt'], inplace=True)
print(df1.shape, df1.head())
df1.to_csv(file_key, encoding='utf-8', sep='\t', index=False)
When running directly in PyCharm the save directory is not created automatically(?); the working directory at run time is D:\tianjikit-master\tj_bank_tools_optimize\example10.
With Alt + Shift + E the working directory is the initial directory D:\tianjikit-master\tj_bank_tools_optimize.
Python regular expressions, part 3: ^ and $
import re
re.findall('^ba','abacd')
re.findall('^ab','abacd')
re.findall('ac$','abacd')
re.findall('cd$','abacd')
m=re.findall('^abacd$','abacd')
print('---------------------------test1---------------------------------', file=open('log.txt', 'w'))
import numpy as np
import pandas as pd
import sys
from feaAnalysis import *
def big_data_split(path):
flag = 0
for i in range(4):
if i == 0:
flag = [0,5,6,3,4] + [j for j in range(7, 10000)]
elif i in (1, 2):
flag = [0,5,6,3,4]+[j for j in range(10000*i, 10000*(1+i))]
elif i == 3:
flag =[0,5,6,3,4] + [j for j in range(30000, 34742)]
df = pd.read_csv(path, sep='\t', encoding='utf-8', usecols=flag)
df.rename(columns={'sha256_iden': 'idcard', 'sha256_phone': 'phone'}, inplace=True)
file_name = 'yixin_xxx0' + str(i+1) + '.csv'
print (df.shape, df.head(1))
df.to_csv(file_name, sep='\t', encoding='utf-8', index=False)
def concat_to_real(inpath1,inpath2,outpath):
df1 = pd.read_csv(inpath1, sep='\t')
df2 = pd.read_csv(inpath2, sep=',')
print(df1.shape, df2.shape)
df_new = df1.merge(df2, how='right', on=['name', 'idcard', 'phone', 'loan_dt', 'label'])
print(df_new.shape, df_new.isnull().sum())
df_new.to_csv(outpath, sep='\t', encoding='utf-8', index=False)
def xgb_select_data(inpath1, inpath2, outpath):
import pandas as pd
df2 = pd.read_csv(inpath2, sep=',')
df1 = pd.read_csv(inpath1, sep='\t')
tmp1, tmp2 = df1.columns.values.tolist(), df2['feature'].tolist()
tmp3 = list(set(tmp1) & set(tmp2))
print(len(tmp3))
publicData = df1[['name', 'idcard', 'phone', 'loan_dt', 'label']+tmp3]
print(publicData.shape)
publicData.to_csv(outpath, sep='\t', encoding='utf-8', index=False)
def concat_four_data():
res = []
for i in range(1, 4):
df1 = pd.read_csv('jiufu_meta02_xxx0'+str(i)+'_6000.csv', sep='\t')
res.append(df1)
df = pd.concat(res, axis=1, ignore_index=False, join="outer")
print(df.shape)
df.to_csv('jiufu_meta02_6000.csv', sep='\t', encoding='utf-8', index=False)
def huisu_data_split(path, out_train, out_test):
data = pd.read_csv(path, sep='\t')
res = data.sort_values(by='loan_dt', ascending=True)
train, test = res.iloc[:87303], res.iloc[87303:]
print(train.shape, test.shape)
train.to_csv(out_train, sep='\t')
test.to_csv(out_test, sep='\t')
def split_to_concat(file1, file2, file3, file4, base_train, base_test):
df1 = pd.read_csv(file1, sep='\t')
df2 = pd.read_csv(file2, sep='\t')
df3 = pd.read_csv(file3, sep='\t')
df4 = pd.read_csv(file4, sep='\t')
df = df1.merge(df2, how='right', on=['name', 'idcard', 'phone', 'loan_dt', 'label'])
df = df.merge(df3, how='right', on=['name', 'idcard', 'phone', 'loan_dt', 'label'])
df = df.merge(df4, how='right', on=['name', 'idcard', 'phone', 'loan_dt', 'label'])
res = df.sort_values(by='loan_dt', ascending=True)
train, test = res.iloc[:90131], res.iloc[90131:]
print(train.shape, test.shape)
train.to_csv(base_train, sep='\t', encoding='utf-8', index=False)
test.to_csv(base_test, sep='\t', encoding='utf-8', index=False)
def base_outer_concat():
import pandas as pd
df_base = pd.read_csv('xgb_832_data.csv', sep='\t')
df_outer = pd.read_csv('jiufu_out_data_0512.csv', sep=',')
df_base_outer1 = pd.concat([df_base, df_outer['cy20_hj_score']], axis=1, join='outer')
print(df_base_outer1.shape)
df_base_outer1.to_csv('base_outer_cy20.csv', sep='\t', encoding='utf-8', index=False)
def concat_to_real2():
df1 = pd.read_csv(r'C:\Users\hongshaofeng\Desktop\huisu.csv', sep='\t')
df2 = pd.read_csv(r'C:\Users\hongshaofeng\Desktop\data_0804.csv', sep=',')
print(df1.shape, df2.shape)
df_new = df1.merge(df2, how='outer', on=['name', 'idcard', 'phone', 'loan_dt'])
print(df_new.shape, df_new.isnull().sum())
df_new.to_csv(r'C:\Users\hongshaofeng\Desktop\huisu_new.csv', sep='\t', encoding='utf-8', index=False)
if __name__ == '__main__':
concat_to_real2()
Creating a Python 2.7 environment in PyCharm requires an Anaconda virtual environment; the 2.7 environment ships with only the basic packages, so third-party libraries have to be installed manually.
pip freeze > requirements.txt
pip install -r requirements.txt
pip install ./matplotlib-2.2.3-cp36-cp36m-win32.whl
print('%.6f' % (psi(df['cy20_hj_score'],df['model_score'])))
print(round(1.32434,3))
print('{:.3f}'.format(1.23456))
print(format(1.23456, '.2f'))
print('percent: {:.2%}'.format(42/50))
print('%.2f%%' % (2.322*100) )
import warnings
warnings.filterwarnings("ignore")
pip install numpy -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com pandas
pip install ./matplotlib-2.2.3-cp36-cp36m-win32.whl
import pandas as pd
df = pd.read_csv(r'C:\Users\hongshaofeng\Desktop\sas\sg_inner_data.csv',sep=',')
df= df.rename(columns={'Unnamed: 0':'seq'})
fea_list = df.columns.tolist()
with open('feature_list.txt','w') as f:
for i in fea_list:
f.writelines(str(i)+'\n')
res=[]
with open('feature_list.txt','r',encoding='utf-8') as f:
for line in f.readlines():
res.append(line.strip('\n'))
print(res)
result = re.findall(".*entry(.*)for.*",string)
with open(f_list_file, 'r') as f:
for line in f:
line = line.strip()
if line != '':
train_list.append(line)
from datetime import datetime, date, timedelta
yesterday = (date.today() + timedelta(days = -1)).strftime("%Y-%m-%d")
print(yesterday,'\t',df1.shape[0],'\t',df2.shape[0],'\t',
'{:.3%}'.format(df2.shape[0]/(df1.shape[0]+df2.shape[0])),'\t',
'{:.3%}'.format(df1['n30_hj_score'].isnull().sum(axis=0) / df1.shape[0]),'\t',
'{:.3%}'.format(df2['t.v_af_score'].isnull().sum(axis=0) / df2.shape[0]),'\t',
'{:.3%}'.format(abs(df1['n30_hj_score'].isnull().sum(axis=0) / df1.shape[0] - df2['t.v_af_score'].isnull().sum(axis=0) / df2.shape[0])),'\t',
'{:.3%}'.format(df1['score'].isnull().sum(axis=0) / df1.shape[0]),'\t',
'{:.2%}'.format(df2['score'].isnull().sum(axis=0) / df2.shape[0]),'\t',
'{:.3%}'.format(abs(df1['score'].isnull().sum(axis=0) / df1.shape[0] - df2['score'].isnull().sum(axis=0) / df2.shape[0])),'\t',
'{:.3%}'.format((df1['score'].isnull().sum(axis=0) / df1.shape[0]) *0.75 + (df2['score'].isnull().sum(axis=0) / df2.shape[0])*0.25),'\t',
psi,'\t',
abs(df1['n30_hj_score'].skew()-df2['t.v_af_score'].skew()),'\t',
abs(df1['n30_hj_score'].skew() - 0.030766),'\t',abs(df2['t.v_af_score'].skew() - 0.030766),
psi2,'\t',
file=open('log_jryk.txt','a'))
from datetime import datetime, date, timedelta
yesterday = (date.today() + timedelta(days = -1)).strftime("%Y-%m-%d")
print(yesterday,'\t',df1.shape[0],'\t',df2.shape[0],'\t',
'{:.3%}'.format(df2.shape[0]/(df1.shape[0]+df2.shape[0])),'\t',
'{:.3%}'.format(df1['n21_score'].isnull().sum(axis=0) / df1.shape[0]),'\t',
'{:.3%}'.format(df2['t.v_af_score'].isnull().sum(axis=0) / df2.shape[0]),'\t',
'{:.3%}'.format(abs(df1['n21_score'].isnull().sum(axis=0) / df1.shape[0] - df2['t.v_af_score'].isnull().sum(axis=0) / df2.shape[0])),'\t',
'{:.3%}'.format(df1['score'].isnull().sum(axis=0) / df1.shape[0]),'\t',
'{:.2%}'.format(df2['t.y_score'].isnull().sum(axis=0) / df2.shape[0]),'\t',
'{:.3%}'.format(abs(df1['score'].isnull().sum(axis=0) / df1.shape[0] - df2['t.y_score'].isnull().sum(axis=0) / df2.shape[0])),'\t',
'{:.3%}'.format((df1['score'].isnull().sum(axis=0) / df1.shape[0]) *0.92 + (df2['t.y_score'].isnull().sum(axis=0) / df2.shape[0])*0.08),'\t',
psi,'\t',
abs(df1['n21_score'].skew()-df2['t.v_af_score'].skew()),'\t',
abs(df1['n21_score'].skew() + 0.211146),'\t',abs(df2['t.v_af_score'].skew() + 0.211146),
psi2,'\t',
file=open('log_wjwl.txt','a'))
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO,filename='xf.log', filemode='w')
logging.info('data loading...')
def concat_data():
df1=pd.read_csv('360_key_1w.txt',sep='\t')
df1['flag']=1
df1.drop_duplicates(subset=['name','idcard','phone','loan_dt'],inplace=True)
df=df.merge(df1, how='left', on=['name', 'idcard', 'phone', 'loan_dt'])
print(df)
df=df[df['flag'].isnull()]
df[['name', 'idcard', 'phone', 'loan_dt']].to_csv('360_key_9w.txt', sep='\t', index=False, encoding='utf-8')
print(df)