import lightgbm as lgb
import numpy as np
import optuna
from sklearn.metrics import roc_curve
# Custom evaluation metric: edge separation combined with the KS statistic.
def edge_separation_and_ks(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Prediction thresholds bounding the top-1% and bottom-1% scored samples.
    num_samples = len(y_pred)
    num_edge_samples = max(1, int(num_samples * 0.01))
    sorted_pred = np.sort(y_pred)
    cutoff_threshold_low = sorted_pred[num_edge_samples]
    cutoff_threshold_high = sorted_pred[-num_edge_samples - 1]
    # Samples whose predictions fall strictly between the two thresholds are
    # treated as uncertain.
    uncertainty_mask = (y_pred > cutoff_threshold_low) & (y_pred < cutoff_threshold_high)
    # Edge separation: difference between the mean true label of the top-1%
    # and the bottom-1% scored samples.
    order = np.argsort(y_pred)
    low_score_labels = y_true[order[:num_edge_samples]]
    high_score_labels = y_true[order[-num_edge_samples:]]
    score_diff = high_score_labels.mean() - low_score_labels.mean()
    # KS statistic: maximum gap between the TPR and FPR curves.
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    ks = np.max(tpr - fpr)
    # Combine edge separation and KS into a single evaluation value.
    return 'edge_separation_and_ks', score_diff * (1 + uncertainty_mask.mean()) * ks, True

# Wrapper matching LightGBM's feval signature: lgb.train calls
# feval(predictions, Dataset), so the labels must be read off the Dataset.
def lgb_edge_separation_and_ks(y_pred, dataset):
    return edge_separation_and_ks(dataset.get_label(), y_pred)
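# Optional sanity check on synthetic data (illustrative only; the sample size,
# class balance and noise level below are arbitrary assumptions). A score that
# correlates with the label should give a clearly positive metric value.
_rng = np.random.default_rng(0)
_y = _rng.integers(0, 2, size=1000)
_p = np.clip(0.6 * _y + _rng.normal(0.2, 0.15, size=1000), 0.0, 1.0)
print(edge_separation_and_ks(_y, _p))  # -> (name, value, is_higher_better)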
# Data preparation (X_train/y_train, X_val/y_val, X_test/y_test are assumed
# to be defined elsewhere).
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
# Optuna objective: maximize the edge-separation-and-KS value on the validation set.
def maximize_edge_separation_and_ks(trial):
    # Hyperparameter search space.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'None',  # disable built-in metrics; only the custom feval is used
        'num_leaves': trial.suggest_int('num_leaves', 16, 128),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'verbose': -1,
    }
    # Train with early stopping on the custom metric and score the validation set.
    model = lgb.train(params,
                      train_data,
                      num_boost_round=5000,
                      valid_sets=[val_data],
                      feval=lgb_edge_separation_and_ks,
                      callbacks=[lgb.early_stopping(100),
                                 lgb.log_evaluation(0)])
    y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)
    return edge_separation_and_ks(y_val, y_pred_val)[1]
# Create the Optuna study and run the optimization.
study = optuna.create_study(direction='maximize')
study.optimize(maximize_edge_separation_and_ks, n_trials=50)
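# Optional: inspect the search afterwards. study.trials_dataframe() is part of
# the public Optuna API, though its exact column layout can vary by version.
print('Best value: ', study.best_value)
print(study.trials_dataframe()[['number', 'value', 'state']].head())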
# Report the best parameters, retrain a final model with them (merged with the
# fixed settings, which study.best_params does not contain), and evaluate on
# the test set.
print('Best Parameters: ', study.best_params)
best_model = lgb.train({**study.best_params,
                        'boosting_type': 'gbdt',
                        'objective': 'binary',
                        'metric': 'None',
                        'verbose': -1},
                       train_data,
                       num_boost_round=5000,
                       valid_sets=[val_data],
                       feval=lgb_edge_separation_and_ks,
                       callbacks=[lgb.early_stopping(100),
                                  lgb.log_evaluation(0)])
y_pred_test = best_model.predict(X_test, num_iteration=best_model.best_iteration)
edge_separation_and_ks_test = edge_separation_and_ks(y_test, y_pred_test)[1]
print('Edge Separation and KS on Test Set: {:.4f}'.format(edge_separation_and_ks_test))
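# Possible follow-up: persist the trained booster. save_model is a standard
# Booster method; the filename here is an arbitrary placeholder.
best_model.save_model('edge_separation_ks_model.txt')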