1 建模,导包第一步
import numpy as np
from scipy import stats
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, make_scorer, confusion_matrix
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier
2 计算AUC和95%置信区间
# Confidence interval for an AUC value (normal approximation).
def calculate_ci(auc_score, n, confidence=0.95):
    """Return a two-sided confidence interval for an AUC value.

    The interval is ``auc_score +/- z * se``, where ``z`` is the
    standard-normal quantile for the requested confidence level and
    ``se = sqrt(auc_score * (1 - auc_score) / n)``. Both bounds are clipped
    to the range [0, 1].

    Args:
        auc_score: Observed AUC.
        n: Sample size used in the standard-error term.
        confidence: Two-sided confidence level (default 0.95).

    Returns:
        Tuple ``(lower, upper)``.
    """
    tail = (1 - confidence) / 2
    z_crit = stats.norm.ppf(1 - tail)
    std_err = np.sqrt(auc_score * (1 - auc_score) / n)
    half_width = z_crit * std_err
    lower = auc_score - half_width
    upper = auc_score + half_width
    # Clip so the reported interval never leaves the valid AUC range.
    return max(0, lower), min(1, upper)
3 计算灵敏度和特异性
# Sensitivity and specificity at a fixed probability cutoff.
def calculate_sensitivity_specificity(y_true, y_pred_prob, threshold):
    """Compute sensitivity and specificity of thresholded binary predictions.

    A sample is predicted positive when its probability is ``>= threshold``.
    Labels are expected to be binary with ``1`` as the positive class; any
    other label value is counted as negative.

    The four outcome counts are taken directly with NumPy instead of by
    unpacking a confusion matrix into four values. That unpacking fails when
    only one class occurs in both the labels and the predictions, because the
    matrix is then 1x1. Counting directly also lets plain Python lists be
    passed for either input.

    Args:
        y_true: Array-like of true binary labels.
        y_pred_prob: Array-like of predicted positive-class probabilities,
            same length as ``y_true``.
        threshold: Probability cutoff; ``>= threshold`` means positive.

    Returns:
        Tuple ``(sensitivity, specificity)`` of floats. A rate whose
        denominator is zero (no positive samples, or no negative samples)
        is returned as ``nan``.
    """
    actual_pos = np.asarray(y_true).ravel() == 1
    predicted_pos = np.asarray(y_pred_prob, dtype=float).ravel() >= threshold

    tp = np.count_nonzero(actual_pos & predicted_pos)
    fn = np.count_nonzero(actual_pos & ~predicted_pos)
    tn = np.count_nonzero(~actual_pos & ~predicted_pos)
    fp = np.count_nonzero(~actual_pos & predicted_pos)

    # An undefined rate is reported as nan rather than raising, so callers
    # that compare rates (nan compares False) simply skip such cutoffs.
    sensitivity = tp / (tp + fn) if (tp + fn) else float("nan")
    specificity = tn / (tn + fp) if (tn + fp) else float("nan")
    return sensitivity, specificity
4 计算最佳cutoff(阈值)
# Best probability cutoff by Youden's index.
def find_best_cutoff(y_true, y_pred_prob):
    """Find the cutoff in 0.01..0.99 that maximises Youden's index.

    Every cutoff on a 0.01-spaced grid is evaluated with
    ``calculate_sensitivity_specificity`` and the one with the largest
    ``sensitivity + specificity - 1`` is kept. On ties the lowest cutoff
    wins, because only a strictly larger index replaces the current best.

    Args:
        y_true: True binary labels.
        y_pred_prob: Predicted positive-class probabilities.

    Returns:
        Tuple ``(best_threshold, best_sensitivity, best_specificity)``.
        If no cutoff yields an index greater than -1 (for example when the
        index is nan for every cutoff), the initial values
        ``(0.0, 0.0, 0.0)`` are returned.
    """
    # Build the grid from integers so each cutoff is the correctly rounded
    # two-decimal value. np.arange(0.01, 1.00, 0.01) accumulates float error
    # (e.g. 0.29 comes out as 0.29000000000000004), which makes a predicted
    # probability of exactly 0.29 fall below the ">= 0.29" cutoff.
    thresholds = np.arange(1, 100) / 100

    best_threshold = 0.0
    best_youden_index = -1
    best_sensitivity = 0.0
    best_specificity = 0.0

    for threshold in thresholds:
        sensitivity, specificity = calculate_sensitivity_specificity(
            y_true, y_pred_prob, threshold
        )
        youden_index = sensitivity + specificity - 1
        # Strict '>' keeps the first (lowest) cutoff among equal scores and
        # skips nan scores, since any comparison with nan is False.
        if youden_index > best_youden_index:
            best_youden_index = youden_index
            best_threshold = threshold
            best_sensitivity = sensitivity
            best_specificity = specificity

    return best_threshold, best_sensitivity, best_specificity
5 存储计算的结果
results = {} # Collected evaluation metrics; one entry is stored per model name
6 数据标准化
# Features (the selected_features subset) and the 'group' label for the
# training data (data_tr) and the validation data (data_te).
X_tr, y_tr = data_tr[selected_features], data_tr['group']
X_val, y_val = data_te[selected_features], data_te['group']

# Standardize: the scaler is fitted on the training features only, then the
# same fitted transform is applied to both the training and validation sets.
scaler = StandardScaler()
scaler.fit(X_tr)
X_scaled = scaler.transform(X_tr)
X_val_scaled = scaler.transform(X_val)
7 模型定义
# One seed shared by the CV splitter and the stochastic estimators, so that
# repeated runs of the script report the same numbers.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the name used for reporting.
models = {
    " Random Forest ": RandomForestClassifier(random_state=RANDOM_STATE),
    "Logistic Regression": LogisticRegression(),
    # probability=True is required so that SVC exposes predict_proba.
    " SVM ": SVC(probability=True, random_state=RANDOM_STATE),
    " XGBoost ": XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=RANDOM_STATE,
    ),
    " AdaBoost ": AdaBoostClassifier(algorithm='SAMME', random_state=RANDOM_STATE),
}
# A single stratified, shuffled 5-fold split (one pass of 5 folds, not a
# repeated cross-validation).
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
# AUC scorer that feeds roc_auc_score with predict_proba output.
scorer = make_scorer(roc_auc_score, response_method='predict_proba')
8 模型训练、验证、结果
# Train, cross-validate and evaluate every candidate model.
for name, model in models.items():
    # Cross-validated AUC on the training set. The scaler is part of the
    # cross-validated estimator and the raw training features are passed in,
    # so each fold fits its own scaler on that fold's training part only.
    # Cross-validating on features already scaled with statistics from the
    # whole training set would leak held-out-fold information into each fold.
    cv_estimator = make_pipeline(StandardScaler(), model)
    auc_scores = cross_val_score(cv_estimator, X_tr, y_tr, cv=kf, scoring=scorer)
    mean_auc = auc_scores.mean()
    std_auc = auc_scores.std()

    # Final model: fit on the full (scaled) training set.
    model.fit(X_scaled, y_tr)

    # Positive-class probabilities and AUC on the validation set.
    y_val_pred_prob = model.predict_proba(X_val_scaled)[:, 1]
    auc_val = roc_auc_score(y_val, y_val_pred_prob)

    # 95% confidence interval of the validation AUC.
    ci_low, ci_up = calculate_ci(auc_val, len(y_val))

    # NOTE(review): the cutoff is chosen on the same validation data that the
    # sensitivity and specificity below are reported on, so those two figures
    # are optimistic estimates.
    best_threshold, best_sensitivity, best_specificity = find_best_cutoff(
        y_val, y_val_pred_prob
    )

    # Keep every metric for later use.
    results[name] = {
        'mean_auc': mean_auc,
        'std_auc': std_auc,
        'auc_val': auc_val,
        'ci_low': ci_low,
        'ci_up': ci_up,
        'best_threshold': best_threshold,
        'best_sensitivity': best_sensitivity,
        'best_specificity': best_specificity
    }

    # Report.
    print(f"{name}:\n Mean AUC (CV) = {mean_auc:.4f}, AUC (Val) = {auc_val:.4f}, 95% CI ({ci_low:.4f}, {ci_up:.4f})")
    print(f"Best Threshold = {best_threshold:.2f}, Sensitivity = {best_sensitivity:.4f}, Specificity = {best_specificity:.4f}")
    print("--------------------------------------------------")