机器学习建模（三）

1 建模，导包第一步

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, make_scorer, confusion_matrix
from scipy import stats
import numpy as np
from sklearn.preprocessing import StandardScaler

2 计算AUC和95%置信区间

# Compute a 95% (by default) confidence interval for an AUC value
def calculate_ci(auc_score, n, confidence=0.95):
    """Return a normal-approximation confidence interval for an AUC.

    The standard error is the simple estimate ``sqrt(auc * (1 - auc) / n)``
    and the interval is ``auc +/- z * se``, clipped to ``[0, 1]``.

    Args:
        auc_score: Observed AUC; must lie in ``[0, 1]``.
        n: Number of samples the AUC was computed on; must be positive.
        confidence: Two-sided confidence level, strictly between 0 and 1.

    Returns:
        A ``(lower, upper)`` tuple of plain Python floats.

    Raises:
        ValueError: If ``n``, ``auc_score`` or ``confidence`` is out of range.
    """
    # Fail loudly on bad input: otherwise n == 0 divides by zero, and an
    # out-of-range AUC yields a NaN standard error that max()/min() would
    # silently turn into the meaningless interval (0, 1).
    if n <= 0:
        raise ValueError(f"n must be positive, got {n!r}")
    if not 0.0 <= auc_score <= 1.0:
        raise ValueError(f"auc_score must be within [0, 1], got {auc_score!r}")
    if not 0.0 < confidence < 1.0:
        raise ValueError(f"confidence must be within (0, 1), got {confidence!r}")

    # Two-sided critical value of the standard normal distribution.
    z = stats.norm.ppf(1 - (1 - confidence) / 2)
    se = np.sqrt(auc_score * (1 - auc_score) / n)
    half_width = float(z * se)

    ci_low = max(0.0, float(auc_score) - half_width)
    ci_up = min(1.0, float(auc_score) + half_width)
    return ci_low, ci_up

3 计算灵敏度和特异性

# Compute sensitivity and specificity at a given probability threshold
def calculate_sensitivity_specificity(y_true, y_pred_prob, threshold):
    """Return ``(sensitivity, specificity)`` for binary 0/1 labels.

    A sample is predicted positive when its probability is ``>= threshold``.
    The four confusion-matrix cells are counted directly with boolean masks,
    so the result does not depend on which classes happen to be present in
    the inputs.

    Args:
        y_true: Array-like of true labels, where 1 is the positive class and
            0 the negative class.
        y_pred_prob: Array-like of predicted positive-class probabilities,
            same length as ``y_true``.
        threshold: Probability cutoff used to binarise ``y_pred_prob``.

    Returns:
        ``(sensitivity, specificity)`` as floats. A rate whose denominator is
        zero (no positives, or no negatives, in ``y_true``) is ``nan``.
    """
    y_true_arr = np.asarray(y_true)
    # Turn the predicted probabilities into hard 0/1 labels.
    y_pred_label = (np.asarray(y_pred_prob) >= threshold).astype(int)

    is_pos = y_true_arr == 1
    is_neg = y_true_arr == 0
    pred_pos = y_pred_label == 1
    pred_neg = ~pred_pos

    tp = int(np.count_nonzero(is_pos & pred_pos))
    fn = int(np.count_nonzero(is_pos & pred_neg))
    tn = int(np.count_nonzero(is_neg & pred_neg))
    fp = int(np.count_nonzero(is_neg & pred_pos))

    # An empty class makes the corresponding rate undefined; report NaN
    # explicitly instead of triggering a 0/0 division.
    sensitivity = tp / (tp + fn) if (tp + fn) else float("nan")
    specificity = tn / (tn + fp) if (tn + fp) else float("nan")
    return sensitivity, specificity

4 计算最佳cutoff（阈值）

# Find the probability cutoff that maximises Youden's index
def find_best_cutoff(y_true, y_pred_prob):
    """Scan cutoffs 0.01..0.99 and return the one with the best Youden's J.

    Youden's index is ``sensitivity + specificity - 1``. On ties the lowest
    threshold wins, because only a strictly greater index replaces the
    current best.

    Args:
        y_true: Array-like of true binary labels.
        y_pred_prob: Array-like of predicted positive-class probabilities.

    Returns:
        ``(best_threshold, best_sensitivity, best_specificity)``. If no
        threshold yields an index above -1 (e.g. every index is NaN), the
        initial values ``(0.0, 0.0, 0.0)`` are returned.
    """
    best_threshold = 0.0
    best_youden_index = -1
    best_sensitivity = 0.0
    best_specificity = 0.0

    # Convert once so the per-threshold helper receives arrays it can compare.
    y_true = np.asarray(y_true)
    y_pred_prob = np.asarray(y_pred_prob)

    # Build the grid from integers: np.arange with a 0.01 float step drifts
    # (e.g. 0.060000000000000005), which misclassifies probabilities that sit
    # exactly on a grid value and stores untidy thresholds.
    thresholds = np.arange(1, 100) / 100.0

    for threshold in thresholds:
        sensitivity, specificity = calculate_sensitivity_specificity(y_true, y_pred_prob, threshold)

        # Youden's index for this cutoff
        youden_index = sensitivity + specificity - 1

        # Keep the cutoff only when it strictly improves on the best so far
        if youden_index > best_youden_index:
            best_youden_index = youden_index
            best_threshold = float(threshold)
            best_sensitivity = sensitivity
            best_specificity = specificity

    return best_threshold, best_sensitivity, best_specificity

5 存储计算的结果

results = dict()  # collected metrics

6 数据标准化

# Labels come from the 'group' column; features are the `selected_features`
# subset. `data_tr` is the training frame and `data_te` the validation frame.
y_tr = data_tr['group']
y_val = data_te['group']
X_tr = data_tr[selected_features]
X_val = data_te[selected_features]

# Standardisation: fit the scaler on the training features only, then apply
# that same fitted transform to both the training and the validation features.
scaler = StandardScaler()
scaler.fit(X_tr)
X_scaled = scaler.transform(X_tr)
X_val_scaled = scaler.transform(X_val)

7 模型定义

# Candidate classifiers, registered in the order they will be evaluated.
# The keys are display names padded to a common width.
models = {}
models["   Random Forest   "] = RandomForestClassifier()
models["Logistic Regression"] = LogisticRegression()
# probability=True is required so that SVC exposes predict_proba.
models["        SVM        "] = SVC(probability=True)
models["      XGBoost      "] = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
models["     AdaBoost      "] = AdaBoostClassifier(algorithm='SAMME')

# Stratified 5-fold cross-validation, shuffled with a fixed seed.
# NOTE(review): this is a single 5-fold pass, not "5 repetitions of 5-fold";
# confirm whether repeated CV was intended.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# AUC scorer that feeds predict_proba output to roc_auc_score.
scorer = make_scorer(roc_auc_score, response_method='predict_proba')

8 模型训练、验证、结果

for name, model in models.items():
    # Cross-validated AUC on the scaled training data
    auc_scores = cross_val_score(model, X_scaled, y_tr, scoring=scorer, cv=kf)
    mean_auc = np.mean(auc_scores)
    std_auc = np.std(auc_scores)

    # Refit on the full training set, then score the validation set using the
    # predicted probability of the positive class (column 1).
    model.fit(X_scaled, y_tr)
    y_val_pred_prob = model.predict_proba(X_val_scaled)[:, 1]
    auc_val = roc_auc_score(y_val, y_val_pred_prob)

    # 95% confidence interval for the validation AUC
    ci_low, ci_up = calculate_ci(auc_val, len(y_val))

    # Best cutoff by Youden's index.
    # NOTE(review): the cutoff is chosen on the same validation data that the
    # sensitivity/specificity are reported on, so those two figures are
    # optimistic — confirm this is intended.
    best_threshold, best_sensitivity, best_specificity = find_best_cutoff(y_val, y_val_pred_prob)

    # Record every metric for this model
    results[name] = dict(
        mean_auc=mean_auc,
        std_auc=std_auc,
        auc_val=auc_val,
        ci_low=ci_low,
        ci_up=ci_up,
        best_threshold=best_threshold,
        best_sensitivity=best_sensitivity,
        best_specificity=best_specificity,
    )

    # Report: summary line, cutoff line, separator
    print(
        f"{name}:\n Mean AUC (CV) = {mean_auc:.4f}, AUC (Val) = {auc_val:.4f}, 95% CI ({ci_low:.4f}, {ci_up:.4f})",
        f"Best Threshold = {best_threshold:.2f}, Sensitivity = {best_sensitivity:.4f}, Specificity = {best_specificity:.4f}",
        "--------------------------------------------------",
        sep="\n",
    )

机器学习建模（三）

悦读