Bootstrap

基于逻辑回归、GBDT、AdaBoost模型的客户流失预测(1)

# 导入所需的包
library(gbm)
library(ggplot2)
library(ada)
library(class)
library(MASS)
library(caret)

load_data = function(file_path) {
  names = c('state', 'acc_length', 'area', 'ph_num', 'inter_plan', 'vm_plan', 'num_vm_message', 'day_min',
            'day_calls', 'day_charge', 'eve_min', 'eve_calls', 'eve_charge', 'night_min', 'night_calls',
            'night_charge', 'inter_min', 'inter_calls', 'inter_charge', 'cus_ser_calls', 'churn')
  
  data = read.csv(file_path, header = TRUE)
  colnames(data) = names
  return(data)
}

check_data_feature = function(data) {
  print(dim(data))
  print(str(data))
  print(table(data$churn))
}

draw_feature_plot = function(data){
  par(mfrow = c(1,2))
  barplot(table(data$churn), col = "skyblue", main = "churn True or False", las = 1)
  barplot(table(data$cus_ser_calls), 
          col = "skyblue", main = "customer service calls times", las = 1)
  
  par(mfrow = c(4, 3), mar = c(3, 3, 2, 1))
  name <- c('day_min', 'day_calls', 'day_charge', 'eve_min', 'eve_calls', 'eve_charge',
            'night_min', 'night_calls', 'night_charge', 'inter_min', 'inter_calls', 'inter_charge')
  for (i in seq_along(name)) {
    numdata = as.numeric(data[[name[i]]])
    hist(numdata, main = paste('Density of', name[i]), xlab = name[i], col = 'skyblue', 
         border = 'black', probability = TRUE)
    lines(density(numdata), col = 'red', lwd = 2)
  }
}

feature_associated = function(data){
  inter_plan_counts <- table(data$inter_plan, data$churn)
  df_inter <- as.data.frame(inter_plan_counts)
  names(df_inter) <- c('inter_plan', 'churn', 'count')
  p1 <- ggplot(df_inter, aes(x = inter_plan, y = count, fill = churn)) +
    geom_bar(stat = 'identity', position = 'stack') +
    labs(title = 'Inter or No Inter of Churn', x = 'Inter or Not Inter', y = 'Number')
  
  cus_calls_counts <- table(data$cus_ser_calls, data$churn)
  df_cus <- as.data.frame(cus_calls_counts)
  names(df_cus) <- c('cus_ser_calls', 'churn', 'count')
  p2 <- ggplot(df_cus, aes(x = cus_ser_calls, y = count, fill = churn)) +
    geom_bar(stat = 'identity', position = 'stack') +
    labs(title = 'Customer Service Calls about Churn', x = 'Customer Service Calls', y = 'Numbers') +
    theme_minimal()
  print(p1)
  print(p2)
}

deal_data <- function(data) {
  # 分离目标变量
  y <- ifelse(data$churn == "True", 1, 0)
  
  # 将分类变量转换为虚拟变量
  new_inter <- model.matrix(~inter_plan - 1, data)
  new_vm_plan <- model.matrix(~vm_plan - 1, data)
  
  # 合并数据
  data_temp <- cbind(data, new_inter, new_vm_plan)
  
  # 删除无用的特征
  to_drop <- c('state', 'area', 'ph_num', 'inter_plan', 'vm_plan', 'churn')
  data_df <- data_temp[, !(names(data_temp) %in% to_drop)]
  
  # 标准化特征数据
  X <- scale(data_df)
  
  return(list(X = X, y = y))
}

choose_algorithm = function(X, y){
  set.seed(1)
  train_index <- sample(3333, 2333)
  
  # 从全体数据中选择训练集和测试集
  X_train <- X[train_index, ]
  X_test <- X[-train_index, ]
  y_train <- y[train_index]
  y_test <- y[-train_index]
  # 逻辑回归
  fit_lr <- glm(y_train ~ ., data = as.data.frame(X_train), family = binomial)
  pred_test_lr <- predict(fit_lr, newdata = as.data.frame(X_test), type = "response") > 0.5
  conf_matrix_test_lr <- table(predicted = pred_test_lr, Actual = y_test)
  Accuracy_test_lr <- sum(diag(conf_matrix_test_lr)) / sum(conf_matrix_test_lr)
  
  # 线性判别分析
  fit_lda <- lda(y_train ~ ., data = as.data.frame(X_train))
  pred_test_lda <- predict(fit_lda, newdata = as.data.frame(X_test))$class
  conf_matrix_test_lda <- table(predicted = pred_test_lda, Actual = y_test)
  Accuracy_test_lda <- sum(diag(conf_matrix_test_lda)) / sum(conf_matrix_test_lda)
  
  # K最近邻
  fit_knn <- knn(X_train, X_test, cl = y_train, k = 3)
  conf_matrix_test_knn <- table(predicted = fit_knn, Actual = y_test)
  Accuracy_test_knn <- sum(diag(conf_matrix_test_knn)) / sum(conf_matrix_test_knn)
  
  # 输出每个模型的测试集准确率
  cat("Logistic Regression Accuracy:", Accuracy_test_lr, "\n")
  cat("Linear Discriminant Analysis Accuracy:", Accuracy_test_lda, "\n")
  cat("K-Nearest Neighbors Accuracy:", Accuracy_test_knn, "\n")
  
  # 创建一个数据框,包含模型名称和准确率
  df <- data.frame(Model = c("LR", "LDA", "KNN(k=3)"),
                   Accuracy = c(Accuracy_test_lr, Accuracy_test_lda, Accuracy_test_knn))
  
  # 使用 ggplot2 绘制箱线图
  ggplot(df, aes(x = Model, y = Accuracy)) +
    geom_boxplot(fill = "lightblue", color = "darkblue") +
    geom_point(size = 3, color = "red") +
    labs(title = "Model Accuracies on Test Set", y = "Accuracy") +
    theme_minimal()
}

improve_result = function(X, y){
  set.seed(1)
  train_index <- sample(3333, 2333)
  # 从全体数据中选择训练集和测试集
  X_train <- X[train_index, ]
  X_test <- X[-train_index, ]
  y_train <- y[train_index]
  y_test <- y[-train_index]

  # 使用 gbm 进行梯度提升
  gb_model <- gbm(y_train ~ ., data = as.data.frame(X_train), distribution = "bernoulli", n.trees = 100, interaction.depth = 1, shrinkage = 0.1)
  
  # 在训练集上的预测
  prob_train <- predict(gb_model, newdata = as.data.frame(X_train), n.trees = 100, type = "response")
  pred_train <- prob_train > 0.5
  
  # 在测试集上的预测
  prob_test <- predict(gb_model, newdata = as.data.frame(X_test), n.trees = 100, type = "response")
  pred_test <- prob_test > 0.5
  
  # 创建混淆矩阵并计算准确率
  conf_matrix_train <- table(predicted = pred_train, Actual = y_train)
  accuracy_train <- sum(diag(conf_matrix_train)) / sum(conf_matrix_train)
  
  conf_matrix_test <- table(predicted = pred_test, Actual = y_test)
  accuracy_test <- sum(diag(conf_matrix_test)) / sum(conf_matrix_test)
  
  cat("Gradient Boosting训练集准确率:", accuracy_train, "\n")
  cat("Gradient Boosting测试集准确率:", accuracy_test, "\n")
  
  # 使用 ada 进行训练
  ada_model <- ada(y_train ~ ., data = as.data.frame(cbind(y_train, X_train)), iter = 100)
  # 在训练集上的预测
  pred_train_ada <- predict(ada_model, newdata = as.data.frame(X_train))
  
  # 在测试集上的预测
  pred_test_ada <- predict(ada_model, newdata = as.data.frame(X_test))
  
  # 创建混淆矩阵并计算准确率
  conf_matrix_train_ada <- table(predicted = pred_train_ada, Actual = y_train)
  accuracy_train_ada <- sum(diag(conf_matrix_train_ada)) / sum(conf_matrix_train_ada)
  
  conf_matrix_test_ada <- table(predicted = pred_test_ada, Actual = y_test)
  accuracy_test_ada <- sum(diag(conf_matrix_test_ada)) / sum(conf_matrix_test_ada)
  
  cat("AdaBoost训练集准确率:", accuracy_train_ada, "\n")
  cat("AdaBoost测试集准确率:", accuracy_test_ada, "\n")
}

hunxiao = function(){
  # 创建一个简单的混淆矩阵数据框
  conf_matrix_data <- data.frame(
    Actual = rep(c("0", "1"), each = 2),
    Predicted = rep(c("0", ""), times = 2),
    Value = c(869, 34, 4, 103)  # 修改为新的混淆矩阵的实际值
  )
  
  # 使用 ggplot2 创建混淆矩阵图
  ggplot(conf_matrix_data, aes(x = Actual, y = Predicted, fill = Value)) +
    geom_tile() +
    geom_text(aes(label = Value), vjust = 1) +
    scale_fill_gradient(low = "lightblue", high = "darkblue") +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      axis.title = element_blank(),
      axis.ticks = element_blank(),
      panel.grid = element_blank(),
      legend.position = "none"
    ) +
    labs(title = "Custom Confusion Matrix")
}

main = function(){
  file_path = "C:/Users/27128/Desktop/R_project/R_P/data.csv"
  data = load_data(file_path)
  check_data_feature(data)
  draw_feature_plot(data)
  feature_associated(data)
  result <- deal_data(data)
  X <- result$X
  y <- result$y
  choose_algorithm(X, y)
  improve_result(X, y)
  hunxiao()
}

main()

;