目录
案例背景
某在线教育平台希望通过分析用户的学习行为数据,了解用户的学习习惯、课程偏好、学习效果等,从而优化课程推荐系统、提升用户体验、提高用户留存率和课程转化率。
代码实现
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Load the raw user-behaviour dataset from the working directory.
data = pd.read_csv('online_education_users.csv')

# Exploratory overview: column names, dtypes and non-null counts.
print('数据基本信息:')
data.info()  # info() writes its summary straight to stdout

# Decide how much of the table to dump based on its row count.
rows, columns = data.shape
if rows < 1000:
    # Small dataset (fewer than 1000 rows): show every row.
    print('数据全部内容信息:')
    print(data.to_csv(sep='\t', na_rep='nan'))
else:
    # Larger dataset: only show the first few rows.
    print('数据前几行内容信息:')
    print(data.head().to_csv(sep='\t', na_rep='nan'))
# --- Data cleaning ---
# Missing ages are imputed with the column median (taken before any rows
# are dropped below).
data['age'] = data['age'].fillna(value=data['age'].median())
# Rows missing either of these two columns are discarded.
data = data.dropna(axis=0, subset=['completed_courses', 'course_rating'])

# --- Feature engineering ---
# Total study time = weekday + weekend study time, summed row-wise.
data['total_study_time'] = (
    data.loc[:, ['study_time_weekday', 'study_time_weekend']].sum(axis='columns')
)
# One-hot encode the course category and append the indicator columns.
course_category_dummies = pd.get_dummies(
    data['course_category'],
    prefix='course_cat',
)
data = pd.concat([data, course_category_dummies], axis='columns')

# --- Feature selection ---
# Numeric features first, followed by every one-hot category column.
features = [
    'age',
    'total_study_time',
    'completed_courses',
    'course_rating',
    *course_category_dummies.columns,
]
target = 'is_premium_user'
X, y = data[features], data[target]
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit a gradient-boosting classifier on the training portion.
model = GradientBoostingClassifier(
    random_state=42,
    learning_rate=0.1,
    n_estimators=100,
).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions with the four metrics, in the same order as the
# names on the left-hand side.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report one metric per line.
print(
    f"准确率: {accuracy}",
    f"精确率: {precision}",
    f"召回率: {recall}",
    f"F1 值: {f1}",
    sep="\n",
)
# Data visualization
# Bar chart of the mean course rating for each course category.
# Group by category and average the ratings; the result is indexed by category.
category_avg_rating = data.groupby('course_category')['course_rating'].mean()
# Open a new 10x6 figure for the chart.
plt.figure(figsize=(10, 6))
# Categories go on the x axis, their mean rating on the y axis.
sns.barplot(x=category_avg_rating.index, y=category_avg_rating.values)
plt.title('不同课程类别的平均课程评分&#