import torch
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
# Custom logistic-regression model
class LogisticRegression(object):
    """Binary logistic-regression classifier trained by full-batch gradient descent.

    The forward pass is ``sigmoid(X @ w + b)``; the parameters are optimised
    with PyTorch autograd against a mean-squared-error loss.
    """

    def __init__(self, max_iters=1000, learning_rate=1e-9, tol=1e-6):
        """Store the hyper-parameters and initialise empty training state.

        Args:
            max_iters: Maximum number of gradient-descent iterations.
            learning_rate: Step size applied to each gradient update.
            tol: Training stops early once the absolute change in loss between
                two consecutive iterations is ``<= tol``.
        """
        # Hyper-parameters
        self.max_iters = max_iters
        self.learning_rate = learning_rate
        self.tol = tol
        # Learnable parameters, created by fit()
        self.w = None
        self.b = None
        self.n_features = None
        # Per-iteration training history (plain Python floats)
        self.losses = []
        self.train_accs = []
        self.test_accs = []
        # Snapshot of the fitted parameters and the best accuracies observed
        self.best_w = None
        self.best_b = None
        self.best_train_acc = -float("inf")
        self.best_test_acc = -float("inf")

    def _logic_regression(self, X):
        """Forward pass: return the predicted probabilities, shape ``(n, 1)``."""
        return torch.sigmoid(X @ self.w + self.b)

    def _get_loss(self, y_true, y_pred):
        """Return the mean squared error between targets and predictions."""
        return ((y_true - y_pred) ** 2).mean()

    def _get_acc(self, X, y):
        """Return the fraction of rows in ``X`` whose predicted label equals ``y``.

        The result is a 0-dim float32 tensor.
        """
        y_pred = self.predict(X=X)
        y_true = y.reshape(-1)
        return (y_true == y_pred).to(dtype=torch.float32).mean()

    def fit(self, X, y, X_test=None, y_test=None):
        """Train the model on ``(X, y)`` with full-batch gradient descent.

        Args:
            X: 2-D float tensor of shape ``(n_samples, n_features)``.
            y: Tensor of 0/1 targets with ``n_samples`` elements; it is
                reshaped to ``(n_samples, 1)`` so that it lines up with the
                predictions instead of broadcasting to ``(n, n)``.
            X_test: Optional held-out features used only to record accuracy.
            y_test: Optional held-out targets used only to record accuracy.

        Raises:
            ValueError: If ``X`` is not 2-D or ``X`` and ``y`` disagree on the
                number of samples.

        Side effects:
            Resets and repopulates ``losses``, ``train_accs``, ``test_accs``,
            ``best_train_acc`` and ``best_test_acc``; sets ``w``, ``b`` and
            stores detached copies of the final parameters in ``best_w`` /
            ``best_b`` whether or not training stopped early.
        """
        if X.dim() != 2:
            raise ValueError("X must be a 2-D tensor of shape (n_samples, n_features)")
        y = y.reshape(-1, 1)
        if y.size(0) != X.size(0):
            raise ValueError("X and y must contain the same number of samples")

        if X_test is None or y_test is None:
            # Legacy fallback: earlier versions of this method read the
            # module-level X_test / y_test directly. Keep honouring them when
            # no held-out set is passed so existing scripts record the same
            # test-accuracy history as before.
            X_test = globals().get("X_test")
            y_test = globals().get("y_test")
        has_test = X_test is not None and y_test is not None

        # Start from a clean slate so that a refit neither appends to old
        # history nor compares its first loss with the previous run's last loss.
        self.n_features = X.size(1)
        self.losses = []
        self.train_accs = []
        self.test_accs = []
        self.best_train_acc = -float("inf")
        self.best_test_acc = -float("inf")
        self.best_w = None
        self.best_b = None

        # Initialise w and b
        self.w = torch.randn(size=(self.n_features, 1), requires_grad=True)
        self.b = torch.zeros(size=(1,), requires_grad=True)

        for step in range(self.max_iters):
            # Step 1: forward pass
            y_pred = self._logic_regression(X=X)
            # Step 2: loss
            loss = self._get_loss(y_true=y, y_pred=y_pred)
            self.losses.append(loss.item())

            # Accuracy is measured with the parameters used for this forward
            # pass, i.e. before the update below.
            train_acc = self._get_acc(X=X, y=y).item()
            self.train_accs.append(train_acc)
            self.best_train_acc = max(self.best_train_acc, train_acc)
            if has_test:
                test_acc = self._get_acc(X=X_test, y=y_test).item()
                self.test_accs.append(test_acc)
                self.best_test_acc = max(self.best_test_acc, test_acc)

            # Step 3: backward pass
            loss.backward()
            # Step 4: one gradient-descent step, kept out of the autograd graph
            with torch.no_grad():
                self.w -= self.learning_rate * self.w.grad
                self.b -= self.learning_rate * self.b.grad
            # Clear the gradients so they do not accumulate across iterations
            self.w.grad.zero_()
            self.b.grad.zero_()

            # Step 5: stop early once the loss has stopped changing
            if len(self.losses) >= 2 and abs(self.losses[-1] - self.losses[-2]) <= self.tol:
                print(f"提前终⽌迭代了,总共迭代次数为: {step + 1}")
                break

        # Always keep a detached copy of the fitted parameters; previously they
        # were stored only on early stop, leaving best_w / best_b as None when
        # max_iters was reached.
        self.best_w = self.w.detach().clone()
        self.best_b = self.b.detach().clone()

    def predict(self, X):
        """Return the predicted class labels (0 or 1) for every row of ``X``.

        Returns:
            1-D ``torch.long`` tensor with one label per sample; a probability
            of at least 0.5 maps to class 1.

        Raises:
            RuntimeError: If the model has no parameters yet.
        """
        if self.w is None or self.b is None:
            raise RuntimeError("model has no parameters; call fit() or assign w and b first")
        # Inference does not need gradients
        with torch.no_grad():
            y_pred = self._logic_regression(X=X)
        return (y_pred >= 0.5).to(dtype=torch.long).view(-1)
# Load the dataset
# Parse the CSV by hand: every non-empty row is a list of floats whose last
# column is the label and whose remaining columns are the features.
features, labels = [], []
with open(file="breast_cancer.csv", mode='r', encoding='utf8') as f:
    f.readline()  # discard the header row
    for raw in f:
        raw = raw.strip()
        if not raw:
            continue
        *feats, label = (float(tok) for tok in raw.split(","))
        features.append(feats)
        labels.append(label)

X = torch.tensor(data=features, dtype=torch.float32)
# Labels are kept as a column vector
y = torch.tensor(data=labels, dtype=torch.float32).view(-1, 1)

# Hold out 20% of the samples for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Standardise both splits with statistics from the training split only;
# the small epsilon guards against division by zero for constant columns.
_mean = X_train.mean(dim=0)
_std = X_train.std(dim=0) + 1e-9
X_train, X_test = ((part - _mean) / _std for part in (X_train, X_test))
# Build the model and train it on the training split
lr = LogisticRegression(learning_rate=1e-2, max_iters=100000)
lr.fit(X=X_train, y=y_train)

# Plot the loss curve and both accuracy curves on the same axes
plt.plot(lr.losses)
for _curve, _label in ((lr.train_accs, 'train_acc'), (lr.test_accs, 'test_acc')):
    plt.plot(_curve, label=_label)
plt.legend()
plt.title(label="Accs on Train and Test dataset")
plt.xlabel(xlabel="iters")
plt.ylabel(ylabel="Acc")

# Report the stored parameters and the best accuracies seen during training
print(f'最佳的权重w:{lr.best_w},最佳的偏置b:{lr.best_b},最佳的训练集准确率:{lr.best_train_acc},最佳的测试集准确率:{lr.best_test_acc}')
# Persist the trained model to disk. joblib.dump returns the list of written
# file names, so its result is deliberately not bound to `model`.
joblib.dump(lr, 'logistic_regression.model')

# The steps below simulate loading the model on a separate production machine
# and using it for inference.
# NOTE: joblib.load unpickles arbitrary Python objects; only load model files
# that come from a trusted source.
# NOTE(review): X_test was standardised earlier with _mean / _std, which are
# not stored in the model file; a real deployment would need them as well.
model = joblib.load('logistic_regression.model')

# Rebuild a model for inference from the stored parameters. Fall back to the
# final parameters when no "best" snapshot was recorded, so inference does not
# fail on a None weight.
best_model = LogisticRegression(max_iters=100)
best_model.w = model.best_w if model.best_w is not None else model.w
best_model.b = model.best_b if model.best_b is not None else model.b

y_pred = best_model.predict(X=X_test)
acc = (y_pred == y_test.view(-1)).to(dtype=torch.float32).mean().item()
# A bare `acc` expression displays nothing when run as a script
print(acc)