
Statistical Machine Learning: Linear Regression and Classification

Chapter 2: Linear Regression and Classification

Example 2.1

import pandas as pd

# Load the diabetes data; the first CSV column is the row index.
data = pd.read_csv("../data/第2章数据/diabetes.csv", index_col=0)

columns = data.columns
xtitle = [c for c in columns if 'x.' in c]    # first-order terms, prefixed 'x.'
x2title = [c for c in columns if 'x2.' in c]  # second-order terms, prefixed 'x2.'
xdata = data[xtitle]
x2data = data[x2title]
ydata = data['y']
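
A quick sanity check of what was loaded (a sketch; it assumes the CSV layout implied by the filters above: a response column 'y', first-order columns prefixed 'x.', and second-order columns prefixed 'x2.'):

print(data.shape)                 # rows x columns of the full frame
print(xdata.shape, x2data.shape)  # first- and second-order design matrices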
import statsmodels.api as sm
import matplotlib.pyplot as plt
import scipy.stats

# OLS of y on the second-order terms, with an intercept column prepended.
X = sm.add_constant(x2data, prepend=True)
lm = sm.OLS(ydata, X)
lm_result = lm.fit()
# dir(lm_result)        # list the available result attributes
# lm_result.summary()   # full regression table

y_hat = lm_result.fittedvalues
res = lm_result.resid

# Residuals-vs-fitted plot to check for heteroscedasticity.
plt.figure()
plt.plot(y_hat, res, '.k')
# Incorrect: this rebinds plt.xlabel/ylabel/title to strings
# instead of calling them, so no labels are actually set.
'''
plt.xlabel = 'yhat'
plt.ylabel = 'residuals'
plt.title = 'residuals vs yhat'
'''
# Correct: call the functions with the label text.
plt.xlabel('yhat')  # Set x-axis label
plt.ylabel('residuals')  # Set y-axis label
plt.title('residuals vs yhat')  # Set title
plt.show()

[Figure: residuals vs. yhat scatter plot]

# Shapiro-Wilk test for normality of the residuals.
W, p_value = scipy.stats.shapiro(res)
W, p_value
(0.9937732815742493, 0.06650751084089279)
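
Since p ≈ 0.067 > 0.05, the Shapiro-Wilk test does not reject normality of the residuals at the 5% level. A complementary visual check (a minimal sketch reusing res and the modules imported above) is a normal Q-Q plot:

# Q-Q plot of the OLS residuals against the normal distribution;
# points close to the reference line support the normality assumption.
fig = sm.qqplot(res, line='s')  # 's' adds a standardized reference line
plt.title('Normal Q-Q plot of residuals')
plt.show()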

Example 2.2

import numpy as np

def kappa(x):
    # Condition number of X: the square root of the ratio of the
    # largest to the smallest eigenvalue of X'X.
    x = np.array(x)
    XX = np.dot(x.T, x)
    lam = np.linalg.eigvals(XX)
    return np.sqrt(lam.max() / lam.min())

kappa(xdata)
21.68154463827331
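
As a cross-check (not part of the book's code): the singular values of X are the square roots of the eigenvalues of X'X, so NumPy's built-in 2-norm condition number should reproduce kappa up to rounding:

# np.linalg.cond uses the ratio of the extreme singular values of X.
print(np.linalg.cond(np.asarray(xdata)))  # expect roughly 21.68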
import matplotlib.pyplot as plt
from sklearn import linear_model

# Trace the ridge coefficient path over a grid of penalties.
n_alphas = 200
alphas = np.logspace(-5, 3, n_alphas)
coefs = []
for a in alphas:
    ridge = linear_model.Ridge(alpha=a, fit_intercept=False)
    ridge.fit(xdata, ydata)
    coefs.append(ridge.coef_)

# Choose the penalty by cross-validation.
reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13))
reg.fit(xdata, ydata)
reg.alpha_

0.01
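The CV estimate can be sanity-checked by scoring each candidate penalty explicitly (a sketch; RidgeCV defaults to efficient leave-one-out CV, so a k-fold rerun may select a neighbouring grid point):

# Hedged sketch: re-score the same alpha grid with 5-fold CV.
from sklearn.model_selection import cross_val_score

grid = np.logspace(-6, 6, 13)
scores = [cross_val_score(linear_model.Ridge(alpha=a), xdata, ydata,
                          cv=5, scoring='neg_mean_squared_error').mean()
          for a in grid]
print(grid[int(np.argmax(scores))])  # compare with reg.alpha_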
# Plot the ridge coefficient paths.
ax = plt.gca()
ax.plot(alphas, coefs, label=xdata.columns)
ax.set_xscale("log")  # x-axis on a log scale
plt.axvline(reg.alpha_, linestyle="--", color="black", label='alpha: CV estimate')
ax.legend(loc='upper right')  # legend after all labelled artists are drawn
# Set the axis labels.
plt.xlabel('alpha')
plt.ylabel('weights')

plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.show()

[Figure: ridge coefficient paths vs. alpha, with the CV-chosen alpha marked]

Example 2.3

from sklearn.linear_model import LassoCV

# Choose the lasso penalty by 4-fold cross-validation.
lasso = LassoCV(cv=4).fit(X, ydata)

# Trace the lasso coefficient path over a grid of penalties.
n_alphas = 20
alphas = np.logspace(-2, 1, n_alphas)
clf = linear_model.Lasso(fit_intercept=False)

coefs = []
for a in alphas:
    clf.set_params(alpha=a)
    clf.fit(xdata, ydata)
    coefs.append(clf.coef_)
# Plot the lasso coefficient paths.
ax = plt.gca()
ax.plot(alphas, coefs, label=xdata.columns)
ax.set_xscale("log")  # x-axis on a log scale
plt.axvline(lasso.alpha_, linestyle="--", color="black", label='alpha: CV estimate')
ax.legend(loc='upper right')  # legend after all labelled artists are drawn
plt.xlabel('alpha')  # Set x-axis label
plt.ylabel('weights')  # Set y-axis label
plt.title('Lasso coefficients as a function of the regularization')
plt.axis('tight')
plt.show()
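
The same path can be computed in one call with scikit-learn's lasso_path (a sketch; the function sorts the grid into decreasing order and returns coefficients with shape (n_features, n_alphas)):

# One-call alternative to the manual set_params/fit loop above.
from sklearn.linear_model import lasso_path
alphas_path, coefs_path, _ = lasso_path(np.asarray(xdata), np.asarray(ydata),
                                        alphas=alphas)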

[Figure: lasso coefficient paths vs. alpha, with the CV-chosen alpha marked]

from sklearn.linear_model import LassoLarsIC
import time

# Lasso via LARS, with the penalty chosen by an information criterion.
t1 = time.time()
model_bic = LassoLarsIC(criterion='bic', fit_intercept=False)
model_bic.fit(X, ydata)
t_bic = time.time() - t1  # fitting time for the BIC model
alpha_bic_ = model_bic.alpha_

model_aic = LassoLarsIC(criterion='aic', fit_intercept=False)
model_aic.fit(X, ydata)
alpha_aic_ = model_aic.alpha_

def plot_ic_criterion(model, name, color):
    # Plot the criterion curve over -log10(alpha) and mark the chosen alpha.
    alpha_ = model.alpha_
    alphas_ = model.alphas_
    criterion_ = model.criterion_
    plt.plot(-np.log10(alphas_), criterion_, '--', color=color,
             linewidth=3, label='%s criterion' % name)
    plt.axvline(-np.log10(alpha_), color=color, linewidth=3,
                label='alpha: %s estimate' % name)
    plt.xlabel('-log(alpha)', fontdict={'size': 8})
    plt.ylabel('criterion', fontdict={'size': 8})

plt.figure()
plot_ic_criterion(model_aic, 'AIC', 'b')
plot_ic_criterion(model_bic, 'BIC', 'r')
plt.legend()
plt.title('Information criteria for model selection')
plt.show()
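
For reference, the penalties picked by the three selection rules can be printed side by side (a sketch assuming the fitted objects above are still in scope):

# Compare the penalty chosen by BIC, AIC, and 4-fold CV.
print('alpha (BIC):', alpha_bic_)
print('alpha (AIC):', alpha_aic_)
print('alpha (CV): ', lasso.alpha_)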