Knn算法实现
k近邻算法¶
0.引入依赖¶
In [8]:
import numpy as np
import pandas as pd
#这里直接引入sklearn里面的数据集,iris 鸢尾花
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split # 切分数据集为训练集和测试集
from sklearn.metrics import accuracy_score #计算分类预测的准确率
1.数据加载和预处理¶
In [23]:
# Load the iris dataset bundled with scikit-learn.
iris = load_iris()

# One column per feature, named after the dataset's feature names.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Translate each integer target into its class name and store it as 'class'.
class_names = pd.Series(iris.target, index=df.index).map(
    lambda idx: iris.target_names[idx]
)
df['class'] = class_names

# Cell output: summary statistics of the numeric columns.
df.describe()
Out[23]:
In [24]:
# Feature matrix exactly as provided by the dataset object.
x = iris.data
# Targets reshaped into a single column (one row per sample).
y = np.reshape(iris.target, (-1, 1))
In [33]:
# Split into training and test sets: 30% of the samples are held out,
# the split is stratified on the labels and seeded for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=35,
    stratify=y,
)
Out[33]:
In [102]:
# Scratch check of argsort-based selection: positions of the three
# smallest values, in ascending order of value.
arr = np.argsort(np.array([1, 5, 3, 4]))[:3]
# The selected values as a Python list ...
test = list(np.array([1, 5, 3, 4])[arr])
# ... and as an array gathered in a single step.
test_2 = np.array([1, 5, 3, 4]).take(arr)
# Cell output: how many of the selected values are equal to 1.
test_2.tolist().count(1)
Out[102]:
In [109]:
# Cell output: position of the largest value in the sequence.
np.array([1, 5, 3, 4]).argmax()
# Disabled experiment, kept for reference:
# np.bincount([1,1,2,3,'1x'])
2.核心算法实现¶
In [150]:
# 距离函数定义
def l1_distance(a, b):
    """Manhattan (L1) distance between ``b`` and every row of ``a``.

    The element-wise difference ``a - b`` is reduced along axis 1, so the
    difference must have at least two dimensions.

    Returns one summed absolute difference per row.
    """
    abs_diff = np.abs(a - b)
    return abs_diff.sum(axis=1)
def l2_distance(a, b):
    """Euclidean (L2) distance between ``b`` and every row of ``a``.

    The squared element-wise difference ``(a - b) ** 2`` is summed along
    axis 1 and the square root of each sum is returned, so the difference
    must have at least two dimensions.
    """
    diff = a - b
    squared_sum = (diff ** 2).sum(axis=1)
    return np.sqrt(squared_sum)
# 分类器实现
class kNN(object):
    """Brute-force k-nearest-neighbours classifier.

    ``fit`` only stores the training data. ``predict`` computes the distance
    from each query point to every stored training point, keeps the
    ``n_neighbors`` closest ones and returns the most frequent label among
    them.
    """

    def __init__(self, n_neighbors=1, dist_func=l1_distance):
        """Configure the classifier.

        Args:
            n_neighbors: number of nearest training points that vote.
            dist_func: callable ``dist_func(train_points, query_point)``
                returning one distance per training point.
        """
        self.n_neighbors = n_neighbors
        self.dist_func = dist_func

    def fit(self, x, y):
        """Store the training features ``x`` and labels ``y`` for later use."""
        self.x_train = x
        self.y_train = y

    def predict(self, x):
        """Predict a label for every row of ``x``.

        Returns:
            Array of shape ``(x.shape[0], 1)`` with the dtype of the stored
            training labels.
        """
        # One prediction per query row, kept as a column like the labels.
        y_pred = np.zeros((x.shape[0], 1), dtype=self.y_train.dtype)

        for i, query in enumerate(x):
            # Distance from this query point to every training point.
            distances = self.dist_func(self.x_train, query)

            # Indices of the n_neighbors closest training points.
            nn_indexes = np.argsort(distances)[:self.n_neighbors]

            # Use the labels stored by fit(); the previous code read a
            # notebook-level variable named y_train here, which tied the
            # model to whatever that global happened to contain.
            nn_labels = self.y_train[nn_indexes].ravel()

            # Majority vote. np.unique returns the labels sorted, and argmax
            # returns the first maximum, so a tie goes to the smallest label.
            labels, counts = np.unique(nn_labels, return_counts=True)
            y_pred[i] = labels[np.argmax(counts)]

        return y_pred
In [160]:
# Classify the test set using 5 neighbours and the L1 distance.
kNN_model = kNN(dist_func=l1_distance, n_neighbors=5)
kNN_model.fit(x_train, y_train)
y_pred = kNN_model.predict(x_test)
In [161]:
# Cell output: fraction of test samples whose predicted label is correct.
accuracy_score(y_true=y_test, y_pred=y_pred)
Out[161]:
In [166]:
# Compare every (distance function, k) combination on the test set.
# NOTE(review): this cell rebinds `df` and `y_pred`, which earlier cells
# also define; re-running those earlier cells' consumers afterwards will
# see the values produced here.
knn = kNN()
knn.fit(x_train, y_train)

result_list = []
for p in (1, 2):
    # p == 1 selects the L1 distance, any other value the L2 distance.
    if p == 1:
        knn.dist_func = l1_distance
    else:
        knn.dist_func = l2_distance

    # Try the odd neighbour counts 1, 3, 5, 7 and 9.
    for k in range(1, 10, 2):
        knn.n_neighbors = k
        y_pred = knn.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(accuracy)
        result_list.append(
            [knn.n_neighbors, knn.dist_func.__name__, accuracy]
        )

# Collect the results in a table: k, distance function name, accuracy.
df = pd.DataFrame(result_list, columns=['k', "距离函数", "准确率"])
df
Out[166]:
In [ ]:
In [ ]: