课程名称:Python3入门机器学习 经典算法与应用 入行人工智能 课程章节:4-1;4-2;4-3 主讲老师:liuyubobobo
工作原理:假如新来了一个点x,在已有的样本点中找到离x最近的k个点;如果这k个点中标签为0的占多数,就判断x的类别为0,反之判断为1。
# Preparation: turn the raw samples into arrays and plot the two classes.
import numpy as np
import matplotlib.pyplot as plt

# raw_data_X / raw_data_y are not defined in this snippet; they must already
# exist (sample coordinates and their 0/1 labels) before it is run.
X_train = np.array(raw_data_X)
y_train = np.array(raw_data_y)

# One scatter call per class label, so each class gets its own default color.
for label in (0, 1):
    members = y_train == label
    plt.scatter(X_train[members, 0], X_train[members, 1])
plt.show()
# Boolean-mask indexing: from the rows of X_train whose label satisfies
# y_train == 0, take the first column.
X_train[y_train == 0, 0]

# A newly observed point; we want to decide whether it indicates a tumor.
x = np.array([8.093607318, 3.365731514])

# Split the training samples by label, then plot both classes and the new point.
class_0 = X_train[y_train == 0]
class_1 = X_train[y_train == 1]
plt.scatter(class_0[:, 0], class_0[:, 1])
plt.scatter(class_1[:, 0], class_1[:, 1])
plt.scatter(x[0], x[1])
plt.show()
# The figure shows clearly that the new point belongs with the y_train == 1
# samples. The next step is to reach the same conclusion with the kNN algorithm.
# Manual kNN on the data prepared above (uses X_train, y_train and x).
from collections import Counter
from math import sqrt

# Euclidean distance from every training sample to x: subtract x from one
# row, square each component, sum them and take the square root.
distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in X_train]

# Indices of the training samples ordered from nearest to farthest.
nearst = np.argsort(distances)

# kNN with the 6 nearest samples.
k = 6

# Labels (y_train values) of the k samples closest to x.
topK_y = [y_train[i] for i in nearst[:k]]

# Count how many times each label occurs among those k neighbours.
votes = Counter(topK_y)

# most_common(1) returns a one-element list [(label, count)]; the label with
# the most votes is the prediction (here: the tumor is predicted malignant).
predict_y = votes.most_common(1)[0][0]
predict_y
# Preparation: load the training data and the new point, then predict whether
# the tumor is malignant using scikit-learn's ready-made kNN classifier.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

raw_data_X = np.array([
    [3.393533211, 2.331273381],
    [3.110073483, 1.781539638],
    [1.343808831, 3.368360954],
    [3.582294042, 4.679179110],
    [2.280362439, 2.866990263],
    [7.423436942, 4.696522875],
    [5.745051997, 3.533989803],
    [9.172168622, 2.511101045],
    [7.792783481, 3.424088941],
    [7.939820817, 0.791637231],
])
raw_data_y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

X_train = np.array(raw_data_X)
y_train = np.array(raw_data_y)

# The query is a 2-D array of shape (1, 2): one row per sample to predict.
x = np.array([[8.093607318, 3.365731514]])

# Create the classifier; n_neighbors=6 plays the role of k=6.
KNN_classifiter = KNeighborsClassifier(n_neighbors=6)

# fit takes the training samples and their labels.
KNN_classifiter.fit(X_train, y_train)

# x is already 2-D, so it is passed as-is; wrapping it in another list would
# produce a 3-D input, which predict does not accept.
y_predict = KNN_classifiter.predict(x)
y_predict  # the prediction, one label per query row
import numpy as np
from math import sqrt
from collections import Counter


def KNN_classify(k, X_train, y_train, x):
    """Classify a single sample with the k-nearest-neighbours rule.

    Args:
        k: Number of neighbours that vote; must satisfy
            1 <= k <= number of training samples.
        X_train: 2-D array of training samples, one row per sample.
        y_train: 1-D array of labels, one per row of X_train.
        x: 1-D array holding the features of the sample to classify.

    Returns:
        The label that occurs most often among the k training samples
        closest to x (Euclidean distance).

    Raises:
        AssertionError: If k is out of range, or the shapes of X_train,
            y_train and x are inconsistent.
    """
    assert 1 <= k <= X_train.shape[0], 'k is not nice'
    assert X_train.shape[0] == y_train.shape[0], \
        'the data X_train and y_train is mismatch'
    assert X_train.shape[1] == x.shape[0], \
        'The feature quantities of X_train and x do not match'

    # Euclidean distance from x to every training sample. The square must be
    # applied to each component *before* summing; squaring the sum instead
    # lets positive and negative differences cancel out.
    distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in X_train]

    # Indices of the training samples ordered from nearest to farthest.
    nearest = np.argsort(distances)

    # Labels of the k nearest samples.
    topK_y = [y_train[i] for i in nearest[:k]]

    # Majority vote: most_common(1) yields [(label, count)].
    votes = Counter(topK_y)
    return votes.most_common(1)[0][0]
import numpy as np
from math import sqrt
from collections import Counter


class KNNClassifier:
    """k-nearest-neighbours classifier with a fit/predict interface."""

    def __init__(self, k):
        """Create a classifier in which the k nearest samples vote.

        Raises:
            AssertionError: If k is smaller than 1.
        """
        assert k >= 1, 'k is small'
        self.k = k
        # Training data; both stay None until fit() has been called.
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Store the training set X_train / y_train in the classifier.

        kNN has no real training step: fit only validates and keeps the
        data, and all computation happens in predict.

        Returns:
            The classifier itself, so calls can be chained.

        Raises:
            AssertionError: If X_train and y_train have different numbers
                of samples, or k exceeds the number of samples.
        """
        # X_train and y_train must describe the same number of samples.
        assert X_train.shape[0] == y_train.shape[0], \
            'the data X_train and y_train is mismatch'
        # There must be at least k samples to vote.
        assert self.k <= X_train.shape[0], \
            'k is big to X_train'
        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """Predict a label for every row of the 2-D array X_predict.

        Returns:
            A NumPy array with one predicted label per row of X_predict.

        Raises:
            AssertionError: If fit has not been called, or X_predict has a
                different number of features than the training data.
        """
        # The classifier must have been fitted first.
        assert self._X_train is not None and self._y_train is not None, \
            'No X_train and y_train'
        # The samples to predict need the same number of features.
        assert X_predict.shape[1] == self._X_train.shape[1], \
            'X_predict and _X_train 特征 is not same'
        y_predict = [self._predict(x) for x in X_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """Return the predicted label for the single sample x (1-D array)."""
        assert x.shape[0] == self._X_train.shape[1]
        # Euclidean distance from x to every training sample. The square is
        # applied to each component before summing; squaring the sum instead
        # lets positive and negative differences cancel out.
        distances = [sqrt(np.sum((x_train - x) ** 2))
                     for x_train in self._X_train]
        # Indices of the training samples ordered from nearest to farthest.
        nearest = np.argsort(distances)
        # Labels of the k nearest samples.
        topK_y = [self._y_train[i] for i in nearest[:self.k]]
        # Majority vote: most_common(1) yields [(label, count)].
        votes = Counter(topK_y)
        return votes.most_common(1)[0][0]

    def __repr__(self):
        return "KNN(k=%d)" % self.k
令人惊讶的是,KNN算法竟然没有所谓的模型:在根据训练数据集X_train和y_train“训练”KNN分类器时,其实并没有真正的训练(fit)过程,只是把训练数据保存了下来,真正的计算都发生在predict阶段。这样每来一个新的点,就要与全部训练样本计算一次距离;当需要预测的点很多时,我觉得有点耗费资源,看看后面的学习中能否找到更合适的机器学习方法。