以数据的前两项特征对数据进行划分得到以下散点图:
import numpy as np def read_data(path): lines=path.readlines() data=[] label=[] for line in lines: line=line.split() data.append(list(map(float,line[0:3]))) if line[-1]=='didntLike': label.append(3) elif line[-1]=='largeDoses': label.append(2) else: label.append(1) return np.array(data),np.array(label) def normalized(data): mindata=data.min(0) def standdata(traindata): meandata0 = np.mean(traindata,axis=0) stddata0 = np.std(traindata,axis=0) length = traindata.shape[0] meandata1 = np.tile(meandata0,(length,1)) stddata1 = np.tile(stddata0,(length,1)) standdata = (traindata-meandata1)/stddata1 return standdata, meandata0, stddata0 def autoNorm(x): """ 最大值最小值归一化 :param x: 需要归一化的特征向量 :return: 新的数组、极差、最小值 """ minVals=x.min(axis=0) maxVals=x.max(axis=0) ranges=maxVals-minVals x_new=(x-minVals)/ranges # 广播 return x_new,ranges,minVals def knn(traindata,testdata,label,k): distance=np.sqrt(np.sum((traindata-testdata)**2,axis=1)) p=distance.argsort() vote = [0, 0, 0] for i in range(k): vote[label[p[i]]-1]=vote[label[p[i]]-1]+1 return vote.index(max(vote))+1 def testknn(data,label,k): # per=np.random.permutation(np.shape(data)[0]) # new_data=data[per,:] # new_label=label[per] train_data=data[0:int(np.shape(data)[0]*0.9)] train_label=label[0:int(np.shape(data)[0]*0.9)] test_data=data[int(np.shape(data)[0]*0.9):] test_label = label[int(np.shape(data)[0] * 0.9):] true_label=0 for i in range(len(test_label)): result_a=knn(train_data,test_data[i],train_label,k) if result_a==test_label[i]: true_label=true_label+1 print(result_a,test_label[i]) acc=float(true_label)/len(test_data) return acc if __name__ == '__main__': path='Knn_Helen' true_label=["smallDoses",'largeDoses','didntLike'] file=open(path,'r') print('=======') data,label=read_data(file) a,b,c=autoNorm(data) acc=testknn(a,label,25) print(acc)
在上述设置下(k=25,按文件顺序取前 90% 的样本作为训练集、后 10% 作为测试集),准确率能达到 95% 以上。