接下来,从sklearn库中依次导入三个分类器模块:
import pandas as pd import numpy as np from sklearn.impute import SimpleImputer # 导入预处理模块Imputer from sklearn.model_selection import train_test_split # 导入自动生成训练集和测试集的模块train_test_split from sklearn.metrics import classification_report # 导入预测结果评估模块classification_report # 导入分类器 from sklearn.neighbors import KNeighborsClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.naive_bayes import GaussianNB
def load_datasets(feature_paths, label_paths):
    """Read feature and label files and return them as NumPy arrays.

    Each feature file is parsed as comma-separated text without a header
    row, with ``?`` marking a missing value. Missing values are replaced by
    the mean of their column, computed separately for each file. Each label
    file is parsed without a header row and must hold a single column.

    Args:
        feature_paths: Iterable of paths to feature files; every file must
            yield 41 columns.
        label_paths: Iterable of paths to label files; every file must
            yield 1 column.

    Returns:
        A ``(feature, label)`` tuple. ``feature`` is a 2-D array with 41
        columns holding the rows of all feature files in the given order;
        ``label`` is the 1-D array of all labels in the given order.
    """
    # Seed both lists with empty arrays of the expected width so that the
    # result has the right shape even when no paths are given, and so that a
    # file with the wrong number of columns fails at concatenation.
    feature_parts = [np.empty((0, 41))]
    label_parts = [np.empty((0, 1))]

    for path in feature_paths:
        df = pd.read_table(path, delimiter=',', na_values='?', header=None)
        # pandas has already turned '?' into np.nan, so np.nan (not the
        # string 'NaN') is the marker to impute. SimpleImputer always works
        # column-wise and does not accept the old Imputer `axis` argument.
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        feature_parts.append(imputer.fit_transform(df))

    for path in label_paths:
        label_parts.append(pd.read_table(path, header=None).to_numpy())

    # Concatenate once at the end instead of re-copying the growing array
    # on every file.
    feature = np.concatenate(feature_parts)
    # Flatten the (n, 1) label column into a 1-D vector.
    label = np.ravel(np.concatenate(label_parts))
    return feature, label
if __name__ == '__main__':
    # Data paths: one feature file and one label file per data set A-E.
    featurePaths = ['A/A.feature', 'B/B.feature', 'C/C.feature',
                    'D/D.feature', 'E/E.feature']
    labelPaths = ['A/A.label', 'B/B.label', 'C/C.label',
                  'D/D.label', 'E/E.label']

    # The first four data sets are used for training, the last for testing.
    x_train, y_train = load_datasets(featurePaths[:4], labelPaths[:4])
    x_test, y_test = load_datasets(featurePaths[4:], labelPaths[4:])

    # Shuffle the training samples before fitting. This used to be done with
    # train_test_split(..., test_size=0.0), but current scikit-learn rejects
    # a float test_size outside the open interval (0, 1). Indexing features
    # and labels with one shared random permutation has the same effect.
    order = np.random.permutation(len(x_train))
    x_train, y_train = x_train[order], y_train[order]

    # (display name, estimator) pairs, in the order they are trained and
    # reported.
    classifiers = [
        ('knn', KNeighborsClassifier()),
        ('DT', DecisionTreeClassifier()),
        ('Bayes', GaussianNB()),
    ]

    # Train every classifier and predict on the test set first; the reports
    # are printed afterwards so the progress log stays together.
    predictions = []
    for name, classifier in classifiers:
        print('Start training ' + name)
        classifier.fit(x_train, y_train)
        print('Training done')
        predictions.append((name, classifier.predict(x_test)))
        print('Prediction done')

    for name, answer in predictions:
        print('\n\nThe classification report for ' + name + ':')
        print(classification_report(y_test, answer))
Start training knn Training done Prediction done Start training DT Training done Prediction done Start training Bayes Training done Prediction done The classification report for knn: precision recall f1-score support 0.0 0.56 0.60 0.58 102341 1.0 0.92 0.93 0.93 23699 2.0 0.94 0.78 0.85 26864 3.0 0.83 0.82 0.82 22132 4.0 0.85 0.88 0.87 32033 5.0 0.39 0.21 0.27 24646 6.0 0.77 0.89 0.82 24577 7.0 0.80 0.95 0.87 26271 12.0 0.32 0.33 0.33 14281 13.0 0.16 0.22 0.19 12727 16.0 0.90 0.67 0.77 24445 17.0 0.89 0.96 0.92 33034 24.0 0.00 0.00 0.00 7733 avg / total 0.69 0.69 0.68 374783 The classification report for DT: precision recall f1-score support 0.0 0.48 0.73 0.58 102341 1.0 0.66 0.96 0.78 23699 2.0 0.84 0.86 0.85 26864 3.0 0.93 0.72 0.81 22132 4.0 0.23 0.16 0.19 32033 5.0 0.62 0.52 0.57 24646 6.0 0.76 0.57 0.65 24577 7.0 0.32 0.15 0.20 26271 12.0 0.60 0.67 0.63 14281 13.0 0.67 0.47 0.56 12727 16.0 0.57 0.07 0.13 24445 17.0 0.84 0.85 0.85 33034 24.0 0.38 0.29 0.33 7733 avg / total 0.59 0.59 0.56 374783 The classification report for Bayes: precision recall f1-score support 0.0 0.62 0.81 0.70 102341 1.0 0.97 0.91 0.94 23699 2.0 1.00 0.65 0.79 26864 3.0 0.60 0.66 0.63 22132 4.0 0.91 0.77 0.83 32033 5.0 1.00 0.00 0.00 24646 6.0 0.87 0.72 0.79 24577 7.0 0.31 0.47 0.37 26271 12.0 0.52 0.59 0.55 14281 13.0 0.61 0.50 0.55 12727 16.0 0.89 0.72 0.79 24445 17.0 0.75 0.91 0.82 33034 24.0 0.59 0.24 0.34 7733 avg / total 0.74 0.68 0.67 374783
结果对比:
结论:
一些视频教学里的错误:
1、明明from sklearn.cross_validation import train_test_split在一台电脑上运行没有任何问题,但是换了一台电脑就出现如下图所示的问题,显示没有该模块。
解决方法:
将from sklearn.cross_validation import train_test_split替换为from sklearn.model_selection import train_test_split
2、更新sklearn库之后,from sklearn.preprocessing import Imputer出现报错:原来用于处理缺失值的Imputer不见了。按理说这么重要的功能不大可能被直接删除,查阅文档后发现,从0.20版开始它被移到了单独的sklearn.impute模块中并更名为SimpleImputer,旧的Imputer则在0.22版中被正式移除。
解决方法:
将代码修改为from sklearn.impute import SimpleImputer
最后,可能会出现axis的错误:
把imp = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)改成imp = SimpleImputer(missing_values=np.nan, strategy='mean')。原因是SimpleImputer没有axis参数(始终按列填补),并且缺失值标记应写成np.nan,而不是字符串'NaN'。