# Standard library
import os

# Third-party
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
# Was `from sklearn.ensemble import *`; RandomForestClassifier is the only
# name from that module used below, so import it explicitly.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Switch to the data directory (relative path — depends on where this is run from).
os.chdir("../../数据/")
此处省略 preprocessing() 的具体实现,因为每个人的数据都不一样,而且本文的重点在于混淆矩阵,因此忽略了处理特征的过程。其大致形式如下:

def preprocessing():
    ...  # 读取数据、处理特征、标准化等
    return std_data, label
# Load the preprocessed features and labels (preprocessing() is defined above).
data, label = preprocessing()

# Hold out a test split, fit a random forest, and measure test accuracy.
x_train, x_test, y_train, y_test = train_test_split(data, label)
forest = RandomForestClassifier()
forest.fit(x_train, y_train)
score = forest.score(x_test, y_test)

score  # notebook cell: displays the mean accuracy on the test set
输出:0.7647058823529411(具体数值取决于数据和随机划分,每次运行可能不同)
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix: rows are true labels, columns are predicted labels.
y_true = y_test
y_pred = forest.predict(x_test)
cm = confusion_matrix(y_true, y_pred)

# First heatmap: raw matrix — the axes are just integer indices.
sns.heatmap(cm, cmap="YlGnBu_r", fmt="d", annot=True)

# Second heatmap: wrap the matrix in a DataFrame so the axes show class names.
cm = pd.DataFrame(cm, columns=["cat", "dog", "lion"], index=["cat", "dog", "lion"])
sns.heatmap(cm, cmap="YlGnBu_r", fmt="d", annot=True)
从这个图来看,我们可以发现在模型误判的样本中,16 和 8 是较为突出的两组数。
这就是说,有 8 个样本的真实标签是 dog 的却被误判成了 lion,而有 16 个 lion 的样本被误判成了 dog。
而 cat 被误判成 dog 和 lion 的样本数量是很小的,因此我们知道应该在 lion 和 dog 的判断上进行更多工作。