```python
from sklearn.datasets import load_iris, load_wine
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
```
```python
X, y = load_wine(as_frame=True, return_X_y=True)
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.3)
```
Note that the `as_frame` argument is only available in newer versions of sklearn (it was added in 0.23).
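If you might be on an older sklearn without `as_frame`, a minimal fallback sketch (not part of the original post) is to build the DataFrame by hand from the returned Bunch:

```python
# Fallback sketch for sklearn versions without as_frame:
try:
    X, y = load_wine(as_frame=True, return_X_y=True)
except TypeError:  # as_frame not supported in this sklearn version
    bunch = load_wine()
    X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    y = pd.Series(bunch.target, name="target")
```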
```python
clf = tree.DecisionTreeClassifier(criterion='entropy',
                                  random_state=30,
                                  splitter='random',
                                  max_depth=3)
clf = clf.fit(train_X, train_y)
clf.score(valid_X, valid_y)
```
Here `score` returns the accuracy, the same value as `accuracy_score`, computed as:

\[
\mathrm{ACC} = \frac{TP + TN}{TP + TN + FP + FN}
\]

acc: 0.9444444444444444
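As a quick sanity check (a sketch, not in the original post), the accuracy can be computed by hand from the confusion matrix; for multiclass data like wine it is the trace (correct predictions) over the total:

```python
# Sketch: verify clf.score against accuracy computed from the confusion matrix.
from sklearn.metrics import confusion_matrix

pred_y = clf.predict(valid_X)
cm = confusion_matrix(valid_y, pred_y)
acc = np.trace(cm) / cm.sum()  # correct predictions / all predictions
assert np.isclose(acc, clf.score(valid_X, valid_y))
```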
That looks decent; let's add a cross-validation check.
```python
from sklearn.model_selection import cross_val_score
cross_val_score(clf, X, y, cv=5).mean()
```
acc: 0.9273015873015874
Good enough.
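To see how stable the model is across folds, it helps to inspect the individual scores rather than only the mean (a small sketch, not in the original):

```python
scores = cross_val_score(clf, X, y, cv=5)
print(scores)                       # per-fold accuracies
print(scores.mean(), scores.std())  # mean and spread
```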
```python
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
reg = tree.DecisionTreeRegressor(max_depth=5, random_state=30)
cross_val_score(reg, data, target, scoring='neg_mean_squared_error', cv=5)
```
By default, sklearn's regressors use \(R^2\) as the scoring metric:

\[
R^{2} = 1 - \frac{u}{v}, \qquad
u = \sum_{i=1}^{N}\left(f_{i} - y_{i}\right)^{2}, \quad
v = \sum_{i=1}^{N}\left(y_{i} - \bar{y}\right)^{2}
\]

where \(f_i\) is the predicted value, \(y_i\) the true value, and \(\bar{y}\) the mean of the true values. The closer \(R^2\) is to 1, the better the model fits.
You can see that the model does well on some cross-validation folds and noticeably worse on others.
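As a sketch (assuming a single train/validation split of the Boston data, not part of the original post), \(R^2\) can be computed by hand and checked against `reg.score`:

```python
# Sketch: compute R^2 manually and compare with reg.score.
tr_X, va_X, tr_y, va_y = train_test_split(data, target, random_state=30)
reg = tree.DecisionTreeRegressor(max_depth=5, random_state=30).fit(tr_X, tr_y)
f = reg.predict(va_X)
u = ((f - va_y) ** 2).sum()            # residual sum of squares
v = ((va_y - va_y.mean()) ** 2).sum()  # total sum of squares
print(1 - u / v, reg.score(va_X, va_y))  # the two values should match
```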
```python
import graphviz

dot_data = tree.export_graphviz(clf, out_file=None, filled=True)
graph = graphviz.Source(dot_data)
graph
```
The purer a node, the darker its fill color.
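The plot is easier to read with feature and class names attached; a sketch, assuming `clf` is still the wine classifier and `X` the wine DataFrame from above:

```python
# Sketch: re-export the tree with readable labels.
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=list(X.columns),
                                class_names=list(load_wine().target_names),
                                filled=True, rounded=True)
graphviz.Source(dot_data)
```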
```python
test = []
for depth in range(1, 11):
    clf = tree.DecisionTreeClassifier(criterion='entropy',
                                      random_state=30,
                                      splitter='random',
                                      max_depth=depth)
    clf = clf.fit(train_X, train_y)
    test.append(clf.score(valid_X, valid_y))
plt.plot(range(1, 11), test)  # validation accuracy vs. max_depth
```
```python
all_data = pd.read_csv('./data.csv')
all_data.info()
```
The 'Ticket', 'Cabin', and 'Name' columns not only have many missing values but also offer little predictive value, so we can drop all three.
```python
all_data.drop(['Ticket', 'Cabin', 'Name'], inplace=True, axis=1)
all_data.Age.fillna(all_data.Age.mean(), inplace=True)  # fill missing ages with the mean
all_data.info()
```
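One caveat (an aside, not from the original post): newer pandas versions (2.1+) warn about in-place `fillna` on a single column; the assignment form works everywhere:

```python
# Assignment form of the same fill, safe under newer pandas:
all_data['Age'] = all_data['Age'].fillna(all_data['Age'].mean())
```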
The remaining categorical (object) columns are converted straight to one-hot encodings:
```python
all_data = pd.get_dummies(all_data)
```
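For example, a two-valued column like `Sex` becomes two indicator columns. A quick check (a sketch, assuming the usual Titanic schema with `Sex` and `Embarked` columns):

```python
# Sketch: list the freshly created indicator columns.
print([c for c in all_data.columns if c.startswith(('Sex', 'Embarked'))])
# expected something like: ['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
```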
```python
x = all_data.loc[:, all_data.columns != 'Survived']
y = all_data['Survived']  # keep y one-dimensional to avoid a shape warning in fit
train_X, valid_X, train_y, valid_y = train_test_split(x, y, test_size=0.3)
clf = tree.DecisionTreeClassifier(criterion='entropy',
                                  random_state=30,
                                  max_depth=4,
                                  min_samples_leaf=5,
                                  min_samples_split=5)
clf.fit(train_X, train_y)
clf.score(valid_X, valid_y)
```
acc: 0.8246268656716418
Cross-validation:
```python
cross_val_score(clf, x, y, cv=5).mean()
```
acc: 0.7991337643587973
```python
from sklearn.model_selection import GridSearchCV

gini_thresholds = np.linspace(0, 0.5, 20)
parameters = {'splitter': ('best', 'random'),
              'criterion': ("gini", "entropy"),
              'max_depth': [*range(1, 10)],
              'min_samples_leaf': [*range(1, 50, 5)],
              'min_impurity_decrease': [*gini_thresholds]}
clf = tree.DecisionTreeClassifier(random_state=25)
GS = GridSearchCV(clf, parameters, cv=10)
GS.fit(train_X, train_y)
print(GS.best_params_)
print(GS.best_score_)
```
{'criterion': 'entropy', 'max_depth': 3, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 16, 'splitter': 'best'}
acc: 0.8186123911930363
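Since `GridSearchCV` refits the best configuration on the full training data by default (`refit=True`), the tuned model can be evaluated directly; a short usage sketch:

```python
# The refit best model is available as GS.best_estimator_.
best_clf = GS.best_estimator_
print(best_clf.score(valid_X, valid_y))  # held-out accuracy of the tuned tree
```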