Task08
This study follows the Datawhale open-source course: https://github.com/datawhalechina/machine-learning-toy-code/tree/main/ml-with-sklearn
The content below is mainly code implementations, with brief introductions to the underlying principles.
Visualization helps us analyze model performance, understand how a model works, and compare models against each other. This chapter gives visualization examples for three different machine learning tasks: regression, classification, and clustering.
First, plot a fitted linear regression line directly with matplotlib:
```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression  # import the linear regression model

def true_fun(X):
    return 1.5 * X + 0.2

np.random.seed(0)  # set the random seed
n_samples = 30     # number of sampled data points

# Training data: sorted x values, with a small amount of Gaussian noise on y
X_train = np.sort(np.random.rand(n_samples))
y_train = (true_fun(X_train) + np.random.randn(n_samples) * 0.05).reshape(n_samples, 1)

model = LinearRegression()                  # define the model
model.fit(X_train[:, np.newaxis], y_train)  # train the model

X_test = np.linspace(0, 1, 100)
plt.plot(X_test, model.predict(X_test[:, np.newaxis]), label="Model")
plt.plot(X_test, true_fun(X_test), label="True function")
plt.scatter(X_train, y_train)  # plot the training points
plt.legend(loc="best")
plt.show()
```
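Besides eyeballing the plot, we can check the fit numerically by printing the learned slope and intercept and comparing them with the true values 1.5 and 0.2. A minimal follow-up sketch (not in the original course code), assuming `model`, `X_train`, and `y_train` from the block above are still in scope:

```python
# Inspect the fitted parameters (assumes the block above has been run)
print("slope:", model.coef_[0][0])        # should be close to the true 1.5
print("intercept:", model.intercept_[0])  # should be close to the true 0.2

# R^2 score of the fit on the training data
print("R^2:", model.score(X_train[:, np.newaxis], y_train))
```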
The matplotlib.pyplot.contourf() function draws filled contours, which we can use to visualize a classifier's decision boundary:
```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm

data = np.array([
    [0.1, 0.7], [0.3, 0.6], [0.4, 0.1], [0.5, 0.4],
    [0.8, 0.04], [0.42, 0.6], [0.9, 0.4], [0.6, 0.5],
    [0.7, 0.2], [0.7, 0.67], [0.27, 0.8], [0.5, 0.72]
])
label = [1] * 6 + [0] * 6  # first 6 points are class 1, last 6 are class 0

# Build a dense grid covering the data (meshgrid generates the grid coordinates)
x_min, x_max = data[:, 0].min() - 0.2, data[:, 0].max() + 0.2
y_min, y_max = data[:, 1].min() - 0.2, data[:, 1].max() + 0.2
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.002),
                     np.arange(y_min, y_max, 0.002))

model_linear = svm.SVC(kernel='linear', C=0.001)
model_linear.fit(data, label)                            # train
Z = model_linear.predict(np.c_[xx.ravel(), yy.ravel()])  # predict on every grid point
Z = Z.reshape(xx.shape)

plt.contourf(xx, yy, Z, cmap=plt.cm.ocean, alpha=0.6)  # filled decision-boundary contours
plt.scatter(data[:6, 0], data[:6, 1], marker='o', color='r', s=100, lw=3)
plt.scatter(data[6:, 0], data[6:, 1], marker='x', color='k', s=100, lw=3)
plt.title('Linear SVM')
plt.show()
```
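The same contourf recipe works for any classifier that implements predict(). As a rough sketch (not from the original course; it reuses `data` and `label` from the block above), we can put several SVM kernels side by side to see how the decision boundary changes:

```python
# Compare decision boundaries of different SVM kernels on the same grid
# (assumes data and label from the block above are in scope)
xx, yy = np.meshgrid(np.arange(-0.1, 1.1, 0.002),
                     np.arange(-0.16, 1.0, 0.002))

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, kernel in zip(axes, ['linear', 'poly', 'rbf']):
    clf = svm.SVC(kernel=kernel, gamma='scale', C=1.0).fit(data, label)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    ax.contourf(xx, yy, Z, cmap=plt.cm.ocean, alpha=0.6)
    ax.scatter(data[:, 0], data[:, 1], c=label, cmap=plt.cm.coolwarm, s=60)
    ax.set_title(kernel)
plt.show()
```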
sklearn.tree.plot_tree() can be used to visualize a fitted decision tree:
```python
'''Classification on the iris dataset'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Load the dataset
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)

# Map the integer class labels to species names
df['Species'] = data.target
target = np.unique(data.target)
target_names = np.unique(data.target_names)
targets = dict(zip(target, target_names))
df['Species'] = df['Species'].replace(targets)

# Split features and labels
x = df.drop(columns="Species")
y = df["Species"]
feature_names = x.columns
labels = y.unique()

# Split into training and test sets
X_train, test_x, y_train, test_lab = train_test_split(x, y, test_size=0.4, random_state=42)

# Train a decision tree
model = DecisionTreeClassifier(max_depth=3, random_state=42)
model.fit(X_train, y_train)

# Visualize the decision tree
plt.figure(figsize=(30, 10), facecolor='g')
a = tree.plot_tree(model, feature_names=feature_names, class_names=labels,
                   rounded=True, filled=True, fontsize=14)
plt.show()
```
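Besides the graphical tree, sklearn.tree.export_text() renders the same tree as plain-text if/else rules, which is handy for logs or quick inspection. A small sketch, assuming `model` and `feature_names` from the block above:

```python
from sklearn.tree import export_text

# Text version of the fitted tree (feature_names must be a list of strings)
rules = export_text(model, feature_names=list(feature_names))
print(rules)
```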
Plot the cluster centroids found by k-means:
```python
'''Generate data'''
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

X, y = make_blobs(n_samples=1000,   # 1000 samples
                  n_features=2,     # 2 features per sample (2-D data)
                  centers=5,        # 5 cluster centers
                  random_state=42)

'''Cluster'''
from sklearn.cluster import KMeans

n_clusters = 5
cluster = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
y_pred = cluster.predict(X)          # cluster label for each sample
centroid = cluster.cluster_centers_  # cluster centroids
inertia = cluster.inertia_           # within-cluster sum of squared distances

'''Plot the cluster centroids'''
fig, ax = plt.subplots(1)
for i in range(n_clusters):
    ax.scatter(X[y_pred == i, 0], X[y_pred == i, 1], marker='o', s=8)
ax.scatter(centroid[:, 0], centroid[:, 1], marker='x', s=100, c='black')
plt.show()
```
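Since inertia_ always decreases as n_clusters grows, plotting it against k gives the classic "elbow" diagnostic for choosing the number of clusters. A minimal sketch (not in the original course code), assuming `X` from the block above:

```python
# Elbow plot: inertia vs. number of clusters (assumes X from the block above)
inertias = []
ks = range(1, 11)
for k in ks:
    km = KMeans(n_clusters=k, random_state=0).fit(X)
    inertias.append(km.inertia_)

plt.plot(ks, inertias, marker='o')
plt.xlabel('n_clusters')
plt.ylabel('inertia')
plt.show()
```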