sklearn.preprocessing.LabelEncoder
pd.get_dummies(data)
import pandas as pd

# Load the car-price dataset and one-hot encode it with pandas.
data = pd.read_csv("data/car_price.csv")
one_matrix = pd.get_dummies(data)
# Bare expression: displays the encoded frame when run as a notebook cell.
one_matrix
sns.pairplot(data=data)
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the pairwise-relationship grid for the loaded dataset.
# The original snippet passed `df`, which is not defined anywhere in these
# notes; the frame loaded above is named `data`.
sns.pairplot(data=data)
# Write the figure to disk, then display it.
plt.savefig("pairplot.svg")
plt.show()
train_corr=x_train.corr()
# Drop columns that are irrelevant for the analysis.
x_train = data_price.drop(columns=['car_ID', 'symboling', 'CarName'])
# Pairwise correlation matrix of the remaining columns.
# NOTE(review): if `data_price` still holds string columns, recent pandas
# versions may require `numeric_only=True` here — confirm against the data.
train_corr = x_train.corr()
(kmeans.inertia_)
# Within-cluster sum of squared errors (exposed by KMeans as `inertia_`).
# Search for the best number of clusters: fit KMeans for k = 1..10 and
# plot the inertia of each fit against k.
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

cluster_range = range(1, 11)
score = []
for n_cluster in cluster_range:
    kmeans = KMeans(n_clusters=n_cluster)
    kmeans.fit(x_train)
    score.append(kmeans.inertia_)

plt.plot(cluster_range, score)
# The original had `plt.show` without parentheses, which only references the
# function and never displays the figure.
plt.show()
sklearn.metrics.silhouette_score
专门做的笔记传送门
from sklearn.metrics import silhouette_score

# Silhouette score of the fitted model's cluster labels on `x`, using
# Euclidean distance.
sc_score = silhouette_score(x, kmeans_model.labels_, metric="euclidean")
sklearn.cluster.AgglomerativeClustering
# Hierarchical (agglomerative) clustering.
from scipy.cluster.hierarchy import dendrogram, ward
from sklearn.cluster import KMeans, AgglomerativeClustering
import matplotlib.pyplot as plt

# NOTE(review): this snippet reads `train_x`, while other snippets in these
# notes use `x_train` — confirm which name holds the training data.
model = AgglomerativeClustering(linkage='ward', n_clusters=3)
y = model.fit_predict(train_x)
print(y)

# Build the Ward linkage matrix for the same data and draw its dendrogram.
linkage_matrix = ward(train_x)
dendrogram(linkage_matrix)
plt.show()
传送门
from scipy.spatial.distance import pdist from scipy.cluster.hierarchy import linkage, dendrogram import matplotlib.pyplot as plt %matplotlib inline row_clusters = linkage(pdist(data_copy, metric='euclidean'), method='ward') fig = plt.figure(figsize=(12,10)) # 参数p和参数truncate_mode用来将谱系图截断,部分结点的子树被剪枝,横轴显示的是该结点包含的样本数 row_dendr = dendrogram( row_clusters, p=data_copy.shape[0], truncate_mode='lastp', color_threshold=7, )
GardenLu的实战传送门