K-means algorithm:
# -*- coding: utf-8 -*-
# from sklearn.cluster import KMeans
# km = KMeans(n_clusters, init, n_init, max_iter, tol, precompute_distances,
#             verbose, random_state, copy_x, n_jobs, algorithm)
# This code uses the Calinski-Harabasz index to evaluate the clustering:
#     s(k) = (tr(Bk) * (m - k)) / (tr(Wk) * (k - 1))
# where m is the number of training samples and k is the number of clusters;
# Bk is the between-cluster covariance matrix, Wk is the within-cluster
# covariance matrix, and tr is the trace of a matrix.
# The smaller the within-cluster covariance and the larger the between-cluster
# covariance, the higher the Calinski-Harabasz score.

# Evaluation
from sklearn.metrics import calinski_harabasz_score
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Font settings so CJK plot labels render correctly
plt.rcParams['font.sans-serif'] = ['SimSun']  # default font
plt.rcParams['axes.unicode_minus'] = False    # keep '-' from rendering as a box in saved figures

# Generate a 2-D dataset with 1000 samples
# centers = [[-2, -2], [0, 0], [2, 2], [3, 3]]
# centers = [[-3, 2], [0, 0], [1, 7], [5, 2]]
centers = [[-1, -1], [0, 0], [1, 1], [2, 2]]
cluster_std = [0.8, 0.4, 0.4, 0.4]
# cluster_std = [0.1, 0.05, 0.05, 0.05]
# cluster_std = [0.8, 0.4, 0.4, 0.4]
X, y = make_blobs(n_samples=1000, n_features=2, centers=centers,
                  cluster_std=cluster_std, random_state=666)

# Cluster with K-Means for k = {2, 3, 4, 5, 6, 7}
for index, k in enumerate((2, 3, 4, 5, 6, 7)):
    km = KMeans(n_clusters=k, random_state=666)
    y_pre = km.fit_predict(X)  # fit the model on X and predict labels for X
    plt.subplot(2, 3, index + 1)
    # c=y_pre colors each sample by its assigned cluster;
    # X[:, 0] takes feature 0 of every sample
    plt.scatter(X[:, 0], X[:, 1], c=y_pre)
    plt.xticks([])
    plt.yticks([])
    plt.title('k={},score={:.0f}'.format(
        k, calinski_harabasz_score(X, y_pre)))
plt.show()
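To make the formula in the comments concrete, here is a minimal NumPy-only sketch of the Calinski-Harabasz score. The helper name ch_score is hypothetical (not part of scikit-learn); under the definitions above it should agree with calinski_harabasz_score(X, y_pre).

import numpy as np

def ch_score(X, labels):
    # Hypothetical helper: computes s(k) = (tr(Bk) * (m - k)) / (tr(Wk) * (k - 1))
    m = len(X)
    clusters = np.unique(labels)
    k = len(clusters)
    overall_mean = X.mean(axis=0)
    tr_Bk = 0.0  # between-cluster dispersion: sum of n_q * ||c_q - c||^2
    tr_Wk = 0.0  # within-cluster dispersion:  sum of ||x - c_q||^2
    for q in clusters:
        members = X[labels == q]
        center = members.mean(axis=0)
        tr_Bk += len(members) * np.sum((center - overall_mean) ** 2)
        tr_Wk += np.sum((members - center) ** 2)
    return (tr_Bk * (m - k)) / (tr_Wk * (k - 1))

# Sanity check against scikit-learn (assumed usage):
# assert np.isclose(ch_score(X, y_pre), calinski_harabasz_score(X, y_pre))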
Spectral clustering algorithm:
# -*- coding: utf-8 -*-
# This code uses the Calinski-Harabasz index to evaluate the clustering:
#     s(k) = (tr(Bk) * (m - k)) / (tr(Wk) * (k - 1))
# where m is the number of training samples and k is the number of clusters;
# Bk is the between-cluster covariance matrix, Wk is the within-cluster
# covariance matrix, and tr is the trace of a matrix.
# The smaller the within-cluster covariance and the larger the between-cluster
# covariance, the higher the Calinski-Harabasz score.
from sklearn.metrics import calinski_harabasz_score
from sklearn.cluster import SpectralClustering
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import time

# Font settings so CJK plot labels render correctly
plt.rcParams['font.sans-serif'] = ['SimSun']  # default font
plt.rcParams['axes.unicode_minus'] = False    # keep '-' from rendering as a box in saved figures

# Generate a 6-D dataset with 1000 samples in 5 clusters
X, y = make_blobs(n_samples=1000, n_features=6, centers=5,
                  cluster_std=[1.4, 0.4, 0.3, 0.3, 0.4], random_state=666)
plt.subplot(121)
plt.scatter(X[:, 0], X[:, 1], c='k')
plt.xticks([])
plt.yticks([])
plt.title('Original distribution of the samples')

# Spectral clustering with a Gaussian (RBF) kernel:
# grid-search over n_clusters and gamma
start = time.process_time()
best_score, best_k, best_gamma = 0, 0, 0
for gamma in (0.01, 0.1, 1.5):
    for k in (3, 4, 5, 6):
        y_pre = SpectralClustering(n_clusters=k, gamma=gamma).fit_predict(X)
        score = calinski_harabasz_score(X, y_pre)
        print('score={:.3f}, k={}, gamma={:.4f}'.format(score, k, gamma))
        if score > best_score:
            best_score = score
            best_gamma = gamma
            best_k = k
print('best_score={:.3f}, best_k={}, best_gamma={:.4f}'.format(
    best_score, best_k, best_gamma))
end = time.process_time()
real_time = end - start
print('spending time: {:.3f}s'.format(real_time))

# Re-fit with the best parameters and plot the result
y_pre = SpectralClustering(n_clusters=best_k, gamma=best_gamma).fit_predict(X)
plt.subplot(122)
plt.scatter(X[:, 0], X[:, 1], c=y_pre)
plt.xticks([])
plt.yticks([])
plt.title('k={},score={:.3f}'.format(best_k, best_score))
plt.show()
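The gamma being tuned above only has meaning through the Gaussian (RBF) affinity matrix that SpectralClustering builds internally when affinity='rbf' (its default). Below is a minimal sketch of that construction; rbf_affinity is a hypothetical helper shown only to make the role of gamma concrete, and feeding its output back through affinity='precomputed' should give the same clustering up to solver randomness.

import numpy as np
from sklearn.cluster import SpectralClustering

def rbf_affinity(X, gamma):
    # Hypothetical helper: pairwise squared Euclidean distances, then
    # A[i, j] = exp(-gamma * ||x_i - x_j||^2). A larger gamma makes the
    # affinity fall off faster with distance, so only very close samples
    # remain strongly connected in the similarity graph.
    sq_dists = np.sum((X[:, None, :] - X[None, :, :]) ** 2, axis=-1)
    return np.exp(-gamma * sq_dists)

# Assumed equivalent of SpectralClustering(n_clusters=k, gamma=gamma):
# labels = SpectralClustering(n_clusters=k, affinity='precomputed') \
#              .fit_predict(rbf_affinity(X, gamma))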