import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as ds
import matplotlib.colors
from sklearn.cluster import KMeans, MiniBatchKMeans


def expand(a, b):
    """Return the interval (a, b) widened by 10% of its width on each side."""
    d = (b - a) * 0.1
    return a - d, b + d


def _axis_limits(points):
    """Return padded ((x_lo, x_hi), (y_lo, y_hi)) limits for 2-column points."""
    x1_min, x2_min = np.min(points, axis=0)
    x1_max, x2_max = np.max(points, axis=0)
    return expand(x1_min, x1_max), expand(x2_min, x2_max)


def _scatter_panel(index, title, points, labels, cmap, limits):
    """Draw one labelled scatter plot in cell `index` of the 4x2 subplot grid."""
    plt.subplot(4, 2, index)
    plt.title(title)
    plt.scatter(points[:, 0], points[:, 1], c=labels, s=30, cmap=cmap,
                edgecolors='none')
    x_limits, y_limits = limits
    plt.xlim(x_limits)
    plt.ylim(y_limits)
    plt.grid(True)


if __name__ == '__main__':
    N = 400        # number of samples to generate
    centers = 4    # number of blobs (clusters) to generate

    # Two-feature samples spread over `centers` blobs. `y` holds the blob
    # index of each sample, known because the data is synthetic.
    data, y = ds.make_blobs(N, n_features=2, centers=centers, random_state=2)
    # Same setup, but with a different standard deviation for each blob.
    data2, y2 = ds.make_blobs(N, n_features=2, centers=centers,
                              cluster_std=(1, 2.5, 0.5, 2), random_state=2)

    # Unbalanced subset of `data`: all of blob 0, then the first 50 / 20 / 5
    # samples of blobs 1 / 2 / 3.  The labels are sliced from `y` with the
    # same masks so they always line up with the rows of `data3`, instead of
    # relying on blob 0 having exactly 100 samples.
    subset_sizes = ((0, None), (1, 50), (2, 20), (3, 5))
    data3 = np.vstack([data[y == label][:count]
                       for label, count in subset_sizes])
    y3 = np.concatenate([y[y == label][:count]
                         for label, count in subset_sizes])

    # One KMeans estimator, re-fitted on each data set in turn.
    cls = KMeans(n_clusters=4, init='k-means++')
    y_hat = cls.fit_predict(data)
    y2_hat = cls.fit_predict(data2)
    y3_hat = cls.fit_predict(data3)

    # Apply a linear transformation to `data`.  The plot titles call this
    # "rotated" data, but the matrix is not orthogonal, so the blobs are
    # also stretched.
    m = np.array(((1, 1), (1, 3)))
    data_r = data.dot(m)
    y_r_hat = cls.fit_predict(data_r)

    # Use a font that can render the Chinese titles, and keep the minus sign
    # displayable with that font.
    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    cm = matplotlib.colors.ListedColormap(list('rgbm'))

    plt.figure(figsize=(9, 10), facecolor='w')

    # Each row: (points, true labels, predicted labels, left title, right title).
    rows = (
        (data, y, y_hat, u'原始数据', u'KMeans++聚类'),
        (data_r, y, y_r_hat, u'旋转后数据', u'旋转后KMeans++聚类'),
        (data2, y2, y2_hat, u'方差不相等数据', u'方差不相等KMeans++聚类'),
        (data3, y3, y3_hat, u'数量不相等数据', u'数量不相等KMeans++聚类'),
    )
    for row, (points, truth, predicted, truth_title, predicted_title) in enumerate(rows):
        # Both panels of a row share the same padded axis limits.
        limits = _axis_limits(points)
        _scatter_panel(2 * row + 1, truth_title, points, truth, cm, limits)
        _scatter_panel(2 * row + 2, predicted_title, points, predicted, cm, limits)

    # `pad` must be passed by keyword: current matplotlib releases reject a
    # positional padding argument.  `rect` keeps the top 3% of the figure
    # out of the layout; the suptitle is added afterwards
    # (see https://github.com/matplotlib/matplotlib/issues/829).
    plt.tight_layout(pad=2, rect=(0, 0, 1, 0.97))
    plt.suptitle(u'数据分布对KMeans聚类的影响', fontsize=18)
    plt.show()
# !/usr/bin/python
# -*- coding:utf-8 -*-

from sklearn import metrics


if __name__ == "__main__":
    # Case 1: print homogeneity and completeness, then the V-measure twice:
    # once computed by hand as the harmonic mean of the two scores, once by
    # scikit-learn.
    truth = [0, 0, 0, 1, 1, 1]
    predicted = [0, 0, 1, 1, 2, 2]
    homogeneity = metrics.homogeneity_score(truth, predicted)
    completeness = metrics.completeness_score(truth, predicted)
    print(u'同一性(Homogeneity):', homogeneity)
    print(u'完整性(Completeness):', completeness)
    manual_v = 2 * completeness * homogeneity / (completeness + homogeneity)
    library_v = metrics.v_measure_score(truth, predicted)
    print(u'V-Measure:', manual_v, library_v)

    # Cases 2 and 3: the same three scores for two more labelings.  The last
    # pair uses swapped label values (the predicted label ids differ from the
    # true ones).
    labelings = (
        ([0, 0, 0, 1, 1, 1], [0, 0, 1, 3, 3, 3]),
        ([0, 0, 0, 1, 1, 1], [1, 1, 1, 0, 0, 0]),
    )
    for truth, predicted in labelings:
        scores = (
            (u'同一性(Homogeneity):', metrics.homogeneity_score(truth, predicted)),
            (u'完整性(Completeness):', metrics.completeness_score(truth, predicted)),
            (u'V-Measure:', metrics.v_measure_score(truth, predicted)),
        )
        for caption, value in scores:
            print(caption, value)

    # Adjusted Rand index for two further labelings.
    ari_inputs = (
        ([0, 0, 1, 1], [0, 1, 0, 1]),
        ([0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2]),
    )
    for truth, predicted in ari_inputs:
        print(metrics.adjusted_rand_score(truth, predicted))
结果
同一性(Homogeneity): 0.6666666666666669 完整性(Completeness): 0.420619835714305 V-Measure: 0.5158037429793889 0.5158037429793889 同一性(Homogeneity): 1.0 完整性(Completeness): 0.6853314789615865 V-Measure: 0.8132898335036762 同一性(Homogeneity): 1.0 完整性(Completeness): 1.0 V-Measure: 1.0 -0.5 0.24242424242424243 Process finished with exit code 0
• DBSCAN(Density-Based Spatial Clustering of Applications with Noise)
• 一个基于密度聚类的算法,与层次聚类不同,它将簇定义为密度相连的点的最大集合,能够把具有高密度的区域划分为簇,并可有效地对抗噪声
问:将簇定义为密度相连的点的最大集合?
概念一:直接密度可达:
• 对象的e邻域:给定对象在半径e内的区域;
• 核心对象:给定一个数目m,如果对象的e邻域中有至少m个对象,该对象为核心对象
• 直接密度可达:给定一个对象集合D,如果p在q的e邻域内,而q是一个核心对象,则p从q出发是直接密度可达的
• 密度可达:如果存在一个对象链 $p_1, p_2, \dots, p_n$,令 $p_1 = q$,$p_n = p$,且对每个 $i$($1 \le i < n$),$p_{i+1}$ 是从 $p_i$ 关于e和m直接密度可达的,则对象p是从对象q关于e和m密度可达的。
• 密度相连:如果集合D中存在一个对象o,使得p和q都是从o关于e和m密度可达的(o->p 密度可达,o->q 密度可达),那么p和q就是关于e和m密度相连的
# !/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as ds
import matplotlib.colors
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler


def expand(a, b):
    """Return the interval (a, b) widened by 10% of its width on each side."""
    d = (b - a) * 0.1
    return a - d, b + d


if __name__ == "__main__":
    N = 1000
    centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]
    data, y = ds.make_blobs(N, n_features=2, centers=centers,
                            cluster_std=[0.5, 0.25, 0.7, 0.5], random_state=0)
    data = StandardScaler().fit_transform(data)

    # DBSCAN settings to compare, as (epsilon, min_samples) pairs:
    #   epsilon     - neighbourhood radius
    #   min_samples - minimum number of samples inside that radius for a
    #                 point to count as a core object
    params = ((0.2, 5), (0.2, 10), (0.2, 15), (0.3, 5), (0.3, 10), (0.3, 15))

    # Use a font that can render the Chinese titles, and keep the minus sign
    # displayable with that font.
    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    plt.figure(figsize=(12, 8), facecolor='w')
    plt.suptitle(u'DBSCAN聚类', fontsize=20)

    # Every panel shows the same data, so the padded axis limits are
    # computed once instead of once per parameter pair.
    x1_min, x2_min = np.min(data, axis=0)
    x1_max, x2_max = np.max(data, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)

    for i, (eps, min_samples) in enumerate(params):
        model = DBSCAN(eps=eps, min_samples=min_samples)
        model.fit(data)
        y_hat = model.labels_

        # Boolean mask marking the samples DBSCAN reported as core samples.
        core_indices = np.zeros_like(y_hat, dtype=bool)
        core_indices[model.core_sample_indices_] = True

        # The label -1 is drawn as noise below, so it is not counted as a
        # cluster.
        y_unique = np.unique(y_hat)
        n_clusters = y_unique.size - (1 if -1 in y_hat else 0)
        print(y_unique, '聚类簇的个数为:', n_clusters)

        plt.subplot(2, 3, i + 1)
        clrs = plt.cm.Spectral(np.linspace(0, 0.8, y_unique.size))
        print(clrs)
        for k, clr in zip(y_unique, clrs):
            cur = (y_hat == k)
            if k == -1:
                # Noise points: small black markers.
                plt.scatter(data[cur, 0], data[cur, 1], s=20, c='k')
                continue
            # The colour is passed as a single-row 2-D sequence.  A bare
            # 1-D RGBA array is ambiguous: for a cluster of exactly four
            # points matplotlib would read it as four values to colormap.
            plt.scatter(data[cur, 0], data[cur, 1], s=30, c=[clr],
                        edgecolors='k')
            # Core samples of this cluster are drawn again, larger.
            core_points = data[cur & core_indices]
            plt.scatter(core_points[:, 0], core_points[:, 1], s=60, c=[clr],
                        marker='o', edgecolors='k')
        plt.xlim((x1_min, x1_max))
        plt.ylim((x2_min, x2_max))
        plt.grid(True)
        plt.title(u'epsilon = %.1f m = %d,聚类数目:%d' % (eps, min_samples, n_clusters),
                  fontsize=16)

    plt.tight_layout()
    # Leave room at the top of the figure for the suptitle.
    plt.subplots_adjust(top=0.9)
    plt.show()