提示:本文不调用 sklearn 等机器学习库,仅使用 numpy 和 pandas 实现层次聚类(Hierarchical Clustering)算法。
代码如下(示例):
# -*- coding: utf-8 -*-
"""Agglomerative hierarchical clustering with average linkage.

The clustering itself only needs NumPy; pandas is used by the demo script at
the bottom of the file to load a CSV.
"""
import pandas as pd
import numpy as np


class HC(object):
    """Bottom-up hierarchical clustering using average linkage.

    Every sample starts in its own cluster.  On each step the two clusters
    with the smallest average pairwise Euclidean distance are merged, until
    ``categorynums`` clusters remain.

    Args:
        data: Sequence of samples.  The feature vector of a sample is its
            LAST element (``sample[-1]``); any preceding elements are ignored
            by the algorithm.  All feature vectors must have the same length.
        categorynums: Number of clusters at which merging stops.
        verbose: When true (the default) progress is printed on every step,
            including the full cluster distance matrix.

    Attributes:
        hc: History of the clustering.  ``hc[0]`` is the initial partition
            (one sample index per cluster) and ``hc[-1]`` is the final one.
            Each partition is a list of clusters, each cluster a list of
            sample indices.
        cnt: Number of clustering steps performed so far, counting the final
            step that only detects termination.

    Raises:
        ValueError: If ``categorynums`` cannot be reached from the current
            number of clusters, or the feature vectors differ in length.
    """

    def __init__(self, data, categorynums, verbose=True):
        self.Clustering_Data = data
        self.hc = []
        self.cnt = 0
        self.categorynums = categorynums
        self.verbose = verbose
        self.start()

    def _log(self, *args):
        """Print ``args`` unless verbose output has been switched off."""
        if getattr(self, 'verbose', True):
            print(*args)

    def indmin_matrix(self, M):
        """Return ``(row, col)`` of the first minimum of the 2-D matrix ``M``."""
        row, col = divmod(np.argmin(M), np.shape(M)[1])
        return row, col

    def em(self, A, B):
        """Return the Euclidean distance between the vectors ``A`` and ``B``."""
        diff = np.asarray(A, dtype=float) - np.asarray(B, dtype=float)
        # Summing over the first axis mirrors what the built-in ``sum`` did.
        return np.sqrt(np.sum(np.square(diff), axis=0))

    def AverageLinkage(self, A, B):
        """Return the mean Euclidean distance over all pairs ``(a, b)``.

        ``A`` and ``B`` are sequences of feature vectors.  This is the
        straightforward definition of average linkage; ``start`` computes the
        same quantity from running sums instead of calling this per pair.

        Raises:
            ZeroDivisionError: If ``A`` or ``B`` is empty.
        """
        total = 0.0
        for a in A:
            for b in B:
                total += self.em(a, b)
        # len() is used instead of np.shape(): the latter builds an array and
        # fails on ragged input with NumPy >= 1.24.
        return total / (len(A) * len(B))

    def _feature_matrix(self):
        """Stack the last element of every sample into a 2-D float array."""
        # Plain Python indexing is used instead of np.array(data)[:, -1]:
        # samples such as [[label], [f1, f2, ...]] are ragged, and NumPy >= 1.24
        # refuses to build an array from them.
        vectors = [np.asarray(sample[-1], dtype=float).ravel()
                   for sample in self.Clustering_Data]
        if len({vector.size for vector in vectors}) != 1:
            raise ValueError('all samples must have feature vectors of the same length')
        return np.vstack(vectors)

    def _linkage_sums(self, clusters):
        """Return summed point distances between clusters, and cluster sizes.

        ``sums[a, b]`` is the sum of Euclidean distances over every pair of
        points taken from cluster ``a`` and cluster ``b``; dividing it by
        ``sizes[a] * sizes[b]`` gives the average linkage.
        """
        features = self._feature_matrix()
        n_points = features.shape[0]
        point_dist = np.empty((n_points, n_points))
        # One row at a time keeps the temporary at (n_points, n_features).
        for i in range(n_points):
            delta = features - features[i]
            point_dist[i] = np.sqrt((delta * delta).sum(axis=1))
        # Collapse rows, then columns, of the point matrix onto the clusters.
        by_row = np.stack([point_dist[members].sum(axis=0) for members in clusters])
        sums = np.stack([by_row[:, members].sum(axis=1) for members in clusters], axis=1)
        sizes = np.array([len(members) for members in clusters], dtype=float)
        return sums, sizes

    def start(self):
        """Merge clusters until ``categorynums`` remain and return ``self.hc``.

        The method is iterative (the earlier version recursed once per merge
        and hit Python's recursion limit on inputs with many samples).  It may
        be called again later; it then continues from ``self.hc[-1]``.

        Returns:
            The clustering history ``self.hc``.

        Raises:
            ValueError: If ``categorynums`` is not reachable by merging.
        """
        sums = sizes = None  # running linkage state, built on the first merge
        while True:
            self.cnt += 1
            self._log('\n\n===================%d times Hierarical Clustring================' % self.cnt)
            # First run: initialise the history with one cluster per sample.
            if not self.hc:
                self.hc = [[[i] for i in range(len(self.Clustering_Data))]]
                self._log('init self.hc:', self.hc)
            preHC = self.hc[-1]
            n = len(preHC)
            self._log('preHC:', preHC)

            # Stop once the requested number of clusters remains.
            if self.categorynums == n:
                self._log('succeed hierarical clustring:\n')
                for level in self.hc:
                    self._log(level)
                return self.hc

            # Merging only ever lowers the cluster count, so any other target
            # would never be reached; fail instead of looping forever.
            if not 1 <= self.categorynums < n:
                raise ValueError(
                    'categorynums must be between 1 and the current number of '
                    'clusters (%d), got %r' % (n, self.categorynums))

            if sums is None:
                sums, sizes = self._linkage_sums(preHC)

            # Average linkage for every cluster pair; only the upper triangle
            # is filled so each pair is considered once.
            dist = np.full(shape=(n, n), fill_value=np.inf)
            upper = np.triu_indices(n, k=1)
            dist[upper] = sums[upper] / (sizes[upper[0]] * sizes[upper[1]])
            self._log('dist:\n', dist)

            row, col = self.indmin_matrix(dist)
            row, col = int(row), int(col)
            if row >= col:
                # Happens only when every distance is infinite (argmin then
                # returns the diagonal); fall back to the first real pair
                # rather than merging a cluster with itself.
                row, col = 0, 1

            # The merged cluster takes the place of `row`; `col` disappears.
            merged = preHC[row] + preHC[col]
            newHC = [merged if i == row else cluster
                     for i, cluster in enumerate(preHC) if i != col]
            self.hc.append(newHC)

            # Fold `col` into `row` in the running sums instead of recomputing
            # every point-to-point distance on the next step.
            sums[row, :] += sums[col, :]
            sums[:, row] += sums[:, col]
            sums = np.delete(np.delete(sums, col, axis=0), col, axis=1)
            sizes[row] += sizes[col]
            sizes = np.delete(sizes, col)


if __name__ == '__main__':
    import sys

    # The CSV path may be passed as the first command-line argument; without
    # one the original hard-coded location is used.
    csv_path = (sys.argv[1] if len(sys.argv) > 1
                else '../../../../Tencent/2668630468/FileRecv/BWGHT.csv')

    # NOTE: an earlier comment here mentioned a sample count "n" (recommended
    # below 100 for speed), but no sub-sampling is performed: every row of the
    # CSV is clustered.
    df = pd.read_csv(csv_path)
    df = df.fillna(0)
    df = df.reset_index()
    # Replace the 'index' column with the row number as a float (0.0, 1.0, ...).
    df['index'] = np.arange(df.shape[0], dtype=float)

    data = []
    for values in df.to_numpy():
        linelist = list(values)
        # Each sample is [[last column], feature list]; HC only reads the
        # final element, i.e. the feature list.
        # NOTE(review): the slice 0:-2 drops the last TWO columns and keeps the
        # synthetic 'index' column as a feature -- confirm this is intended.
        data.append([[linelist[-1]], linelist[0:-2]])

    # 5 is the number of clusters to produce; change it as needed.
    hc = HC(data, 5)
CSDN数据链接如下:数据链接
本文使用 numpy 完成了层次聚类的实现。层次聚类的思路虽然简单,但时间复杂度很高;不过可以加以优化:记录上一次迭代的距离矩阵,每次合并后只更新与新簇相关的行和列,而不必重新计算整个距离矩阵,这样速度会快上不少。