# Compute the Shannon entropy of a given data set.
from collections import Counter
from math import log


def calcShannonEnt(dataSet):
    """Return the Shannon entropy (in bits) of the class labels in *dataSet*.

    Args:
        dataSet: A sized sequence of samples. Each sample is an indexable
            sequence whose last element is taken as the class label.

    Returns:
        The entropy ``-sum(p * log2(p))`` over the label frequencies, as a
        float. An empty data set yields ``0.0`` because there are no labels
        to sum over.
    """
    numEntries = len(dataSet)  # total number of samples

    # Tally how often each label (last column of every sample) occurs,
    # e.g. {'yes': 2, 'no': 3}. This is equivalent to the manual
    # ``counts[label] = counts.get(label, 0) + 1`` pattern, or to an explicit
    # ``if label not in counts`` check followed by an increment.
    labelCounts = Counter(featVec[-1] for featVec in dataSet)

    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = count / numEntries  # relative frequency of this label
        shannonEnt -= prob * log(prob, 2)  # entropy formula, base 2
    return shannonEnt


def createDataSet():
    """Return a small sample data set and the names of its feature columns.

    Returns:
        A ``(dataSet, labels)`` tuple. ``dataSet`` is a list of five samples,
        each ``[feature_0, feature_1, class_label]``; ``labels`` holds the
        names of the two feature columns.
    """
    dataSet = [
        [1, 1, 'maybe'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels