将序列转换为计数字典 {元素: 频度},然后根据频度排序。
1、使用 dict.fromkeys() 构造计数字典
from random import randint

# Build a random list of 30 integers, each in the range [0, 20].
L = [randint(0, 20) for _ in range(30)]
print(L)

# Create a dict whose keys are the distinct elements of L, every value
# initialised to 0.
d = dict.fromkeys(L, 0)
print(d)
# Sample output (varies per run):
# {20: 0, 3: 0, 9: 0, 7: 0, 6: 0, 14: 0, 8: 0, 19: 0, 15: 0, 18: 0,
#  12: 0, 4: 0, 17: 0, 5: 0, 1: 0, 0: 0, 2: 0}

# Count the frequency of each element. Every element of L is already a key
# of d (see fromkeys above), so the increment cannot raise KeyError.
for i in L:
    d[i] += 1
print(d)
# Sample output (varies per run):
# {20: 2, 3: 3, 9: 2, 7: 3, 6: 1, 14: 2, 8: 2, 19: 2, 15: 2, 18: 1,
#  12: 1, 4: 2, 17: 2, 5: 2, 1: 1, 0: 1, 2: 1}
2、使用 dict.setdefault() 构造计数字典
from random import randint

# Build a random list of 30 integers, each in the range [0, 20].
L = [randint(0, 20) for _ in range(30)]

# Count frequencies without pre-populating the dict: setdefault(i, 0)
# inserts 0 the first time a key is seen and returns the current count
# otherwise, so the "+ 1" works for both new and existing keys.
d = {}
for i in L:
    d[i] = d.setdefault(i, 0) + 1
print(d)
# Sample output (varies per run):
# {13: 2, 14: 2, 9: 2, 8: 3, 1: 2, 5: 2, 7: 1, 20: 2, 10: 3, 0: 2,
#  18: 1, 4: 1, 3: 1, 17: 2, 16: 1, 12: 2, 11: 1}
3、使用 heapq.nlargest(n, iterable, key=None) 按频度取出前 n 项
heapq.nlargest(n, iterable, key=key) 等价于(Equivalent to):
sorted(iterable, key=key, reverse=True)[:n]
from random import randint
import heapq

# Build a random list of 30 integers and count how often each one occurs.
L = [randint(0, 20) for _ in range(30)]
d = {}
for i in L:
    d[i] = d.setdefault(i, 0) + 1

# Approach 1: sort all (element, count) pairs by count, descending,
# and keep the top 3.
s = sorted(d.items(), key=lambda x: x[1], reverse=True)[:3]
print(s)
# Sample output (varies per run): [(16, 4), (19, 3), (13, 3)]

# Approach 2: use a heap to pick the 3 pairs with the largest counts
# without sorting the whole collection.
r = heapq.nlargest(3, d.items(), key=lambda x: x[1])
print(r)
# Sample output (varies per run): [(16, 4), (19, 3), (13, 3)]
4、使用 Counter 进行频度统计
这算是频度统计最简单的方式了:无需手动构造计数字典,可以直接对一个可迭代对象进行统计。
from collections import Counter

# The four ways of constructing a Counter:
c1 = Counter()                       # a new, empty counter
c2 = Counter('gallahad')             # a new counter from an iterable
c3 = Counter({'red': 4, 'blue': 2})  # a new counter from a mapping
c4 = Counter(cats=4, dogs=8)         # a new counter from keyword args
from random import randint
from collections import Counter

# Build a random list of 30 integers and count it directly with Counter;
# no hand-built counting dict is needed.
L = [randint(0, 20) for _ in range(30)]
c = Counter(L)
print(c)
# Sample output (varies per run):
# Counter({16: 4, 19: 3, 13: 3, 3: 3, 1: 2, 18: 2, 14: 2, 10: 2, 9: 2,
#          4: 2, 7: 1, 20: 1, 15: 1, 5: 1, 8: 1})

# most_common(n) returns the n most frequent (element, count) pairs;
# it is implemented on top of heapq.
r = c.most_common(3)
print(r)
# Sample output (varies per run): [(16, 4), (19, 3), (13, 3)]

# update() merges another Counter by ADDING its counts to the existing
# ones (unlike dict.update, which would overwrite them).
c2 = Counter(L)
c.update(c2)
print(c)
# Sample output (varies per run):
# Counter({16: 8, 19: 6, 13: 6, 3: 6, 1: 4, 18: 4, 14: 4, 10: 4, 9: 4,
#          4: 4, 7: 2, 20: 2, 15: 2, 5: 2, 8: 2})
from collections import Counter
import re

# Word-frequency count: report the 5 most common words in example.txt.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open('example.txt', encoding='utf-8') as f:
    txt = f.read()

# Split on runs of non-word characters. The pattern is a raw string so
# that \W reaches the regex engine instead of being treated as an
# (invalid) string escape. re.split yields '' when the text starts or
# ends with a separator; those empty strings are dropped so they are not
# counted as words.
w = [word for word in re.split(r'\W+', txt) if word]
print(w)

c2 = Counter(w)
r = c2.most_common(5)
print(r)
# Sample output (depends on the file's contents):
# [('a', 21), ('the', 16), ('to', 15), ('and', 12), ('Service', 8)]
参考文档