# Library imports for the whole analysis, one per line per convention.
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from numpy.random import randn

# Pin the RNG seed so any randomness below is reproducible across runs.
np.random.seed(123)
2011 年,短网址服务商Bitly与美国政府网站 USA.gov 合作,提供从以 .gov/.mil 结尾的短网址的用户收集的匿名数据。以每小时快照为例,文件中各行的格式为 JSON(即 JavaScript Object Notation,一种常用的 Web 数据格式),该数据集共有十八个维度。若只读取某个文件中的第一行,所看到的结果如下:
# Peek at the raw file: every line is one JSON-encoded record.
path = 'datasets/bitly_usagov/example.txt'
# Use a context manager so the file handle is closed deterministically
# (the original `open(path).readline()` leaked the handle); encoding
# matches the read below that parses the whole file.
with open(path, encoding='utf-8') as f:
    f.readline()
'''
'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'
'''
# Parse the file line by line with json.loads(), yielding a list of dicts.
import json

path = 'datasets/bitly_usagov/example.txt'
# Context manager closes the file even if a line fails to parse
# (the original comprehension left the handle open).
with open(path, encoding='utf-8') as f:
    records = [json.loads(line) for line in f]

# Inspect the first parsed record.
records[0]
'''
{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
 'c': 'US', 'nk': 1, 'tz': 'America/New_York', 'gr': 'MA', 'g': 'A6qOVH',
 'h': 'wfLQtf', 'l': 'orofrog', 'al': 'en-US,en;q=0.8', 'hh': '1.usa.gov',
 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991',
 't': 1331923247, 'hc': 1331822918, 'cy': 'Danvers',
 'll': [42.576698, -70.954903]}
'''
找到数据集中最常出现的时区(tz字段)
# Extract the time zones with a list comprehension.
# Not every record carries a 'tz' key, so the naive version raises and
# would abort the whole script — shown here commented out for reference:
#
#     time_zones = [rec['tz'] for rec in records]
#
'''
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-5-f3fbbc37f129> in <module>
----> 1 time_zones = [rec['tz'] for rec in records]
<ipython-input-5-f3fbbc37f129> in <listcomp>(.0)
----> 1 time_zones = [rec['tz'] for rec in records]
KeyError: 'tz'
'''
# Guard the comprehension with a membership test. Some time zones are
# empty strings; they could be filtered out too (not done here).
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
time_zones[:10]
'''
['America/New_York', 'America/Denver', 'America/New_York',
 'America/Sao_Paulo', 'America/New_York', 'America/New_York',
 'Europe/Warsaw', '', '', '']
'''
纯python下通过定义函数来实现计数
def get_counts(sequence):
    """Tally how many times each value occurs in `sequence`.

    Returns a plain dict mapping value -> occurrence count.
    """
    counts = {}
    for item in sequence:
        # dict.get() collapses the original if/else branch into one line
        counts[item] = counts.get(item, 0) + 1
    return counts


# An alternative implementation of the same tally using defaultdict.
from collections import defaultdict


def get_counts2(sequence):
    """Tally occurrences in `sequence` via collections.defaultdict."""
    counts = defaultdict(int)  # missing keys start at 0
    for item in sequence:
        counts[item] += 1
    return counts
# Tally the extracted time zones into a dict of counts.
counts = get_counts(time_zones)

# How many records fell in the 'America/New_York' zone?
counts['America/New_York']
'''1251'''

# Total number of records carrying a tz field (empty strings included).
len(time_zones)
'''3440'''
def top_counts(count_dict, n=10):
    """Return the n (count, key) pairs with the largest counts, ascending."""
    # Sorting (count, key) tuples orders primarily by count.
    pairs = sorted((count, tz) for tz, count in count_dict.items())
    return pairs[-n:]


# Feed the tally dict to the helper to get the ten busiest time zones.
top_counts(counts)
'''
[(33, 'America/Sao_Paulo'),
 (35, 'Europe/Madrid'),
 (36, 'Pacific/Honolulu'),
 (37, 'Asia/Tokyo'),
 (74, 'Europe/London'),
 (191, 'America/Denver'),
 (382, 'America/Los_Angeles'),
 (400, 'America/Chicago'),
 (521, ''),
 (1251, 'America/New_York')]
'''
纯python下利用标准库collections.Counter()类实现计数
from collections import Counter

# Counter performs the tallying and ranking in a single step.
counts = Counter(time_zones)
counts.most_common(10)
'''
[('America/New_York', 1251),
 ('', 521),
 ('America/Chicago', 400),
 ('America/Los_Angeles', 382),
 ('America/Denver', 191),
 ('Europe/London', 74),
 ('Asia/Tokyo', 37),
 ('Pacific/Honolulu', 36),
 ('Europe/Madrid', 35),
 ('America/Sao_Paulo', 33)]
'''
# Build a DataFrame directly from the list of parsed JSON records.
frame = pd.DataFrame(records)
frame.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3560 entries, 0 to 3559
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   a            3440 non-null   object
 1   c            2919 non-null   object
 2   nk           3440 non-null   float64
 3   tz           3440 non-null   object
 4   gr           2919 non-null   object
 5   g            3440 non-null   object
 6   h            3440 non-null   object
 7   l            3440 non-null   object
 8   al           3094 non-null   object
 9   hh           3440 non-null   object
 10  r            3440 non-null   object
 11  u            3440 non-null   object
 12  t            3440 non-null   float64
 13  hc           3440 non-null   float64
 14  cy           2919 non-null   object
 15  ll           2919 non-null   object
 16  _heartbeat_  120 non-null    float64
 17  kw           93 non-null     object
dtypes: float64(4), object(14)
memory usage: 500.8+ KB
'''

# First ten time-zone values via slicing (note the empty strings).
frame['tz'][:10]
'''
0     America/New_York
1      America/Denver
2    America/New_York
3    America/Sao_Paulo
4    America/New_York
5    America/New_York
6       Europe/Warsaw
7
8
9
Name: tz, dtype: object
'''

# Series.value_counts() ranks the time zones by frequency.
tz_counts = frame['tz'].value_counts()
tz_counts[:10]
'''
America/New_York       1251
                        521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
America/Sao_Paulo        33
Name: tz, dtype: int64
'''
# Replace missing (NaN) time zones with a sentinel label.
clean_tz = frame['tz'].fillna('Missing')
# Replace empty-string time zones with a second sentinel.
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()
tz_counts[:10]
'''
America/New_York       1251
Unknown                 521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Missing                 120
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
Name: tz, dtype: int64
'''

# Plot the ten most common (cleaned) time zones.
import seaborn as sns
# NOTE: `%matplotlib inline` is IPython-only magic and a syntax error in a
# plain .py file; enable it manually when running inside Jupyter.
subset = tz_counts[:10]
sns.barplot(y=subset.index, x=subset.values)
# Field 'a' holds the browser / device / application string that
# performed the URL shortening. Sample a few rows:
frame['a'][1]
'''
'GoogleMaps/RochesterNY'
'''

frame['a'][50]
'''
'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'
'''

# Row 52 is long; show only its first 50 characters.
frame['a'][51][:50]  # long line
'''
'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P9'
'''
# Pull the interesting part out of field 'a': the first
# whitespace-separated token roughly identifies the browser/client.
results = pd.Series([agent.split()[0] for agent in frame.a.dropna()])
results[:5]
'''
0               Mozilla/5.0
1    GoogleMaps/RochesterNY
2               Mozilla/4.0
3               Mozilla/5.0
4               Mozilla/5.0
dtype: object
'''

results.value_counts()[:8]
'''
Mozilla/5.0                 2594
Mozilla/4.0                  601
GoogleMaps/RochesterNY       121
Opera/9.80                    34
TEST_INTERNET_AGENT           24
GoogleProducer                21
Mozilla/6.0                    5
BlackBerry8520/5.0.0.681       4
dtype: int64
'''
将时区计数多的时区记录分解为Windows和非Windows用户,并统计相同时区下其占比
# Drop rows whose agent string is missing.
# .copy() makes cframe an independent DataFrame, so the column
# assignment below does not write into a view of `frame`
# (avoids pandas' SettingWithCopyWarning / silent no-op).
cframe = frame[frame.a.notnull()].copy()

# Classify each record as Windows vs. non-Windows from the agent string.
cframe['os'] = np.where(cframe['a'].str.contains('Windows'),
                        'Windows', 'Not Windows')
cframe['os'][:5]
'''
0        Windows
1    Not Windows
2        Windows
3    Not Windows
4        Windows
Name: os, dtype: object
'''

# Group by (time zone, operating system).
by_tz_os = cframe.groupby(['tz', 'os'])
# size() counts rows per group; unstack() pivots os into columns.
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]
os | Not Windows | Windows |
---|---|---|
tz | ||
245.0 | 276.0 | |
Africa/Cairo | 0.0 | 3.0 |
Africa/Casablanca | 0.0 | 1.0 |
Africa/Ceuta | 0.0 | 2.0 |
Africa/Johannesburg | 0.0 | 1.0 |
Africa/Lusaka | 0.0 | 1.0 |
America/Anchorage | 4.0 | 1.0 |
America/Argentina/Buenos_Aires | 1.0 | 0.0 |
America/Argentina/Cordoba | 0.0 | 1.0 |
America/Argentina/Mendoza | 0.0 | 1.0 |
# Rank time zones by their overall row count.
# sum(axis=1) totals each time zone across both os columns;
# argsort() gives the positional order from smallest to largest total.
row_totals = agg_counts.sum(axis=1)
indexer = row_totals.argsort()
indexer[-10:]
'''
tz
Europe/Sofia        35
Europe/Stockholm    78
Europe/Uzhgorod     96
Europe/Vienna       59
Europe/Vilnius      77
Europe/Volgograd    15
Europe/Warsaw       22
Europe/Zurich       12
Pacific/Auckland     0
Pacific/Honolulu    29
dtype: int64
'''

# take() selects rows by position (default axis=0) — here the ten
# time zones with the largest totals.
count_subset = agg_counts.take(indexer[-10:])
count_subset
os | Not Windows | Windows |
---|---|---|
tz | ||
America/Sao_Paulo | 13.0 | 20.0 |
Europe/Madrid | 16.0 | 19.0 |
Pacific/Honolulu | 0.0 | 36.0 |
Asia/Tokyo | 2.0 | 35.0 |
Europe/London | 43.0 | 31.0 |
America/Denver | 132.0 | 59.0 |
America/Los_Angeles | 130.0 | 252.0 |
America/Chicago | 115.0 | 285.0 |
245.0 | 276.0 | |
America/New_York | 339.0 | 912.0 |
# nlargest() reaches the same top-10 totals directly (row order differs
# from the take() approach above). Use the keyword form `axis=1` — the
# positional `sum(1)` is deprecated in modern pandas and matches the
# style used elsewhere in this file.
agg_counts.sum(axis=1).nlargest(10)
'''
tz
America/New_York       1251.0
                        521.0
America/Chicago         400.0
America/Los_Angeles     382.0
America/Denver          191.0
Europe/London            74.0
Asia/Tokyo               37.0
Pacific/Honolulu         36.0
Europe/Madrid            35.0
America/Sao_Paulo        33.0
dtype: float64
'''
# Reshape for plotting: stack() folds the os columns back into rows,
# producing a Series with a (tz, os) MultiIndex.
count_subset = count_subset.stack()
count_subset
'''
tz                   os
America/Sao_Paulo    Not Windows     13.0
                     Windows         20.0
Europe/Madrid        Not Windows     16.0
                     Windows         19.0
Pacific/Honolulu     Not Windows      0.0
                     Windows         36.0
Asia/Tokyo           Not Windows      2.0
                     Windows         35.0
Europe/London        Not Windows     43.0
                     Windows         31.0
America/Denver       Not Windows    132.0
                     Windows         59.0
America/Los_Angeles  Not Windows    130.0
                     Windows        252.0
America/Chicago      Not Windows    115.0
                     Windows        285.0
                     Not Windows    245.0
                     Windows        276.0
America/New_York     Not Windows    339.0
                     Windows        912.0
dtype: float64
'''

# Name the value column 'total'; the hierarchical index becomes
# ordinary columns after reset_index().
count_subset.name = 'total'
count_subset = count_subset.reset_index()
count_subset[:10]
tz | os | total | |
---|---|---|---|
0 | America/Sao_Paulo | Not Windows | 13.0 |
1 | America/Sao_Paulo | Windows | 20.0 |
2 | Europe/Madrid | Not Windows | 16.0 |
3 | Europe/Madrid | Windows | 19.0 |
4 | Pacific/Honolulu | Not Windows | 0.0 |
5 | Pacific/Honolulu | Windows | 36.0 |
6 | Asia/Tokyo | Not Windows | 2.0 |
7 | Asia/Tokyo | Windows | 35.0 |
8 | Europe/London | Not Windows | 43.0 |
9 | Europe/London | Windows | 31.0 |
# Raw counts of Windows vs. non-Windows users within each time zone.
sns.barplot(data=count_subset, x='total', y='tz', hue='os')
def norm_total(grp):
    """Add a 'normed_total' column: each row's share of the group total.

    Within one time-zone group the shares sum to 1.
    """
    total = grp.total
    grp['normed_total'] = total / total.sum()
    return grp


# Normalize inside every time-zone group, then plot the shares.
results = count_subset.groupby('tz').apply(norm_total)
sns.barplot(x='normed_total', y='tz', hue='os', data=results)
# Alternative normalization without apply(): transform('sum') broadcasts
# each group's total back onto its rows, so plain division yields the
# per-row shares.
grouped = count_subset.groupby('tz')
results2 = count_subset.total / grouped.total.transform('sum')