1、groupby
df = pd.DataFrame({'key1':['a','a','b','b','a'], 'key2':['one','two','one','two','one'], 'data1':np.random.randn(5), 'data2':np.random.randn(5)}) df['data1'].groupby(df['key1']).mean() #按key1分组,并计算data1的均值 means =df['data1'].groupby([df['key1'],df['key2']]).mean() #传入多个数组进行分组 means.unstack() #分组键均为Series。实际上分组键可以是任何长度适当的数组 states = np.array(['Ohio','California','California','Ohio','Ohio']) years = np.array([2005,2005,2005,2005,2006]) df['data1'].groupby([states,years]).mean() #还可以将列名(可以是字符串、数字或其他python对象)用作分组键 df.groupby('key1').mean() df.groupby(['key1','key2']).mean() #size方法返回一个含有分组大小的Series df.groupby(['key1','key2']).size()
2、对分组进行迭代
1 #产生一组二元元组(由分组名和数据块组成) 2 for name ,group in df.groupby('key1'): 3 print(name) 4 print(group) 5 ’‘’ 6 a 7 key1 key2 data1 data2 8 0 a one 0.266786 0.430996 9 1 a two 0.168590 -0.488668 10 4 a one 0.585086 -0.215946 11 b 12 key1 key2 data1 data2 13 2 b one -1.239376 -0.860125 14 3 b two -0.022342 -0.169070 15 ‘’‘ 16 for (k1, k2) ,group in df.groupby(['key1','key2']): 17 print((k1,k2)) 18 print(group) 19 20 #将分组数据变为字典 21 pieces = dict(list(df.groupby('key1'))) 22 pieces['b']
3、选取一个或者一组列
1 #下面两者是等价的 2 df.groupby('key1')['data1'] #返回的是已经分组的SeriesGroupBy 3 df['data1'].groupby(df['key1']) 4 5 df.groupby('key1')[['data1']] #返回的是已经分组的DataFrameGroupBy 6 df[['data1']].groupby(df['key1']) 7 8 9 df.groupby(['key1','key2'])[['data1']].mean() 10 11 12 #通过字典或者Series进行分组 13 people =pd.DataFrame(np.random.randn(5,5),columns=['a','b','c','d','e'],index=['Joe','Steve','Wes','Jim','Travis']) 14 mapping ={'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'} 15 16 by_column = people.groupby(mapping,axis=1) 17 map_series =pd.Series(mapping) 18 people.groupby(map_series,axis=1).count()View Code
4、通过函数进行分组
people.groupby(len).sum()
5、根据索引级别进行分组
columns = pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],[1,3,5,1,3]],names=['city','tenor']) hier_df = pd.DataFrame(np.random.randn(4,5),columns=columns) hier_df.groupby(level='city',axis=1).count()
6、聚合
#自定义聚合函数 def peak_to_peak(arr): return arr.max() - arr.min() grouped.agg(peak_to_peak) #将定义的函数传入agg方法即可 #面向列的多函数应用 tips = pd.read_csv('examples/tips.csv') tips.head() ''' total_bill tip smoker day time size tip_pct 0 16.99 1.01 No Sun Dinner 2 0.059447 1 10.34 1.66 No Sun Dinner 3 0.160542 2 21.01 3.50 No Sun Dinner 3 0.166587 3 23.68 3.31 No Sun Dinner 2 0.139780 4 24.59 3.61 No Sun Dinner 4 0.146808 ''' #根据day、smoker对数据tips进行分组 grouped = tips.groupby(["day", "smoker"]) grouped_pct = grouped["tip_pct"] #将函数名以字符串形式传入 grouped_pct.agg('mean') #传入一组函数名得到的列就以相应的函数名命名 grouped_pct.agg(["mean", "std", peak_to_peak]) #传入以(name,function)为元组组成的列表,各元组的第一个名字就被作为列名 grouped_pct.agg([('foo','mean'),('bar',np.std)]) ''' foo bar day smoker Fri No 0.151650 0.028123 Yes 0.174783 0.051293 Sat No 0.158048 0.039767 Yes 0.147906 0.061375 Sun No 0.160113 0.042347 Yes 0.187250 0.154134 Thur No 0.160298 0.038774 Yes 0.163863 0.039389 ''' #可以定义一组应用于全部列的函数 functions = ["count", "mean", "max"] result = grouped1[["tip_pct", "total_bill"]].agg(functions) #可以传入带有自定义名称的元组列表 ftuples = [("Average", "mean"), ("Variance", np.var)] grouped[["tip_pct", "total_bill"]].agg(ftuples) #不同列应用不同函数 grouped.agg({"tip" : np.max, "size" : "sum"}) grouped.agg({"tip_pct" : ["min", "max", "mean", "std"],"size" : "sum"})
7、”无索引“形式返回聚合数据
tips.groupby(["day", "smoker"], as_index=False).mean()
8、分组级运算和转换
k1_means = df.groupby('key1').mean().add_prefix('mean_') ''' mean_data1 mean_data2 key1 a 0.340154 -0.091206 b -0.630859 -0.514597 ''' pd.merge(df,k1_means,left_on='key1',right_index=True) #以上可以看作np.mean函数对两个数据列进行转换,使用transform方法如下 key =['one','two','one','two','one'] people.groupby(key).mean() people.groupby(key).transform(np.mean)#transform将函数应用到各分组
9、apply一般性的拆分-应用-合并
appley会将待处理的对象拆分成多个片段,然后对个片段调用传入的函数,最后尝试将各片段组合到一起
#根据分组选出最高5个tip_pct值 def top(df,n =5, column='tip_pct'): return df.sort_values(column, ascending=False)[:n] top(tips,n=6) tips.groupby("smoker").apply(top) #如果传给apply的函数能够接受其他参数或者关键字,则可以将这些内容放在函数后面一并传入 tips.groupby(["smoker", "day"]).apply(top, n=1, column="total_bill") result = tips.groupby("smoker")["tip_pct"].describe() result.unstack("smoker") #如上在groupby中调用describe之类方法,实际上应用了以下代码 def f(group): return group.describe() grouped.apply(f)
禁止分组键
#group_keys=False to groupby 可以实现 tips.groupby("smoker", group_keys=False).apply(top)
分位数和桶分析
frame = pd.DataFrame({"data1": np.random.standard_normal(1000), "data2": np.random.standard_normal(1000)}) factor =pd.cut(frame.data1,4) #对cut返回的factor对象可直接用groupby def get_stats(group): return pd.DataFrame( {"min": group.min(), "max": group.max(),"count": group.count(), "mean": group.mean()}) grouped = frame.groupby(factor) grouped.apply(get_stats) grouped.agg(["min", "max", "count", "mean"]) #返回分位数编号 quartiles_samp = pd.qcut(frame["data1"], 4, labels=False) quartiles_samp.head() grouped = frame.groupby(quartiles_samp) grouped.apply(get_stats)