数据清洗是数据分析关键的一步,直接影响之后的处理工作
数据需要修改吗?有什么需要修改的吗?数据应该怎么调整才能适用于接下来的分析和挖掘?
数据清洗是一个迭代的过程,实际项目中可能需要不止一次地执行这些清洗操作
import numpy as np
import pandas as pd

# Build a small frame whose (data1, data2) rows are likely to repeat:
# 'data1' holds four 'a' then four 'b'; 'data2' holds random ints in [0, 4).
df_obj = pd.DataFrame({'data1': ['a'] * 4 + ['b'] * 4,
                       'data2': np.random.randint(0, 4, 8)})
print(df_obj)
# duplicated() returns a boolean Series marking rows that repeat an earlier row.
print(df_obj.duplicated())
运行结果:
# print(df_obj)
  data1  data2
0     a      3
1     a      2
2     a      3
3     a      3
4     b      1
5     b      0
6     b      3
7     b      0

# print(df_obj.duplicated())
0    False
1    False
2     True
3     True
4    False
5    False
6    False
7     True
dtype: bool
# Drop rows that repeat an earlier row across all columns.
print(df_obj.drop_duplicates())
# Drop rows whose 'data2' value has already appeared in an earlier row.
print(df_obj.drop_duplicates(subset='data2'))
运行结果:
# print(df_obj.drop_duplicates())
  data1  data2
0     a      3
1     a      2
4     b      1
5     b      0
6     b      3

# print(df_obj.drop_duplicates('data2'))
  data1  data2
0     a      3
1     a      2
4     b      1
5     b      0
Series 的 map 方法根据传入的函数对 Series 中的每个元素逐一进行转换
# Ten random integers in [0, 10) to demonstrate element-wise mapping.
ser_obj = pd.Series(np.random.randint(0, 10, 10))
print(ser_obj)
# map() applies the function to every element and returns a new Series.
print(ser_obj.map(lambda x: x ** 2))
运行结果:
# print(ser_obj)
0    1
1    4
2    8
3    6
4    8
5    6
6    6
7    4
8    7
9    3
dtype: int64

# print(ser_obj.map(lambda x : x ** 2))
0     1
1    16
2    64
3    36
4    64
5    36
6    36
7    16
8    49
9     9
dtype: int64
# Replace a single value with a single value.
print(ser_obj.replace(1, -100))
# Replace several values with one value.
print(ser_obj.replace([6, 8], -100))
# Replace several values pairwise: 4 -> -100 and 7 -> -200.
print(ser_obj.replace([4, 7], [-100, -200]))
运行结果:
# print(ser_obj.replace(1, -100))
0   -100
1      4
2      8
3      6
4      8
5      6
6      6
7      4
8      7
9      3
dtype: int64

# print(ser_obj.replace([6, 8], -100))
0      1
1      4
2   -100
3   -100
4   -100
5   -100
6   -100
7      4
8      7
9      3
dtype: int64

# print(ser_obj.replace([4, 7], [-100, -200]))
0      1
1   -100
2      8
3      6
4      8
5      6
6      6
7   -100
8   -200
9      3
dtype: int64
根据单个或多个键将不同DataFrame的行连接起来
类似数据库的连接操作
pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None)
left:合并时左边的DataFrame
right:合并时右边的DataFrame
how:合并的方式,可选 'inner'、'outer'、'left'、'right',默认为 'inner'
on:用作连接键的列名,必须是两边 DataFrame 都有的列名;若未指定 on(且未指定其他连接键参数),则以 left 和 right 中列名的交集作为连接键
left_on: left Dataframe中用作连接键的列
right_on: right Dataframe中用作连接键的列
内连接 inner:对两张表都有的键的交集进行联合
示例代码:
import pandas as pd
import numpy as np

left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
# Join on the shared 'key' column (how defaults to 'inner').
# Printed explicitly: a bare expression shows nothing when run as a script.
print(pd.merge(left, right, on='key'))
运行结果:
  key   A   B   C   D
0  K0  A0  B0  C0  D0
1  K1  A1  B1  C1  D1
2  K2  A2  B2  C2  D2
3  K3  A3  B3  C3  D3
示例代码:
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                      'key2': ['K0', 'K0', 'K0', 'K0'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
# Join on several keys: a row pair matches only when key1 AND key2 are equal.
# Printed explicitly: a bare expression shows nothing when run as a script.
print(pd.merge(left, right, on=['key1', 'key2']))
运行结果:
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2
# Left join: keep every row of `left`; rows without a match in `right`
# get NaN in the columns that come from `right` (C and D).
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                      'key2': ['K0', 'K0', 'K0', 'K0'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
print(pd.merge(left, right, how='left', on=['key1', 'key2']))
# Output:
#   key1 key2   A   B    C    D
# 0   K0   K0  A0  B0   C0   D0
# 1   K0   K1  A1  B1  NaN  NaN
# 2   K1   K0  A2  B2   C1   D1
# 3   K1   K0  A2  B2   C2   D2
# 4   K2   K1  A3  B3  NaN  NaN
# Right join: keep every row of `right`; rows without a match in `left`
# get NaN in the columns that come from `left` (A and B).
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                      'key2': ['K0', 'K0', 'K0', 'K0'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
print(pd.merge(left, right, how='right', on=['key1', 'key2']))
# Output:
#   key1 key2    A    B   C   D
# 0   K0   K0   A0   B0  C0  D0
# 1   K1   K0   A2   B2  C1  D1
# 2   K1   K0   A2   B2  C2  D2
# 3   K2   K0  NaN  NaN  C3  D3
默认是“内连接”(inner),即结果中的键是交集
how指定连接方式
示例代码:
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                      'key2': ['K0', 'K0', 'K0', 'K0'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
# Outer join: keep the union of key pairs from both frames; the side that
# lacks a given key pair contributes NaN.
# Printed explicitly: a bare expression shows nothing when run as a script.
print(pd.merge(left, right, how='outer', on=['key1', 'key2']))
运行结果:
  key1 key2    A    B    C    D
0   K0   K0   A0   B0   C0   D0
1   K0   K1   A1   B1  NaN  NaN
2   K1   K0   A2   B2   C1   D1
3   K1   K0   A2   B2   C2   D2
4   K2   K1   A3   B3  NaN  NaN
5   K2   K0  NaN  NaN   C3   D3
参数suffixes:当两个DataFrame存在重名的非连接键列时,用于区分它们的后缀,默认为 ('_x', '_y')
示例代码:
# Handle overlapping column names: both frames have a non-key column 'data',
# so suffixes tells merge how to rename the left and right copies.
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data': np.random.randint(0, 10, 7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data': np.random.randint(0, 10, 3)})
print(pd.merge(df_obj1, df_obj2, on='key', suffixes=('_left', '_right')))
运行结果:
   data_left key  data_right
0          9   b           1
1          5   b           1
2          1   b           1
3          2   a           8
4          2   a           8
5          5   a           8
参数left_index=True或right_index=True:使用左边或右边DataFrame的行索引作为连接键
示例代码:
# Join on the index: match the 'key' column of the left frame against the
# row index of the right frame.
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1': np.random.randint(0, 10, 7)})
df_obj2 = pd.DataFrame({'data2': np.random.randint(0, 10, 3)},
                       index=['a', 'b', 'd'])
print(pd.merge(df_obj1, df_obj2, left_on='key', right_index=True))
运行结果:
   data1 key  data2
0      3   b      6
1      4   b      6
6      8   b      6
2      6   a      0
4      3   a      0
5      0   a      0
沿轴方向将多个对象合并到一起
np.concatenate
示例代码:
import numpy as np
import pandas as pd

# Two 3x4 arrays of random integers in [0, 10).
arr1 = np.random.randint(0, 10, (3, 4))
arr2 = np.random.randint(0, 10, (3, 4))

print(arr1)
print(arr2)

# The default axis=0 appends arr2's rows below arr1's, giving shape (6, 4).
print(np.concatenate([arr1, arr2]))
# axis=1 places the arrays side by side, giving shape (3, 8).
print(np.concatenate([arr1, arr2], axis=1))
运行结果:
# print(arr1)
[[3 3 0 8]
 [2 0 3 1]
 [4 8 8 2]]

# print(arr2)
[[6 8 7 3]
 [1 6 8 7]
 [1 4 7 1]]

# print(np.concatenate([arr1, arr2]))
[[3 3 0 8]
 [2 0 3 1]
 [4 8 8 2]
 [6 8 7 3]
 [1 6 8 7]
 [1 4 7 1]]

# print(np.concatenate([arr1, arr2], axis=1))
[[3 3 0 8 6 8 7 3]
 [2 0 3 1 1 6 8 7]
 [4 8 8 2 1 4 7 1]]
df1 = pd.DataFrame(np.arange(6).reshape(3, 2),
                   index=list('abc'), columns=['one', 'two'])
df2 = pd.DataFrame(np.arange(4).reshape(2, 2) + 5,
                   index=list('ac'), columns=['three', 'four'])

# Defaults: outer join along axis=0. Rows are stacked and columns missing
# from one frame are filled with NaN.
# Printed explicitly: a bare expression shows nothing when run as a script.
print(pd.concat([df1, df2]))
# Output as shown in the original article (column order may differ between
# pandas versions):
#    four  one  three  two
# a   NaN  0.0    NaN  1.0
# b   NaN  2.0    NaN  3.0
# c   NaN  4.0    NaN  5.0
# a   6.0  NaN    5.0  NaN
# c   8.0  NaN    7.0  NaN

# Concatenate along the columns (axis='columns' is the same as axis=1);
# rows are aligned on the index, still with an outer join.
print(pd.concat([df1, df2], axis='columns'))
# Output:
#    one  two  three  four
# a    0    1    5.0   6.0
# b    2    3    NaN   NaN
# c    4    5    7.0   8.0

# The join mode can also be set to 'inner': only index labels present in
# both frames ('a' and 'c') are kept.
print(pd.concat([df1, df2], axis=1, join='inner'))
# Output:
#    one  two  three  four
# a    0    1      5     6
# c    4    5      7     8
示例代码:
import numpy as np
import pandas as pd

# A 5x2 frame of random integers in [0, 10).
df_obj = pd.DataFrame(np.random.randint(0, 10, (5, 2)),
                      columns=['data1', 'data2'])
print(df_obj)

# stack() moves the column labels into an inner index level, producing a
# Series with a two-level (row label, column label) index.
stacked = df_obj.stack()
print(stacked)
运行结果:
# print(df_obj)
   data1  data2
0      7      9
1      7      8
2      8      9
3      4      1
4      1      2

# print(stacked)
0  data1    7
   data2    9
1  data1    7
   data2    8
2  data1    8
   data2    9
3  data1    4
   data2    1
4  data1    1
   data2    2
dtype: int64
示例代码:
# By default unstack() pivots the innermost index level into columns.
print(stacked.unstack())
# level selects which index level to pivot instead (0 is the outermost).
print(stacked.unstack(level=0))
运行结果:
# print(stacked.unstack())
   data1  data2
0      7      9
1      7      8
2      8      9
3      4      1
4      1      2

# print(stacked.unstack(level=0))
       0  1  2  3  4
data1  7  7  8  4  1
data2  9  8  9  1  2