1-1 初识pandas
创建序列
import numpy as np
import pandas as pd

# Build a one-dimensional Series from a plain list. The list mixes integers
# with np.nan, and the resulting dtype is float64 (see the output below).
s = pd.Series([1, 3, 6, np.nan, 44, 1])
print(s)
# Expected output:
# 0     1.0
# 1     3.0
# 2     6.0
# 3     NaN
# 4    44.0
# 5     1.0
# dtype: float64
创建日期型数据,共生成六个连续的日期
# Six consecutive dates starting at 2016-01-01; the printed index reports
# freq='D' (daily).
date = pd.date_range('2016-01-01', periods=6)
print(date)
# Expected output:
# DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
#                '2016-01-05', '2016-01-06'],
#               dtype='datetime64[ns]', freq='D')
创建带有行索引和列名的数据
# 6x4 DataFrame of uniform random numbers in [0, 1). Rows are labelled with
# the `date` index created just above, columns with the letters a-d.
df = pd.DataFrame(
    np.random.rand(6, 4),
    index=date,
    columns=['a', 'b', 'c', 'd'],
)
print(df)
# Sample output (the values are random, so they differ on every run):
#                    a         b         c         d
# 2016-01-01  0.113951  0.583000  0.167336  0.917897
# 2016-01-02  0.632843  0.950597  0.280311  0.946806
# 2016-01-03  0.367501  0.313236  0.475095  0.889570
# 2016-01-04  0.653676  0.444720  0.091550  0.272699
# 2016-01-05  0.448919  0.328602  0.644945  0.196358
# 2016-01-06  0.656723  0.355628  0.886951  0.688788
不加索引的矩阵
# Without explicit labels, rows and columns both get the default 0..n-1 index.
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
print(df1)

# Build a DataFrame from a dict, giving every column its own values and dtype.
# Scalars ('A', 'B', 'F') are broadcast to the length of the other columns.
# pd.Categorical stores values drawn from a fixed set of categories.
df2 = pd.DataFrame({
    'A': 1,
    'B': pd.Timestamp('20120202'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
print(df2)
# Expected output:
#    A          B    C  D      E    F
# 0  1 2012-02-02  1.0  3   test  foo
# 1  1 2012-02-02  1.0  3  train  foo
# 2  1 2012-02-02  1.0  3   test  foo
# 3  1 2012-02-02  1.0  3  train  foo

# Per-column dtypes of df2.
print(df2.dtypes)
# Expected output:
# A             int64
# B    datetime64[ns]
# C           float32
# D             int32
# E          category
# F            object
# dtype: object

# Column labels.
print(df2.columns)
# Expected output:
# Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

# Underlying values as a NumPy array.
print(df2.values)
# Expected output:
# [[1 Timestamp('2012-02-02 00:00:00') 1.0 3 'test' 'foo']
#  [1 Timestamp('2012-02-02 00:00:00') 1.0 3 'train' 'foo']
#  [1 Timestamp('2012-02-02 00:00:00') 1.0 3 'test' 'foo']
#  [1 Timestamp('2012-02-02 00:00:00') 1.0 3 'train' 'foo']]

# Summary statistics.
print(df2.describe())
# Output recorded by the author (NOTE(review): the set of columns included
# may differ between pandas versions -- confirm on the version in use):
#          A    C    D
# count  4.0  4.0  4.0
# mean   1.0  1.0  3.0
# std    0.0  0.0  0.0
# min    1.0  1.0  3.0
# 25%    1.0  1.0  3.0
# 50%    1.0  1.0  3.0
# 75%    1.0  1.0  3.0
# max    1.0  1.0  3.0

# Transpose: rows become columns and vice versa.
print(df2.T)

# Sort by labels: axis=1 orders the column labels, here in descending order.
print(df2.sort_index(axis=1, ascending=False))
# Expected output:
#      F      E  D    C          B  A
# 0  foo   test  3  1.0 2012-02-02  1
# 1  foo  train  3  1.0 2012-02-02  1
# 2  foo   test  3  1.0 2012-02-02  1
# 3  foo  train  3  1.0 2012-02-02  1

# Sort the rows by the values in column 'E'.
print(df2.sort_values(by='E'))
# Expected output:
#    A          B    C  D      E    F
# 0  1 2012-02-02  1.0  3   test  foo
# 2  1 2012-02-02  1.0  3   test  foo
# 1  1 2012-02-02  1.0  3  train  foo
# 3  1 2012-02-02  1.0  3  train  foo
1-2 数据处理
import numpy as np
import pandas as pd

# Create the sample data used throughout this section: values 0..23 laid out
# as 6 rows x 4 columns, indexed by six consecutive dates.
date = pd.date_range('20130101', periods=6)
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=date,
    columns=['A', 'B', 'C', 'D'],
)
print(df)

# Two equivalent ways to select a single column.
print(df['A'])
print(df.A)

# First three rows (positional slice).
print(df[0:3])

# Last three rows (label slice on the date index; both end points are included).
print(df['2013-01-04':'2013-01-06'])

# Select by label: loc
print(df.loc['2013-01-06'])
# Expected output (the integer dtype is whatever np.arange produced on the
# author's machine; it may print as int64 elsewhere):
# A    20
# B    21
# C    22
# D    23
# Name: 2013-01-06 00:00:00, dtype: int32

print(df.loc[:, ['A', 'B']])
# Expected output:
#              A   B
# 2013-01-01   0   1
# 2013-01-02   4   5
# 2013-01-03   8   9
# 2013-01-04  12  13
# 2013-01-05  16  17
# 2013-01-06  20  21

print(df.loc['2013-01-03', ['A', 'B']])
# Expected output:
# A    8
# B    9
# Name: 2013-01-03 00:00:00, dtype: int32

# Select by position: iloc
print(df.iloc[[1, 3, 5], 1:3])
# Expected output:
#              B   C
# 2013-01-02   5   6
# 2013-01-04  13  14
# 2013-01-06  21  22

# Mixed label/position selection used to be spelled
#     df.ix[:3, ['A', 'B', 'C']]
# but the .ix indexer has been removed from pandas. An equivalent is:
#     df.loc[df.index[:3], ['A', 'B', 'C']]

# Boolean indexing: an element-wise mask, then rows filtered by a column mask.
print(df)
print(df < 8)
print(df[df.A < 8])
# Expected output of the last two prints:
#                 A      B      C      D
# 2013-01-01   True   True   True   True
# 2013-01-02   True   True   True   True
# 2013-01-03  False  False  False  False
# 2013-01-04  False  False  False  False
# 2013-01-05  False  False  False  False
# 2013-01-06  False  False  False  False
#             A  B  C  D
# 2013-01-01  0  1  2  3
# 2013-01-02  4  5  6  7
1-3 设置数据值
import numpy as np
import pandas as pd

# Sample data: values 0..23 as 6 rows x 4 columns, indexed by six dates.
date = pd.date_range('20100101', periods=6)
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=date,
    columns=['A', 'B', 'C', 'D'],
)
print(df)

# Set a value by position: iloc (third row, third column).
df.iloc[2, 2] = 111
print(df)

# Set a value by label: loc.
df.loc['20100103', 'B'] = 999
print(df)

# Alternative (left disabled): overwrite every column of the rows where A > 0.
#     df[df.A > 0] = 0
#     print(df)
# Result if enabled:
#             A  B  C  D
# 2010-01-01  0  1  2  3
# 2010-01-02  0  0  0  0
# 2010-01-03  0  0  0  0
# 2010-01-04  0  0  0  0
# 2010-01-05  0  0  0  0
# 2010-01-06  0  0  0  0

# Alternative (left disabled): overwrite only column A where A > 3.
# The notes originally used the chained form `df.A[df.A>3] = 0`; a single
# .loc call avoids assigning through an intermediate object.
#     df.loc[df.A > 3, 'A'] = 0
#     print(df)
# Result if enabled:
#             A    B    C   D
# 2010-01-01  0    1    2   3
# 2010-01-02  0    5    6   7
# 2010-01-03  0  999  111  11
# 2010-01-04  0   13   14  15
# 2010-01-05  0   17   18  19
# 2010-01-06  0   21   22  23

# Add a new column filled with NaN.
df['F'] = np.nan
print(df)
# Expected output:
#              A    B    C   D   F
# 2010-01-01   0    1    2   3 NaN
# 2010-01-02   4    5    6   7 NaN
# 2010-01-03   8  999  111  11 NaN
# 2010-01-04  12   13   14  15 NaN
# 2010-01-05  16   17   18  19 NaN
# 2010-01-06  20   21   22  23 NaN

# Add a new column from a Series; its index is built from the same date
# range as df's index, so every row receives a value.
df['E'] = pd.Series(
    [1, 2, 3, 4, 5, 6],
    index=pd.date_range('20100101', periods=6),
)
print(df)
# Expected output:
#              A    B    C   D   F  E
# 2010-01-01   0    1    2   3 NaN  1
# 2010-01-02   4    5    6   7 NaN  2
# 2010-01-03   8  999  111  11 NaN  3
# 2010-01-04  12   13   14  15 NaN  4
# 2010-01-05  16   17   18  19 NaN  5
# 2010-01-06  20   21   22  23 NaN  6
1-4 处理丢失的数据
import numpy as np
import pandas as pd

# Original data: values 0..23 as 6 rows x 4 columns, indexed by six dates.
date = pd.date_range('20100101', periods=6)
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=date,
    columns=['A', 'B', 'C', 'D'],
)
print(df)

# NaN is a float, so cast the two columns that will receive it before the
# assignment. Writing NaN straight into an integer column relies on silent
# upcasting, which recent pandas versions deprecate. The printed result is
# the same as before: B and C show as floats, A and D stay integers.
df = df.astype({'B': 'float64', 'C': 'float64'})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)

# Drop data containing NaN.
#   how='any' -> drop if at least one NaN is present in the row/column
#   how='all' -> drop only if every value in the row/column is NaN
print(df.dropna(axis=0, how='any'))  # axis=0: drop the rows that contain NaN
print(df.dropna(axis=1, how='any'))  # axis=1: drop the columns that contain NaN
# Expected output:
#              A     B     C   D
# 2010-01-03   8   9.0  10.0  11
# 2010-01-04  12  13.0  14.0  15
# 2010-01-05  16  17.0  18.0  19
# 2010-01-06  20  21.0  22.0  23
#              A   D
# 2010-01-01   0   3
# 2010-01-02   4   7
# 2010-01-03   8  11
# 2010-01-04  12  15
# 2010-01-05  16  19
# 2010-01-06  20  23

# Replace NaN with another value, e.g. 0.
print(df.fillna(value=0))
# Expected output:
#              A     B     C   D
# 2010-01-01   0   0.0   2.0   3
# 2010-01-02   4   5.0   0.0   7
# 2010-01-03   8   9.0  10.0  11
# 2010-01-04  12  13.0  14.0  15
# 2010-01-05  16  17.0  18.0  19
# 2010-01-06  20  21.0  22.0  23

# Element-wise mask: True where a value is missing.
print(df.isnull())
# Expected output:
#                 A      B      C      D
# 2010-01-01  False   True  False  False
# 2010-01-02  False  False   True  False
# 2010-01-03  False  False  False  False
# 2010-01-04  False  False  False  False
# 2010-01-05  False  False  False  False
# 2010-01-06  False  False  False  False

# Is at least one value missing anywhere in the frame?
print(np.any(df.isnull()))  # True
1-5 导入与导出数据
import pandas as pd

# Read a CSV file into a DataFrame. The file can live in any directory, as
# long as the path given to read_csv points to it.
data = pd.read_csv('C:/Users/liyuelong/Desktop/student.csv')
print(data)

# Save the data in another format (pickle). Per the original note, the output
# path should use the same directory as the source file.
data.to_pickle('C:/Users/liyuelong/Desktop/student.pickle')