用pandas封装函数对数据进行读取,预处理,数据分析等操作。
pandas库是基于numpy库编写的, 在命令行窗口安装完numpy后,安装pandas:pip install pandas。
相关numpy库的内容参考
http://blog.csdn.net/cymy001/article/details/78163468
通常需要pandas读取的数据文件的文本格式为.txt,.csv,.json
pandas里定义的数据类型:
(1.)object字符值(2.)int整型(3.)float浮点型(4.)datetime时间值(5.)bool布尔值
# Python pandas introduction: load a data set, then inspect rows and columns.
#
# The same file could instead be opened with the standard-library csv module:
# import csv
# food_info=csv.reader('D:\PYTHON35\idle\database\pandas\food_info.csv')
# print(type(food_info))
# ##<class '_csv.reader'>
import pandas as pd
import os

food_info_site = r"D:\PYTHON35\idle\database\pandas\food_info.csv"
pwd = os.getcwd()  # working directory at start-up

# os.path.split yields (folder, file name): change into the folder and load
# the CSV by its bare file name.
# NOTE: pd.read_csv also accepts a full path, so the chdir is not something
# pandas itself requires.
data_dir, csv_name = os.path.split(food_info_site)
os.chdir(data_dir)
food_info = pd.read_csv(csv_name)

# read_csv hands back a DataFrame: <class 'pandas.core.frame.DataFrame'>
print(type(food_info))
# dtype of every column (one dtype per column)
print(food_info.dtypes)
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

# head(n) returns the first n rows; n defaults to 5.
first_rows = food_info.head(n=3)
print(first_rows)
print(food_info.columns)  # names of the columns (features)
print(food_info.shape)  # (number of rows, number of columns)
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

# loc selects rows by index label; the header line is not counted as a row.
# The pandas object dtype is what holds Python str values.
print(food_info.loc[0])
# Row labelled 6; a label that is not in the index raises KeyError.
print(food_info.loc[6])
# A label slice includes both end points: rows 3, 4, 5 and 6.
print(food_info.loc[3:6])
two_five_ten = [2, 5, 10]
print(food_info.loc[two_five_ten])  # rows 2, 5 and 10
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

# Indexing with a column name returns that single column.
ndb_col = food_info['NDB_No']
print(ndb_col)
print('_________________________________________')
# Indexing with a list of names returns several columns at once.
columns = ['Zinc_(mg)', 'Copper_(mg)']
zinc_copper = food_info[columns]
print(zinc_copper)
# Equivalent one-liner:
# print(food_info[['Zinc_(mg)','Copper_(mg)']])
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

# Pick out every column whose name ends with the unit "(g)".
print(food_info.columns)
print(food_info.head(2))
col_names = food_info.columns.tolist()  # column Index -> plain list of names
print(col_names)
gram_columns = [name for name in col_names if name.endswith('(g)')]
gram_df = food_info[gram_columns]  # only the gram-unit columns
print(gram_df.head(3))
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

# Column arithmetic: a scalar operation is applied to every value of the
# column, e.g. converting milligrams to grams.
iron_mg = food_info['Iron_(mg)']
div_1000 = iron_mg / 1000
add_100 = iron_mg + 100
sub_100 = iron_mg - 100
mult_2 = iron_mg * 2
# Arithmetic between two columns works position by position.
water_energy = food_info['Water_(g)'] * food_info['Energ_Kcal']
iron_grams = iron_mg / 1000
# Assigning to a column name that does not exist yet appends a new column.
food_info['Iron_(g)'] = iron_grams
weighted_protein = food_info['Protein_(g)'] * 2
weighted_fat = food_info['Lipid_Tot_(g)'] * -0.75
initial_rating = weighted_protein + weighted_fat  # combine two derived columns
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

# Normalise columns by dividing each one by its own maximum.
max_calories = food_info['Energ_Kcal'].max()
normalized_calories = food_info['Energ_Kcal'] / max_calories
protein = food_info['Protein_(g)']
normalized_protein = protein / protein.max()
food_info['Normalized_Protein'] = normalized_protein  # store the result as a new column
fat = food_info['Lipid_Tot_(g)']
normalized_fat = fat / fat.max()
food_info['Normalized_Fat'] = normalized_fat
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

# Sorting a DataFrame by one column.
print(food_info['Sodium_(mg)'])
print('_________________________________________')
# sort_values sorts ascending by default; inplace=True reorders food_info
# itself instead of returning a new DataFrame.
food_info.sort_values(by='Sodium_(mg)', inplace=True)
print(food_info['Sodium_(mg)'])
print('_________________________________________')
# ascending=False switches to descending order.
food_info.sort_values(by='Sodium_(mg)', inplace=True, ascending=False)
print(food_info['Sodium_(mg)'])

# [/code] ```code   <- listing boundary markup carried over from the original post

import pandas as pd
import numpy as np
import os

titanic_survival_site = r"D:\PYTHON35\idle\database\pandas\titanic_train.csv"
pwd = os.getcwd()
# Change into the data folder, then read the CSV by its bare file name.
titanic_dir, titanic_csv = os.path.split(titanic_survival_site)
os.chdir(titanic_dir)
titanic_survival = pd.read_csv(titanic_csv)
print(type(titanic_survival))  # <class 'pandas.core.frame.DataFrame'>
print(titanic_survival.head())
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

# Handling missing values in the data set.
age = titanic_survival['Age']
print(age.loc[0:10])  # missing entries are shown as NaN
# pd.isnull returns a boolean Series: True where the value is missing.
age_is_null = pd.isnull(age)
print(age_is_null)
age_null_true = age[age_is_null]  # only the missing entries
print(len(age_null_true))
print('_________________________________________')

# A naive mean is poisoned by the missing values and comes out as nan.
mean_age = sum(titanic_survival['Age']) / len(titanic_survival['Age'])
print(mean_age)
# Keep only the rows whose age is present (~ negates the boolean mask).
good_ages = titanic_survival['Age'][~age_is_null]
print(good_ages)
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)
print('_________________________________________')

# The built-in Series.mean skips missing values automatically.
correct_mean_age = titanic_survival['Age'].mean()
print(correct_mean_age)
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

# Average fare for each passenger class.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    # Rows belonging to passengers travelling in this class.
    pclass_rows = titanic_survival[titanic_survival['Pclass'] == this_class]
    pclass_fares = pclass_rows['Fare']
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
print(fares_by_class)
print('_________________________________________')

# pivot_table aggregates `values` for each distinct `index` entry; aggfunc
# chooses the aggregation and defaults to the mean. The aggregation is named
# by string because recent pandas releases emit a FutureWarning when the
# NumPy callables np.mean / np.sum are passed here.
passenger_survival = titanic_survival.pivot_table(
    index='Pclass', values='Survived', aggfunc='mean')
print(passenger_survival)
# Passing a list as `values` aggregates several columns at once.
port_stats = titanic_survival.pivot_table(
    index='Embarked', values=['Fare', 'Survived'], aggfunc='sum')
print(port_stats)
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

# Discarding incomplete data.
# axis=1 drops every COLUMN that contains at least one missing value.
drop_na_columns = titanic_survival.dropna(axis=1)
# axis=0 with subset drops every ROW that has a missing 'Age' or 'Sex'.
new_titanic_survival = titanic_survival.dropna(axis=0, subset=['Age', 'Sex'])
print(new_titanic_survival)
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

# loc[row_label, column_name] reads a single cell.
row_index_83_age = titanic_survival.loc[83, 'Age']
print(row_index_83_age)
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

# After sorting, rows keep their old index labels; reset_index renumbers them.
new_titanic_survival = titanic_survival.sort_values('Age', ascending=False)
print(new_titanic_survival[0:10])
# drop=True discards the old labels instead of keeping them as a column.
titanic_reindex = new_titanic_survival.reset_index(drop=True)
print(titanic_reindex.iloc[0:10])
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")


# Applying user-defined functions to a DataFrame.
def hundredth_row(column):
    """Return the item at position 99 (the hundredth item) of ``column``."""
    return column.iloc[99]


# DataFrame.apply calls the function once per column by default.
# The result gets its own name so that the function is not overwritten.
hundredth_row_values = titanic_survival.apply(hundredth_row)
print(hundredth_row_values)
print('_________________________________________')


def not_null_count(column):
    """Return the number of MISSING values in ``column``.

    NOTE: despite the name, this counts the null entries, not the
    non-null ones; the name is kept unchanged for existing callers.
    """
    column_null = pd.isnull(column)
    null = column[column_null]
    return len(null)


column_null_count = titanic_survival.apply(not_null_count)
print(column_null_count)
print('_________________________________________')


def which_class(row):
    """Map the 'Pclass' value of ``row`` to a readable label.

    Returns 'Unknown' when the value is missing, 'First Class',
    'Second Class' or 'Third Class' for 1, 2 and 3, and None for any
    other value.
    """
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return 'Unknown'
    if pclass == 1:
        return 'First Class'
    if pclass == 2:
        return 'Second Class'
    if pclass == 3:
        return 'Third Class'
    return None  # any other class value has no label


# axis=1 makes apply call the function once per row.
classes = titanic_survival.apply(which_class, axis=1)
print(classes)

# [/code]   <- listing boundary markup carried over from the original post
#
# The three main pandas data structures:
#   Series    -- a collection of values (the data elements); supports the
#                float, int, bool, datetime, timedelta, category and object
#                dtypes
#   DataFrame -- a collection of Series
#   Panel     -- a collection of DataFrames
#                (NOTE: Panel has been removed from current pandas releases)
#
# ```code   <- listing boundary markup carried over from the original post

import pandas as pd
import os

fandango_site = r"D:\PYTHON35\idle\database\pandas\fandango_score_comparison.csv"
pwd = os.getcwd()
# Change into the data folder, then read the CSV by its bare file name.
os.chdir(os.path.dirname(fandango_site))
fandango = pd.read_csv(os.path.basename(fandango_site))
print(type(fandango))
series_film = fandango['FILM']  # film-title column
print(series_film[0:5])  # printed as "index  title" pairs
series_rt = fandango['RottenTomatoes']  # Rotten Tomatoes score column
print(series_rt[0:5])
print('_________________________________________')
# A Series can use any column as its index instead of the default numbering.
from pandas import Series
import numpy as np

# .values is an attribute that exposes the data as a numpy.ndarray.
film_names = series_film.values
print(type(film_names))
print(film_names)
rt_scores = series_rt.values
print(rt_scores)
# Build a Series of scores that is indexed by film title.
series_custom = Series(rt_scores, index=film_names)
print(series_custom)
print('_____________')
# Look entries up by film title.
print(series_custom[['Ant-Man (2015)', 'The Water Diviner (2015)']])
print('_____________')
# Positional slicing still works when the index holds titles.
fiveten = series_custom[5:10]
print(fiveten)
print('_____________')
original_index = series_custom.index.tolist()  # titles as a plain list
sorted_index = sorted(original_index)  # ascending alphabetical order
# reindex rearranges the Series to follow the given index order.
sorted_by_index = series_custom.reindex(sorted_index)
print(sorted_by_index)
print('_____________')
sc2 = series_custom.sort_index()  # sort by index (title)
print(sc2[0:10])
sc3 = series_custom.sort_values()  # sort by value (score)
print(sc3[0:10])
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

# The data behind a Series is a NumPy array: <class 'numpy.ndarray'>
print(type(series_custom.values))
# NumPy functions can therefore be applied to a Series directly.
print(np.add(series_custom, series_custom))
print(np.sin(series_custom))
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

# Filtering with boolean masks.
criteria_one = series_custom > 50
criteria_two = series_custom < 75
both_criteria = series_custom[criteria_one & criteria_two]
print(both_criteria)
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

# Arithmetic between two Series that share the same index.
rt_critics = Series(fandango['RottenTomatoes'].values, index=fandango['FILM'])
re_users = Series(fandango['RottenTomatoes_User'].values, index=fandango['FILM'])
rt_mean = (rt_critics + re_users) / 2
print(rt_mean)
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

# [/code] ```code   <- listing boundary markup carried over from the original post

# Using a column as the index of a DataFrame.
import pandas as pd
import numpy as np  # needed for np.std below when this listing runs on its own
import os

fandango_site = r"D:\PYTHON35\idle\database\pandas\fandango_score_comparison.csv"
pwd = os.getcwd()
# Change into the data folder, then read the CSV by its bare file name.
os.chdir(os.path.dirname(fandango_site))
fandango = pd.read_csv(os.path.basename(fandango_site))
print(type(fandango))
# set_index makes 'FILM' the index; drop=False also keeps it as a regular
# column, so the titles stay available among the values.
fandango_films = fandango.set_index('FILM', drop=False)
print(fandango_films.index)
movies = ['Ant-Man (2015)', 'The Water Diviner (2015)']
print(fandango_films.loc[movies])  # look rows up by film title
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

# Combining NumPy with a DataFrame.
types = fandango_films.dtypes
print(types)
# Names of the columns whose dtype is float64.
float_columns = types[types.values == 'float64'].index
float_df = fandango_films[float_columns]
# np.std is the standard deviation (not the variance); computed per column.
deviations = float_df.apply(lambda x: np.std(x))
print(deviations)
# NOTE(review): 'Metacritic_user_nom' is spelled as in the original code;
# confirm it against the CSV header.
rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']]
# axis=1 computes the standard deviation across each row instead.
rowdeviation = rt_mt_user.apply(lambda x: np.std(x), axis=1)
print(rowdeviation)