pandas 是基于 NumPy 的一种工具,该工具是为了解决数据分析任务而创建的。pandas 纳入了大量库和一些标准的数据模型,提供了高效地操作大型数据集所需的工具。pandas 提供了大量能使我们快速便捷地处理数据的函数和方法。你很快就会发现,它是使 Python 成为强大而高效的数据分析环境的重要因素之一。
import numpy as np
import pandas as pd
from pandas import Series

# ---------------------------------------------------------------------------
# Reading and inspecting data
# ---------------------------------------------------------------------------
food_info = pd.read_csv("food_info.csv")
print(type(food_info))
print(food_info.dtypes)
# help() prints the documentation itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
help(pd.read_csv)
print(food_info.head())
print(food_info.tail())

# Column labels (a pandas Index; .tolist() below turns it into a plain list).
print(food_info.columns)
# Dimensions of the data as (rows, columns).
print(food_info.shape)

# Select a single row by its index label.
print(food_info.loc[0])
# Select a range of rows with a label slice (.loc includes both endpoints).
print(food_info.loc[3:6])
# Select by column name: one name gives a Series, a list of names a DataFrame.
print(food_info["NDB_No"])
columns = ["Zinc_(mg)", "Copper_(mg)"]
zinc_copper = food_info[columns]

# Find every column whose name ends with "(g)".
col_names = food_info.columns.tolist()
print(col_names)
gram_columns = [name for name in col_names if name.endswith("(g)")]
gram_df = food_info[gram_columns]
print(gram_df.head(3))

# ---------------------------------------------------------------------------
# Arithmetic on columns
# ---------------------------------------------------------------------------
print(food_info["Iron_(mg)"])
div_1000 = food_info["Iron_(mg)"] / 1000
print(div_1000)

# Multiplying two columns multiplies them element by element.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
print(water_energy)

# Add a new column; the shape is printed before and after to show the change.
iron_grams = food_info["Iron_(mg)"] / 1000
print(food_info.shape)
food_info["Iron_(g)"] = iron_grams
print(food_info.shape)

# Aggregations such as max, min and mean are available on a column.
max_calories = food_info["Energ_Kcal"].max()

# Sorting: inplace=True reorders food_info itself instead of returning a new
# frame; ascending=False switches from ascending to descending order.
food_info.sort_values("Sodium_(mg)", inplace=True)
print(food_info["Sodium_(mg)"])
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)
print(food_info["Sodium_(mg)"])

# ---------------------------------------------------------------------------
# Titanic data: preprocessing example
# ---------------------------------------------------------------------------
titanic_survival = pd.read_csv("titanic_train.csv")
print(titanic_survival.head())

# Count the rows whose age is missing.
age = titanic_survival["Age"]
age_is_null = pd.isnull(age)
age_null_true = age[age_is_null]
age_null_count = len(age_null_true)
print(age_null_count)

# Deliberately naive mean: the built-in sum() propagates NaN, so when any age
# is missing the result is nan rather than a usable average.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)

# Same computation restricted to the rows where the age is present.
good_ages = titanic_survival["Age"][~age_is_null]
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)

# The ready-made Series.mean() gives the same answer in one call.
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)

# pivot_table relates one attribute to another: here the mean of "Survived"
# for each passenger class. The aggregation is named by string because recent
# pandas versions warn when NumPy callables such as np.mean are passed.
passenger_survival = titanic_survival.pivot_table(
    index="Pclass", values="Survived", aggfunc="mean"
)
print(passenger_survival)

# With no aggfunc given, pivot_table computes the mean.
passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_age)

# Sum of fares and of survivors for each embarkation port.
port_stats = titanic_survival.pivot_table(
    index="Embarked", values=["Fare", "Survived"], aggfunc="sum"
)
print(port_stats)

# Dropping missing values: axis=1 drops columns containing NaN; axis=0 with
# subset drops rows that have NaN in any of the listed columns.
drop_na_columns = titanic_survival.dropna(axis=1)
new_titanic_survival = titanic_survival.dropna(axis=0, subset=["Age", "Sex"])

# Look up a single cell by row label and column name.
row_index_83_age = titanic_survival.loc[83, "Age"]
# NOTE(review): the variable name says 1000 but the label looked up is 766.
row_index_1000_pclass = titanic_survival.loc[766, "Pclass"]
print(row_index_83_age)
print(row_index_1000_pclass)

# Sort by age, oldest first, then rebuild a 0..n-1 index (drop=True discards
# the old index instead of keeping it as a column).
new_titanic_survival = titanic_survival.sort_values("Age", ascending=False)
print(new_titanic_survival.head(10))
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed)


# ---------------------------------------------------------------------------
# Custom functions with DataFrame.apply
# ---------------------------------------------------------------------------
def hundredth_rwo(column):
    """Return the entry of ``column`` stored under index label 99.

    Passed to ``DataFrame.apply`` it is called once per column, yielding the
    hundredth row of a frame that has a default 0-based index.
    """
    return column.loc[99]


# Stored under a separate name so the function above is not overwritten by
# its own result.
hundredth_row = titanic_survival.apply(hundredth_rwo)
print('******', hundredth_row)


def not_null_count(column):
    """Return how many entries of ``column`` are null.

    NOTE: despite its name, this counts the missing values, not the present
    ones.
    """
    column_null = pd.isnull(column)
    return len(column[column_null])


column_null_count = titanic_survival.apply(not_null_count)


def generate_age_label(row):
    """Turn the continuous "Age" of ``row`` into a discrete label.

    Returns "unknown" when the age is missing, "minor" when it is below 18
    and "adult" otherwise.
    """
    age = row["Age"]
    if pd.isnull(age):
        return "unknown"
    if age < 18:
        return "minor"
    return "adult"


# axis=1 makes apply call the function once per row instead of per column.
age_labels = titanic_survival.apply(generate_age_label, axis=1)
print(age_labels)

# ---------------------------------------------------------------------------
# Movie score example: working with Series
# ---------------------------------------------------------------------------
fandango = pd.read_csv('fandango_score_comparison.csv')
series_film = fandango['FILM']
print(type(series_film))
print(series_film.head(5))
series_rt = fandango['RottenTomatoes']
print(series_rt.head(5))

# Build a Series of scores indexed by film name.
film_names = series_film.values
print(type(film_names))
rt_scores = series_rt.values
series_custom = Series(rt_scores, index=film_names)
# Label-based lookup of two films. The result is not stored or printed, so it
# is only visible in an interactive session.
series_custom[['Minions (2015)', 'Leviathan (2014)']]
# Positional slice; .iloc makes it explicit that 5:10 are positions, not labels.
fiveten = series_custom.iloc[5:10]
print(fiveten)
Series:一维数组,与Numpy中的一维array类似。二者与Python基本的数据结构List也很相近。Series如今能保存不同种数据类型,字符串、boolean值、数字等都能保存在Series中。
Time-Series:以时间为索引的Series。
DataFrame:二维的表格型数据结构。很多功能与R中的data.frame类似。可以将DataFrame理解为Series的容器。
Panel:三维的数组,可以理解为DataFrame的容器。(注意:Panel 已在 pandas 0.25 中被移除,新版本中不再可用。)
Panel4D:是像Panel一样的4维数据容器。
PanelND:拥有factory集合,可以创建像Panel4D一样N维命名容器的模块。