参考:https://www.runoob.com/pandas/pandas-tutorial.html
pip install pandas
Series 由索引(index)和一列数据(values)组成
pandas.Series( data, index, dtype, name, copy) import pandas as pd
import pandas as pd

# Series with the default integer index (0, 1, 2, ...).
print(pd.Series([3, 0.5, "liucd"]))

# Series with an explicit index; the second argument of pd.Series is `index`.
print(pd.Series(["i", "ii", "iii"], index=[3, 0.5, "liucd"]))

# Series built from a dict: the keys become the index labels.
print(pd.Series({"a1": "aa", "a2": "bb"}))

a = pd.Series({"a1": "aa", "a2": "bb"})
# Extract by label with [] and by position with .iloc. Plain a[1] on a
# string-labelled Series relies on a deprecated positional fallback.
print(a["a1"], a.iloc[1])
输出
3 i 0.5 ii liucd iii dtype: object aa bb
DataFrame表格型 (行\列 &索引)
# Load the CSV with its first line as the column header.
df = pd.read_csv("nba.csv")
# header=10 uses row 10 (0-based) as the header row; data starts after it.
# read_csv already returns a DataFrame, so no pd.DataFrame(...) wrapper is needed.
df1 = pd.read_csv("nba.csv", header=10)
# Excel files can be loaded the same way: df2 = pd.read_excel("名单.xlsx")
print(df, df1)
# Build a DataFrame from a dict of equal-length columns; `columns` fixes the
# column order. Missing prices use pd.NaT: NumPy has no `np.NaT` attribute
# and no `import numpy as np` is visible in this file, while pd.NaT
# reproduces the "NaT" cells shown in the documented output.
df = pd.DataFrame(
    {
        "id": [1001, 1002, 1003, 1004, 1005, 1006],
        "date": pd.date_range('20130102', periods=6),
        "city": ['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
        "age": [23, 44, 54, 32, 34, 32],
        "category": ['100-A', '100-B', '110-A', '110-C', '210-A', '130-F'],
        "price": [1200, pd.NaT, 2133, 5433, pd.NaT, 4432],
    },
    columns=['id', 'date', 'city', 'category', 'age', 'price'],
)
print(df)
# Build a DataFrame from a list of rows; `columns` names the fields.
# The missing ID number uses pd.NaT: NumPy has no `np.NaT` attribute and no
# `import numpy as np` is visible in this file, while pd.NaT reproduces the
# "NaT" cell shown in the documented output.
df2 = pd.DataFrame(
    [
        [1001, "liucd", 372832197612068535, 65, 135.5, "男", "山东", "高兴"],
        [1002, "jiangdab", 512750197406168531, 29, 120, "女", "四川", "高兴"],
        [1005, "xiaonz", 371321199009113651, 18, 98, "女", "山东", "高兴"],
        [1004, "liuyh", pd.NaT, 15, 108, "男", "山东", "高兴"],
    ],
    columns=["编号", "姓名", "身份证号码", "年龄", "体重", "性别", "省份", "情绪"],
)
print(df2)
Id date city category age price 0 1001 2013-01-02 Beijing 100-A 23 1200 1 1002 2013-01-03 SH 100-B 44 NaT 2 1003 2013-01-04 guangzhou 110-A 54 2133 3 1004 2013-01-05 Shenzhen 110-C 32 5433 4 1005 2013-01-06 shanghai 210-A 34 NaT 5 1006 2013-01-07 BEIJING 130-F 32 4432 编号 姓名 身份证号码 年龄 体重 性别 省份 情绪 0 1001 liucd 372832197612068535 65 135.5 男 山东 高兴 1 1002 jiangdab 512750197406168531 29 120.0 女 四川 高兴 2 1005 xiaonz 371321199009113651 18 98.0 女 山东 高兴 3 1004 liuyh NaT 15 108.0 男 山东 高兴
# The row labelled 3, returned as a Series (fields listed vertically).
row_as_series = df2.loc[3]
print(row_as_series)

# The row at position 3, returned as a one-row DataFrame (horizontal layout).
row_as_frame = df2.iloc[3:4]
print(row_as_frame)
Index(['liucd', 'jiangdb'], dtype='object')
# Select the single column "姓名" as a Series.
print(df2.loc[:, "姓名"])
0 liucd 1 jiangdab 2 xiaonz 3 liuyh Name: 姓名, dtype: object
# (number of rows, number of columns).
print("维度(行列)查看", df2.shape)

# First two rows and first three columns, selected by position.
print("区域 2行 3列", df2.iloc[0:2, 0:3])

# Rows at positions 1, 2, 4 and columns at positions 2, 5.
# NOTE(review): position 4 only exists once df2 has at least five rows; the
# frame as first constructed has four, so this presumably runs after the
# "new" row has been appended. Confirm the intended execution order.
row_positions = [1, 2, 4]
col_positions = [2, 5]
print("区域 1,2,4行,2,5列", df2.iloc[row_positions, col_positions])
区域 2行 3列 编号 姓名 身份证号码 0 1001 liucd 372832197612068535 1 1002 jiangdab 512750197406168531 区域 身份证号码 未成年 1 512750197406168531 N 2 371321199009113651 Y new 37132120611258530 1
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None". Print the label, then call it.
print("info")
df2.info()

# Per-column dtypes, then the dtype of one column.
print(df2.dtypes)
print(df2["身份证号码"].dtype)

# Boolean mask of missing cells.
print(df2.isnull())

# Underlying values as a NumPy array.
print(df2.values)

# All column labels, then the label at position 2.
print("列标签", df2.columns)
print("单一列标签", df2.columns[2])

# The calls below return new objects; df2 itself is left unchanged.
print(df2.fillna(value=0))
print(df2.rename(columns={"姓名": "性命"}))
print(df2["姓名"].replace("xiaonz", "xiaonz---2"))
df6 = pd.DataFrame(
    {
        "日期": ['2020/12/01', '2020/12/02', '20201226'],
        "销售额": [50, 30, 78],
    },
    index=["day1", "day2", "day3"],
)
# Normalise the date strings to datetimes. The column mixes two layouts
# ("YYYY/MM/DD" and "YYYYMMDD"); a single vectorised to_datetime call infers
# one format from the first value and rejects the rest on pandas 2.x, so
# each value is parsed on its own.
df6["日期"] = df6["日期"].map(pd.to_datetime)
print(df6)
日期 销售额 day1 2020-12-01 50 day2 2020-12-02 30 day3 2020-12-26 78
# Use the ID-number column as the row index (returns a new frame).
indexed_by_id_number = df2.set_index("身份证号码")
print(indexed_by_id_number)

# Move the current index back into a column and restore a 0..n-1 index.
with_default_index = df2.reset_index()
print(with_default_index)

# Rows ordered by the values of the "编号" column.
print("*按列 排序*", df2.sort_values(by="编号"))
# Insert a new column "未成年" at position 5, in place; one flag per existing row.
minor_flags = ["Y", "N", "Y", "Y"]
df2.insert(loc=5, column="未成年", value=minor_flags)
print("增加 未成年 列", df2)
# Append a row labelled "new". A list is matched to columns by position, so
# the values must follow the current column order, where "未成年" (inserted
# at position 5) comes right after "体重": weight 128 first, then the "Y"
# flag. The original listed these two the other way round.
df2.loc["new"] = [1008, "liuyi", 37132120611258530, 17, 128, "Y", "男", "上海", "特高"]
print("增加 刘亿 行", df2)
# Work on a copy so df2 keeps its original flags.
df3 = df2.copy()
# Assign through a single .loc[mask, column] call. Chained indexing
# (df3["未成年"][mask] = ...) can write to a temporary object, and under
# Copy-on-Write it never updates df3.
df3.loc[df3["年龄"] >= 18, "未成年"] = "成年"
df3.loc[df3["年龄"] < 18, "未成年"] = "未成年"
# Print the relabelled copy; the original printed the untouched df2.
print("判断是否成年", df3)
# Rows that are both under 18 and from Shandong. Each condition needs its own
# parentheses because & binds tighter than the comparisons.
minors_in_shandong = df2[(df2["年龄"] < 18) & (df2["省份"] == "山东")]
# The trailing ', "特别标记"' in the original makes bf4 a (DataFrame, str)
# tuple; that shape is kept but written explicitly.
bf4 = (minors_in_shandong, "特别标记")
# Label corrected to match the filter: age < 18 selects minors, not adults.
print("未成年+山东", bf4)
# Rows whose province equals "山东".
print("省份满足山东", df2[df2["省份"] == "山东"])

# Rows whose province is one of the listed values.
print("提取上海和四川的", df2[df2["省份"].isin(["上海", "四川"])])

# Column total of "年龄".
print("数据汇总 纵向", df2["年龄"].sum())

# Row totals. numeric_only=True restricts the sum to numeric columns; without
# it pandas 2.x tries to add the text columns to the numbers and raises.
print("数据汇总 横向", df2.sum(axis=1, numeric_only=True))

# Non-null count of every other column within each province group.
print(df2.groupby("省份").count())
CSV:以纯文本形式存储表格数据(数字和文本),字段之间用逗号分隔
# Read the CSV file as UTF-8 text.
data = pd.read_csv("nbb.csv", encoding="utf-8")

# Default repr; pandas may abbreviate large frames here.
print(data)

# to_string() renders every row and column.
full_text = data.to_string()
print(full_text)

# First ten rows only.
first_ten = data.head(n=10)
print(first_ten)
姓名 年龄 省份 0 liucd 28 shandong 1 jiangdb 18 sichuan 2 xiaonz 22 shandong
# Build a frame from rows of unequal length; the short last row is padded
# with missing values in the remaining columns.
data1 = pd.DataFrame(
    [
        ["liucd", 28, "shandong"],
        ["jiangdb", 18, "sichuan"],
        ["xiaonz", 22, "shandong"],
        ["哎"],
    ],
    columns=["姓名", "年龄", "省份"],
)
# Write the frame to disk; the index is written as the first column.
data1.to_csv("名单.csv")
print(data1)

# Summary of index, columns, non-null counts and dtypes. info() prints the
# report itself and returns None, so it is not wrapped in print().
data1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4 entries, 0 to 3 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 姓名 4 non-null object #多一个数字 1 年龄 3 non-null float64 2 省份 3 non-null object dtypes: float64(1), object(2) memory usage: 224.0+ bytes