import pandas as pd

# Build a Series from a plain list; pandas assigns a default 0..n-1 index.
p = pd.Series([1, 2, 3])
print(p)
# Output:
# 0    1
# 1    2
# 2    3
# dtype: int64
输出时,左边为索引内容,可以自己指定索引
import numpy as np
import pandas as pd

# Build a Series with explicit index labels instead of the default 0..n-1.
a = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
print(a)
# Output:
# a    1
# b    2
# c    3
# dtype: int64
字典创建Series
import pandas as pd

# Build a Series from a dict: the keys become the index labels.
dic = {
    "name": "小明",
    "age": 18,
}
a = pd.Series(dic)
print(a)
# Output:
# name    小明
# age     18
# dtype: object
与Numpy相同,在Series中也可以修改数据类型
import pandas as pd

# Convert the element type of a Series with astype.
a = pd.Series([1, 2, 3]).astype("float")
print(a)
# Output:
# 0    1.0
# 1    2.0
# 2    3.0
# dtype: float64
import numpy as np
import pandas as pd

a = pd.Series(np.linspace(1, 10, 10))
b = pd.Series([1, 2, 3], index=["a", "b", "c"])
print(a)
print(b)
# Output:
# 0     1.0
# 1     2.0
# 2     3.0
# 3     4.0
# 4     5.0
# 5     6.0
# 6     7.0
# 7     8.0
# 8     9.0
# 9    10.0
# dtype: float64
# a    1
# b    2
# c    3
# dtype: int64

# Look up a single value by index label.
print(a[1], b["a"])
# Output:
# 2.0 1

# Select several non-contiguous rows by passing a list of labels.
print(a[[1, 2]], b[["a", "c"]])
# Output:
# 1    2.0
# 2    3.0
# dtype: float64 a    1
# c    3
# dtype: int64

# Boolean indexing on a Series.
print(a[a > 6])
# Output:
# 6     7.0
# 7     8.0
# 8     9.0
# 9    10.0
# dtype: float64

# Get the index of a Series.
print(a.index)
# Output:
# RangeIndex(start=0, stop=10, step=1)

# Get the underlying values of a Series.
print(a.values)
# Output:
# [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]

# clip and where on a Series.
# clip behaves essentially like its numpy counterpart.
print(a.clip(5, 8))
# Output:
# 0    5.0
# 1    5.0
# 2    5.0
# 3    5.0
# 4    5.0
# 5    6.0
# 6    7.0
# 7    8.0
# 8    8.0
# 9    8.0
# dtype: float64

# where differs slightly from numpy's (not demonstrated here).
pass
pandas拥有很强的数据兼容功能,可以读取csv、sql、json、Excel等格式的数据,这里以json与csv为例
import pandas as pd

# df1 = pd.read_csv("file path")

# douban2.json was produced by the author's scraper and stores one JSON
# object per line, so lines=True is needed to parse it line by line.
df2 = pd.read_json('../data/douban2.json', lines=True)
json文件如下
{"title": "肖申克的救赎", "action": "[可播放]", "score": "9.7", "view": "2338672人评价", "content": "希望让人自由。"} {"title": "[可播放]", "action": "[可播放]", "score": "9.6", "view": "1739533人评价", "content": "风华绝代。"} {"title": "霸王别姬", "action": "[可播放]", "score": "9.5", "view": "1760844人评价", "content": "一部美国近现代史。"} {"title": "[可播放]", "action": "[可播放]", "score": "9.4", "view": "1937156人评价", "content": "怪蜀黍和小萝莉不得不说的故事。"} {"title": "阿甘正传", "action": "[可播放]", "score": "9.4", "view": "1722688人评价", "content": "失去的才是永恒的。 "} {"title": "[可播放]", "action": "[可播放]", "score": "9.5", "view": "1084302人评价", "content": "最美的谎言。"} {"title": "这个杀手不太冷", "action": "[可播放]", "score": "9.4", "view": "1838449人评价", "content": "最好的宫崎骏,最好的久石让。 "} {"title": "泰坦尼克号", "action": "[可播放]", "score": "9.5", "view": "899104人评价", "content": "拯救一个人,就是拯救整个世界。"} {"title": "[可播放]", "action": "[可播放]", "score": "9.3", "view": "1701589人评价", "content": "诺兰给了我们一场无法盗取的梦。"} {"title": "美丽人生", "action": "[可播放]", "score": "9.4", "view": "1168718人评价", "content": "永远都不能忘记你所爱的人。"} {"title": "[可播放]", "action": "[可播放]", "score": "9.3", "view": "1374530人评价", "content": "爱是一种力量,让我们超越时空感知它的存在。"} {"title": "千与千寻", "action": "[可播放]", "score": "9.3", "view": "1289654人评价", "content": "如果再也不能见到你,祝你早安,午安,晚安。"} {"title": "[可播放]", "action": "[可播放]", "score": "9.3", "view": "1382478人评价", "content": "每个人都要走一条自己坚定了的路,就算是粉身碎骨。 "} {"title": "辛德勒的名单", "action": "[可播放]", "score": "9.2", "view": "1553658人评价", "content": "英俊版憨豆,高情商版谢耳朵。"} {"title": "[可播放]", "action": "[可播放]", "score": "9.3", "view": "1093337人评价", "content": "小瓦力,大人生。"} {"title": "盗梦空间", "action": "[可播放]", "score": "9.3", "view": "1076781人评价", "content": "天籁一般的童声,是最接近上帝的存在。 "} {"title": "[可播放]", "action": "[可播放]", "score": "9.2", "view": "1256552人评价", "content": "一生所爱。"} {"title": "忠犬八公的故事", "action": "[可播放]", "score": "9.2", "view": "1518499人评价", "content": "迪士尼给我们营造的乌托邦就是这样,永远善良勇敢,永远出乎意料。"} {"title": "[可播放]", "action": "[可播放]", "score": "9.2", "view": "1044286人评价", "content": "香港电影史上永不过时的杰作。"} 
{"title": "星际穿越", "action": "[可播放]", "score": "9.3", "view": "763228人评价", "content": "我们一路奋战不是为了改变世界,而是为了不让世界改变我们。"} {"title": "[可播放]", "action": "[可播放]", "score": "9.3", "view": "764229人评价", "content": "千万不要记恨你的对手,这样会让你失去理智。"} {"title": "楚门的世界", "action": "[可播放]", "score": "9.1", "view": "1248441人评价", "content": "平民励志片。 "} {"title": "[可播放]", "action": "[可播放]", "score": "9.2", "view": "1041278人评价", "content": "人人心中都有个龙猫,童年就永远不会消失。"}
我们想要把数据提取到由字典组成的列表中,可以使用如下方法
# A DataFrame has no tolist(); go through .values to get a list of row lists.
data = df2.values.tolist()
data_list = []
for row in data:
    # NOTE(review): fields are picked by column position. With the sample JSON
    # shown in this document (title, action, score, view, content), index 2 is
    # the score and index 4 is the content, so the "count"/"view" keys may not
    # hold what their names suggest -- confirm the intended columns.
    record = {}
    record["title"] = row[0]
    record["count"] = row[2]
    record["view"] = row[4]
    data_list.append(record)

# The list of dicts can also be turned back into a DataFrame to finish the
# extraction.
data_list = pd.DataFrame(data_list)
import pandas as pd
import numpy as np

# index sets the row labels, columns sets the column labels.
d = pd.DataFrame(np.arange(12).reshape(3, 4), index=list("abc"), columns=list("wxyz"))
print(d)
# Output:
#    w  x   y   z
# a  0  1   2   3
# b  4  5   6   7
# c  8  9  10  11

# Create a DataFrame from a dict of lists.
dic = {
    "name": ["小明", "小红"],
    "age": [18, 19],
}
d1 = pd.DataFrame(dic)
print(d1)
# Output (the dict keys become the column labels):
#   name  age
# 0   小明   18
# 1   小红   19

# Create a DataFrame from a list of dicts (one dict per row).
lis = [
    {"name": "小明", "age": 18},
    {"name": "小红", "age": 19},
]
d2 = pd.DataFrame(lis)
print(d2)
# Output:
#   name  age
# 0   小明   18
# 1   小红   19
import pandas as pd

# Load the scraped data.
df = pd.read_csv(r'E:\拜师\14100_HM数据科学库课件\DataAnalysis-master\day04\dogNames2.csv')

# Sort the rows by a column. Ascending by default; ascending=False sorts in
# descending order.
df = df.sort_values(by="Count_AnimalName", ascending=False)
print(df)

# Show the first five rows (head returns 5 rows when no count is given).
print(df.head(5))

# Show the last five rows (tail returns 5 rows when no count is given).
print(df.tail(5))

# Take the first twenty rows of df. Note the difference between displaying
# rows and selecting them.
print(df[:20])

# Take a single column of df.
print(df["Row_Labels"])

# A row slice and a column selection can be chained ...
print(df[:20]["Row_Labels"])

# ... but DataFrames are usually not sliced this way;
# use the iloc and loc indexers instead.
iloc 与 loc 的区别
import pandas as pd
import numpy as np

d = pd.DataFrame(np.arange(12).reshape(3, 4), index=list("abc"), columns=list("wxyz"))
print(d)

# --- iloc: position-based selection ---

# Rows from the third one onward (here only row "c"); a slice returns a
# DataFrame.
print(d.iloc[2:])

# All elements of the first and third rows.
print(d.iloc[[0, 2]])
# Output:
#    w  x   y   z
# a  0  1   2   3
# c  8  9  10  11

# All elements of the first column.
print(d.iloc[:, 0])
# Output:
# a    0
# b    4
# c    8
# Name: w, dtype: int64  (int32 where NumPy's default integer is 32-bit)

# All elements of the first and third columns.
print(d.iloc[:, [0, 2]])
# Output:
#    w   y
# a  0   2
# b  4   6
# c  8  10

# Rows two and three combined with columns three and four.
print(d.iloc[1:3, 2:4])
# Output:
#     y   z
# b   6   7
# c  10  11

# --- loc: label-based selection, otherwise used much like iloc ---

# Row "a" (the label must be passed as a string).
print(d.loc["a"])
# Output:
# w    0
# x    1
# y    2
# z    3

# Rows "a" and "c".
print(d.loc[["a", "c"]])
# Output:
#    w  x   y   z
# a  0  1   2   3
# c  8  9  10  11

# The other selection forms are analogous and are not repeated here.
# Note: a label slice includes its end point, so row "c" IS returned --
# unlike the half-open ranges used by positional slicing.
print(d.loc["a":"c"])
# Output:
#    w  x   y   z
# a  0  1   2   3
# b  4  5   6   7
# c  8  9  10  11