# Scrape dytt89.com ("电影天堂"):
#   1. locate the target section on the front page,
#   2. extract every 2022 movie's sub-page URL,
#   3. fetch each sub-page and pull the poster image, translated title and
#      magnet download link into a CSV file.
import re
import requests
import csv

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
}
url = "https://dytt89.com/"

# timeout added so a stalled server cannot hang the script forever.
requ = requests.get(url, headers=header, timeout=10)
# This site serves gb2312; decoding as utf-8 produces mojibake.
requ.encoding = "gb2312"

# obj1: isolate the <ul> of the section; obj2: pull link + name of each
# 2022 entry; obj3: pull image, translated title and magnet link from a
# movie sub-page.
obj1 = re.compile(r'<span style="float:left;">综艺&动漫.*?<ul>(?P<ul>.*?)</ul>', re.S)
obj2 = re.compile(r"<li><a href='(?P<link>.*?)' title=.*?2022年(?P<name>.*?)</a><span>", re.S)
obj3 = re.compile(
    r'<img alt="" src="(?P<image>.*?)" style=.*?译 名 (?P<tit>.*?)<br />'
    r'.*?<td style="WORD-WRAP:.*?<a href="(?P<link2>.*?)">magnet',
    re.S,
)

# Collect the absolute sub-page URLs first.
herf_list = []
for sec in obj1.finditer(requ.text):
    ul = sec.group("ul")
    for item in obj2.finditer(ul):  # renamed: the original shadowed `i`
        # The href is a relative path like "/i/12345.html"; strip the
        # leading slash because `url` already ends with one.
        herf = url + item.group("link").strip("/")
        herf_list.append(herf)
        print(item.group("name"))

# BUG FIX: the CSV file was opened but never closed, so buffered rows could
# be lost and the handle leaked. A context manager flushes and closes it
# even if a request raises.
with open("电影天堂.csv", mode="a", encoding="utf-8", newline='') as f:
    csvwriter = csv.writer(f)
    for page_url in herf_list:
        requst = requests.get(page_url, headers=header, timeout=10)
        requst.encoding = "gb2312"  # sub-pages are gb2312 as well
        for s in obj3.finditer(requst.text):
            print(s.group("tit"))
            # groupdict() preserves pattern order: image, tit, link2.
            csvwriter.writerow(s.groupdict().values())
print("over")
使用 Python 先爬取页面列表中的子页面链接,然后根据这些子页面链接,依次爬取电影天堂每部电影的下载链接和图片地址并存储到 CSV 文件中
结果中出现繁体字,是因为这些子页面本身使用的就是繁体的表示方法
根据这一思路,我爬取了b站所有动漫的子页面的简介,下面附上代码
# Scrape the introduction ("简介") of every anime on bilibili:
#   1. page through the pgc season-index API (pages 1..162, 20 per page),
#   2. collect each season's sub-page link from the JSON response,
#   3. fetch each sub-page and pull its title and description (from the
#      og:title / description meta tags) into a CSV file.
import requests
import re
import csv

# Invariants hoisted out of the loop: the original rebuilt the headers and
# recompiled both regexes on every one of the 162 iterations.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
}
url = "https://api.bilibili.com/pgc/season/index/result/#"
# obj1: extract each "link" field from the API's JSON text.
obj1 = re.compile(r'"link":"(?P<link>.*?)","media_id"', re.S)
# obj2: extract title and description from a season sub-page's meta tags.
obj2 = re.compile(r'<meta property="og:title" content="(?P<title>.*?)"><meta property.*?:image" content=".*?"><meta name=".*?itemprop="description" content="(?P<jianjie>.*?)"><meta it', re.S)

# BUG FIX: the CSV file was re-opened in append mode on every loop pass and
# never closed — 162 leaked handles and possibly unflushed rows. Open it
# once, and let the context manager close it.
with open("动漫简介.csv", mode="a", encoding="utf-8", newline='') as f:
    csvwriter = csv.writer(f)
    # range replaces the manual `wq = 1 / while wq < 163 / wq = wq + 1`.
    for wq in range(1, 163):
        dat = {
            "season_version": "-1",
            "spoken_language_type": "-1",
            "area": "-1",
            "is_finish": "-1",
            "copyright": "-1",
            "season_status": "-1",
            "season_month": "-1",
            "year": "-1",
            "style_id": "-1",
            "order": "4",
            "st": "1",
            "sort": "0",
            "page": f"{wq}",  # only the page number changes per request
            "season_type": "1",
            "pagesize": "20",
            "type": "1",
        }
        requ = requests.get(url, headers=header, params=dat, timeout=10)

        link_list = []
        for m in obj1.finditer(requ.text):
            dis = m.group("link")
            print(dis)
            link_list.append(dis)

        for j in link_list:
            print(j)
            # FIX: the API query params were mistakenly forwarded to the
            # HTML sub-page request; they belong only to the index API.
            requ1 = requests.get(j, headers=header, timeout=10)
            for k in obj2.finditer(requ1.text):
                print(k.group("title"))
                print(k.group("jianjie"))
                csvwriter.writerow(k.groupdict().values())
爬取到的动漫列表是按照评分的顺序排列的