Packet-capture analysis shows that the data is not loaded dynamically: the page is rendered statically on the server. We can therefore send a request straight to the page and read the data out of the returned HTML.
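A minimal sketch to confirm this, assuming a plain GET with a browser-like User-Agent is accepted (Douban tends to reject requests that carry none); `grid_view` is the class name of the list container identified below:

```python
import requests

# Placeholder desktop UA; Douban usually rejects requests that have
# no browser-like User-Agent at all.
resp = requests.get(
    "https://movie.douban.com/top250?start=0",
    headers={"User-Agent": "Mozilla/5.0"},
)
resp.encoding = "utf-8"
# If the list container already appears in the raw HTML, the page is static:
print("grid_view" in resp.text)  # expected: True
```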
Inspecting the page with the F12 developer tools shows that the data is stored in an `ol` tag whose class name is `grid_view`. Since that class name is unique on the page, we can use this node to locate our data, then walk its `li` children and extract the content.
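As a quick sanity check, a minimal sketch that locates the list purely by that class name and counts its entries:

```python
import requests
from lxml import etree

resp = requests.get("https://movie.douban.com/top250?start=0",
                    headers={"User-Agent": "Mozilla/5.0"})  # placeholder UA
resp.encoding = "utf-8"
tree = etree.HTML(resp.text)

# grid_view is unique on the page, so a class-based XPath suffices:
items = tree.xpath("//ol[@class='grid_view']/li")
print(len(items))  # expected: 25, one li per movie on the page
```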
""" 1. https://movie.douban.com/top250?start=0 2. https://movie.douban.com/top250?start=25 3. https://movie.douban.com/top250?start=50 n. https://movie.douban.com/top250?start=25*(n-1) """ urls = [https://movie.douban.com/top250?start=25*(i-1) for i in range(11)] # 其总共有250部电影
So we can either crawl with a plain for loop, generate all the links up front and visit them by popping them off a stack, or fetch the pages recursively.
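For illustration, a minimal sketch of the recursive variant (hypothetical `crawl` helper; the fetching and parsing body is elided, and the stack-based approach is what the code below actually uses):

```python
def crawl(start=0):
    """Fetch one page, then recurse into the next until all 250 are covered."""
    if start >= 250:
        return
    url = f"https://movie.douban.com/top250?start={start}"
    # ... fetch and parse `url` here ...
    crawl(start + 25)  # recurse into the next page

crawl()
```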
```python
# !/usr/bin/python3
# -*- coding: UTF-8 -*-
__author__ = "A.L.Kun"
__file__ = "123.py"
__time__ = "2022/7/6 10:19"

import requests, re  # import modules
from lxml import etree  # XPath parsing
from fake_useragent import UserAgent  # random request headers
import pandas as pd  # data-handling module

# Build every URL up front and store the list as a global. We consume it
# with pop() (stack order), so it is generated in reverse: the last element
# is start=0, which is therefore popped first.
urls = [f'https://movie.douban.com/top250?start={25*(i-1)}' for i in range(10, 0, -1)]

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'movie.douban.com',
    'Pragma': 'no-cache',
    'sec-ch-ua': '"Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'user-agent': UserAgent().random
}

lis_data = []  # container for the scraped records

# Quick check of the visiting order:
while urls:
    print(urls.pop())
```
```python
def get_tags(url):
    headers.update({'user-agent': UserAgent().random})  # re-randomize the user-agent on every request
    resp = requests.get(url, headers=headers)  # send the request
    resp.encoding = "utf-8"  # set the encoding
    tree = etree.HTML(resp.text)  # hand the page source to etree
    ol = tree.xpath('//*[@id="content"]/div/div[1]/ol/li')  # the li nodes inside the ol
    for li in ol:
        print(li)

get_tags("https://movie.douban.com/top250?start=0")
```
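Running this test should print 25 lxml element objects for the page, along the lines of `<Element li at 0x...>`, confirming that the XPath matches one node per movie.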
```python
def get_data(li):
    try:
        imgSrc = li.xpath(".//img/@src")[0].replace("webp", "jpg")  # image URL
    except IndexError:
        imgSrc = "image not found"
    title = li.xpath(".//img/@alt")[0]  # title
    detailUrl = li.xpath(".//div[@class='hd']/a/@href")[0]  # detail-page URL
    detail = li.xpath(".//div[@class='bd']/p[1]/text()")  # contains director, year and genre; we only want year and genre
    time = re.search(r"\d+", detail[1]).group()  # release year
    type_ = " ".join(re.findall(r"[\u4e00-\u9fa5]+", detail[1]))  # genre
    score = li.xpath(".//span[@class='rating_num']/text()")[0]  # rating
    try:
        quote = li.xpath(".//span[@class='inq']/text()")[0]  # tagline; a few movies have none
    except IndexError:
        quote = "no tagline yet!"
    # print(title, imgSrc, detailUrl, time, type_, score, quote)  # inspect the extracted fields
    lis_data.append({
        "title": title,
        "image_url": imgSrc,
        "detail_url": detailUrl,
        "year": time,
        "genre": type_,
        "score": score,
        "quote": quote
    })  # store the record in the prepared container for pandas; it could also be written to a database

# quick test
resp = requests.get("https://movie.douban.com/top250?start=25", headers=headers)
resp.encoding = "utf-8"
tree = etree.HTML(resp.text)  # hand the page source to etree
ol = tree.xpath('//*[@id="content"]/div/div[1]/ol/li')  # the li nodes
for li in ol:
    get_data(li)
print(lis_data)
```
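To make the two regular expressions concrete, here is what they extract from a typical detail line (the sample text is illustrative, trimmed of the surrounding whitespace found on the live page):

```python
import re

detail_line = "1994 / 美国 / 犯罪 剧情"  # illustrative sample of detail[1]

year = re.search(r"\d+", detail_line).group()                     # first run of digits
genre = " ".join(re.findall(r"[\u4e00-\u9fa5]+", detail_line))    # all runs of Chinese characters

print(year)   # 1994
print(genre)  # 美国 犯罪 剧情  (note: the country field is swept up as well)
```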
```python
def parse_data():
    df = pd.DataFrame(lis_data)
    new_df = df.dropna()  # drop rows with missing values
    # chart work and further analysis could also happen here; omitted
    # print(new_df)
    new_df.to_excel("./douban.xlsx", index=False)

parse_data()
```
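Note that writing an .xlsx file with `to_excel` requires an Excel engine such as openpyxl to be installed (`pip install openpyxl`); without one, pandas raises an ImportError.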
```python
# !/usr/bin/python3
# -*- coding: UTF-8 -*-
__author__ = "A.L.Kun"
__file__ = "123.py"
__time__ = "2022/7/6 10:19"

import logging
import requests, re  # import modules
from lxml import etree  # XPath parsing
from fake_useragent import UserAgent  # random request headers
import pandas as pd  # data-handling module

# A Logger constructed directly has no handlers, so INFO records would be
# silently dropped; getLogger plus basicConfig is the working idiom.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

# All URLs up front as a global; generated in reverse so that pop() visits start=0 first.
urls = [f'https://movie.douban.com/top250?start={25*(i-1)}' for i in range(10, 0, -1)]

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'movie.douban.com',
    'Pragma': 'no-cache',
    'sec-ch-ua': '"Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'user-agent': UserAgent().random
}

lis_data = []  # container for the scraped records

def get_data(li):
    try:
        imgSrc = li.xpath(".//img/@src")[0].replace("webp", "jpg")  # image URL
    except IndexError:
        imgSrc = "image not found"
    title = li.xpath(".//img/@alt")[0]  # title
    detailUrl = li.xpath(".//div[@class='hd']/a/@href")[0]  # detail-page URL
    detail = li.xpath(".//div[@class='bd']/p[1]/text()")  # contains director, year and genre; we only want year and genre
    time = re.search(r"\d+", detail[1]).group()  # release year
    type_ = " ".join(re.findall(r"[\u4e00-\u9fa5]+", detail[1]))  # genre
    score = li.xpath(".//span[@class='rating_num']/text()")[0]  # rating
    try:
        quote = li.xpath(".//span[@class='inq']/text()")[0]  # tagline; a few movies have none
    except IndexError:
        quote = "no tagline yet!"
    lis_data.append({
        "title": title,
        "image_url": imgSrc,
        "detail_url": detailUrl,
        "year": time,
        "genre": type_,
        "score": score,
        "quote": quote
    })  # store the record for pandas; it could also be written to a database

def get_tags(url):
    headers.update({'user-agent': UserAgent().random})  # re-randomize the user-agent on every request
    resp = requests.get(url, headers=headers)  # send the request
    resp.encoding = "utf-8"  # set the encoding
    tree = etree.HTML(resp.text)  # hand the page source to etree
    ol = tree.xpath('//*[@id="content"]/div/div[1]/ol/li')  # the li nodes inside the ol
    for li in ol:
        get_data(li)  # extract the fields
    log.info(f"{url}: page scraped")

def parse_data():
    df = pd.DataFrame(lis_data)
    new_df = df.dropna()  # drop rows with missing values
    # chart work and further analysis could also happen here; omitted
    new_df.to_excel("./douban.xlsx", index=False)  # to_excel no longer accepts an encoding argument in recent pandas

def main():
    while urls:
        get_tags(urls.pop())
    parse_data()

if __name__ == "__main__":
    main()
```
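Run as a script, this should produce douban.xlsx with up to 250 rows (dropna may discard incomplete records). To be gentler on the server, a short pause between pages can be added in main(), for example:

```python
import time

def main():
    while urls:
        get_tags(urls.pop())
        time.sleep(1)  # pause between pages to avoid hammering the server
    parse_data()
```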