Target site to crawl: https://sc.chinaz.com/jianli/free.html
Approach

Crawl the first five list pages, pull each resume's title and detail-page link with xpath, then follow each detail page to its first download link and save the archive locally. With the approach sorted out, straight to the code:
```python
# -*- coding: utf-8 -*-
# @Time : 2021/7/20 10:13
# @Author : ArthurHuang
# @File : 10_xpath解析案例_站长素材中免费简历模板爬取.py
# @Software : PyCharm
import os

import requests
from lxml import html

etree = html.etree  # newer lxml releases expose etree this way

if __name__ == "__main__":
    url = 'http://sc.chinaz.com/jianli/free_%d.html'
    for page in range(1, 6):  # crawl the first 5 pages, 20 resumes per page
        # UA spoofing: wrap the corresponding User-Agent in a dict
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
        }
        if page == 1:
            # page 1 uses a different URL from the other pages, so handle it separately
            new_url = 'http://sc.chinaz.com/jianli/free.html'
        else:
            new_url = url % page
        page_text = requests.get(url=new_url, headers=headers).text
        # instantiate an etree object from the page source
        tree = etree.HTML(page_text)
        # create a folder to hold the downloaded resumes
        if not os.path.exists('./jianliLibs'):
            os.mkdir('./jianliLibs')
        a_list = tree.xpath('//div[@id="container"]/div/a')
        for a in a_list:
            # resume title, used as the local file name
            all_titles = a.xpath('./img/@alt')[0] + '.zip'
            # generic fix for garbled Chinese in the response
            all_titles = all_titles.encode('iso-8859-1').decode('utf-8')
            # detail-page URL for each resume
            all_href = 'https:' + a.xpath('./@href')[0]
            response = requests.get(url=all_href, headers=headers)
            resume_data = response.text
            resumetree = etree.HTML(resume_data)
            # the "click to download" link on the detail page
            resume_download_list = resumetree.xpath('//div[@id="down"]/div[2]/ul/li[1]')
            for download in resume_download_list:
                all_downloads = download.xpath('./a/@href')[0]
                # request the download URL and save the resume archive locally
                resume_rar_page = requests.get(url=all_downloads, headers=headers).content
                resume_path = 'jianliLibs/' + all_titles
                with open(resume_path, 'wb') as fp:
                    fp.write(resume_rar_page)
                    print(all_titles, "downloaded!")
```
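A side note on the `encode('iso-8859-1').decode('utf-8')` trick: requests falls back to ISO-8859-1 when the server does not declare a charset, which is why the Chinese titles come back garbled. A minimal sketch of an alternative, assuming the same requests library, fixes the encoding once on the response object instead of re-encoding every extracted string:

```python
import requests

# Standalone sketch: fetch the list page and correct the charset up front
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get('https://sc.chinaz.com/jianli/free.html', headers=headers)
# Let requests sniff the real charset from the body; .text then decodes correctly,
# so titles no longer need the encode('iso-8859-1').decode('utf-8') round-trip
response.encoding = response.apparent_encoding
page_text = response.text
```

Either approach works; setting `response.encoding` just keeps the decoding concern in one place.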
All resumes fetched successfully.
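One last note for anyone rerunning this regularly: the bare `requests.get(...).content` call will happily save an error page as a .zip if the server misbehaves. A hypothetical hardening sketch (the helper name `download_file`, the timeout, and the one-second pause are my own choices, not from the original script):

```python
import time

import requests

def download_file(url: str, path: str, headers: dict) -> None:
    """Fetch one archive with a timeout, an HTTP status check, and a polite pause."""
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()  # fail loudly on 4xx/5xx instead of saving an error page
    with open(path, 'wb') as fp:
        fp.write(resp.content)
    time.sleep(1)  # brief pause between downloads to go easy on the server
```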