1. Environment Setup

```bash
pip install lxml
```
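A quick way to confirm the install succeeded is to import the library and print its version:

```python
from lxml import etree  # an ImportError here means the install failed

print(etree.LXML_VERSION)  # version tuple, e.g. (4, 9, 3, 0)
```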
2. Parsing Principles
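In short, lxml parsing is a three-step workflow: instantiate an etree object and load the page source into it, call the object's `xpath` method with an XPath expression to locate the target tags, then pull out text with `/text()` or attribute values with `/@attr`. A minimal sketch (the HTML string here is made up for illustration):

```python
from lxml import etree

# Made-up page source standing in for a downloaded page
html = '<html><body><ul><li><a href="/a.htm">first</a></li></ul></body></html>'

tree = etree.HTML(html)                # 1. load the source into an etree object
a_list = tree.xpath('//ul/li/a')       # 2. locate tags with an XPath expression
print(a_list[0].xpath('./text()')[0])  # 3. extract text         -> first
print(a_list[0].xpath('./@href')[0])   #    extract an attribute -> /a.htm
```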
3. Case Studies
- Project requirement: parse new-home listing data from 房天下 (fang.com)
```python
import requests
import os
from lxml import etree
import csv

if __name__ == '__main__':
    url = 'https://huizhou.newhouse.fang.com/house/s/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
    }
    if not os.path.exists('./fangtianxiaLibs'):
        os.makedirs('./fangtianxiaLibs')
    response = requests.get(url=url, headers=headers)
    # Set the response encoding by hand
    response.encoding = 'utf-8'
    page_text = response.text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@id="newhouse_loupai_list"]/ul/li')
    # Scraped records are collected here
    datas = []
    for li in li_list:
        try:
            # Link to the listing's detail page
            detail_url = 'https:' + li.xpath('.//div[@class="nlcd_name"]/a/@href')[0]
            detail_text = requests.get(url=detail_url, headers=headers).text
            # Swap the URL suffix to reach the full-detail page
            detail_url_new = detail_url.replace('.htm', '/housedetail.htm')
            detail_tree = etree.HTML(detail_text)
            # Title and (average) price on the second-level page
            title = detail_tree.xpath('//div[@class="information"]//div[@class="tit"]/h1/strong/text()')[0]
            price = ''.join(detail_tree.xpath(
                '//div[@class="information_li mb5"]/div[@class="inf_left fl mr10"]/h3/text()'
                ' | //div[@class="information_li mb5"]/div[@class="inf_left fl mr10"]/span/text()'
                ' | //div[@class="information_li mb5"]/div[@class="inf_left fl mr10"]/text()'
            )).strip('\n \t')
            # Third request: the full-detail page
            detail_text_new = requests.get(url=detail_url_new, headers=headers).text
            tree_new = etree.HTML(detail_text_new)
            # Surrounding-facility blocks on the full-detail page
            config_list = tree_new.xpath('//div[@id="Configuration"]')
            for config in config_list:
                zhoubian = ''.join(config.xpath('./h3/text()')).strip('\n\t\r ')
                jiaotong = ''.join(config.xpath(
                    './ul[@class="sheshi_zb"]/li/span/text()'
                    '|./ul[@class="sheshi_zb"]/li[@class="jiaotong_color"]/text()'
                )).strip('\n\t\r ')
                qita = ''.join(config.xpath(
                    './ul[@class="sheshi_zb"]/li/span/text()'
                    '|./ul[@class="sheshi_zb"]/li/text()'
                )).strip('\n\t\r ')
                desc = zhoubian + ':' + jiaotong + ':' + qita + '\n'
                datas.append({'title': title, 'desc': desc, 'price': price})
        except Exception as msg:
            # Skip listings whose pages do not match the expected layout
            # print('error: {}'.format(msg))
            continue
    print(datas)
    # Persist everything as one CSV file
    fileName = './fangtianxiaLibs/fangtianxia.csv'
    title_header = ['title', 'desc', 'price']
    with open(fileName, 'w', encoding='utf-8', newline='') as fp:
        writer = csv.DictWriter(fp, title_header)
        writer.writeheader()
        writer.writerows(datas)
```
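To sanity-check the result, the CSV written above can be read back with `csv.DictReader` (the path matches the `fileName` used in the script):

```python
import csv

# Read the scraped records back and spot-check a few fields
with open('./fangtianxiaLibs/fangtianxia.csv', encoding='utf-8') as fp:
    for row in csv.DictReader(fp):
        print(row['title'], row['price'])
```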
- Project requirement: parse the image data at http://pic.netbian.com/4kmeinv/
```python
import requests
from lxml import etree

url = 'http://pic.netbian.com/4kmeinv/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
# Check the encoding requests guessed for the page
print(response.encoding)
page_text = response.text
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@class="slist"]/ul/li')
for li in li_list:
    img_url = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
    img_name = li.xpath('./a/img/@alt')[0]
    # The page is GBK but requests decoded it as ISO-8859-1,
    # so re-encode and decode to repair the garbled name
    img_name = img_name.encode('iso-8859-1').decode('gbk')
    print(img_url, img_name)
```

- Project requirement: parse out all the city names at https://www.aqistudy.cn/historydata/
```python
import requests
from lxml import etree

url = 'https://www.aqistudy.cn/historydata/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
# Check the encoding requests guessed for the page
print(response.encoding)
page_text = response.text
tree = etree.HTML(page_text)
# The two branches overlap (ul//li already matches every ul/li), but the
# XPath union de-duplicates results, so each city appears only once
li_list = tree.xpath('//div[@class="bottom"]/ul/li | //div[@class="bottom"]/ul//li')
for li in li_list:
    city_name = li.xpath('./a/text()')[0]
    city_url = 'https://www.aqistudy.cn/historydata/' + li.xpath('./a/@href')[0]
    print(city_name, city_url)
```
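The `|` in the expression above is the XPath union operator: it merges two node-sets and drops duplicates, which is why the overlapping branches do not double-count cities. A self-contained illustration (the XML string is made up):

```python
from lxml import etree

# Made-up document: one li directly under a ul, one nested deeper
doc = etree.fromstring('<div><ul><li><a>hot</a></li></ul><ul><div><li><a>all</a></li></div></ul></div>')
# //ul/li/a matches only 'hot'; //ul//li/a matches both;
# the union merges the node-sets and de-duplicates, so 'hot' appears once
print(doc.xpath('//ul/li/a/text() | //ul//li/a/text()'))  # ['hot', 'all']
```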
- Project requirement: download the resume-template images from https://sc.chinaz.com/
```python
import requests
from lxml import etree
import os

# Create the output folder
if not os.path.exists('./jianliLibs'):
    os.makedirs('./jianliLibs')

# Level one: the portal page
url = 'https://sc.chinaz.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
}
response_text = requests.get(url=url, headers=headers).text
# Parse the portal page
tree = etree.HTML(response_text)

# Walk the template listing pages and download every resume image
def page_index(latest):
    for index in range(1, latest + 1):  # include the last requested page
        # Resume-template URL from the portal nav; page 1 has no index_ suffix
        base = 'https://sc.chinaz.com' + tree.xpath('//div[@class="nav"]//li[@class="nos no3"]/a/@href')[3]
        if index == 1:
            muban_url = base
        else:
            muban_url = base + 'index_{}.html'.format(index)
        # Level two: the template listing page
        response = requests.get(muban_url, headers=headers)
        # Set the response encoding by hand
        response.encoding = 'utf-8'
        muban_text = response.text
        jianli_tree = etree.HTML(muban_text)
        # URLs of the individual resume pages
        jianli_url_list = jianli_tree.xpath('//div[@class="main_list jl_main"]//a/@href')
        for jianli_url in jianli_url_list:
            jianli_url = 'https:' + jianli_url
            # Level three: the resume detail page
            jianli_detail = requests.get(jianli_url, headers=headers).text
            detail_tree = etree.HTML(jianli_detail)
            img_src_list = detail_tree.xpath('//div[@class="show_warp jl_warp clearfix"]//img/@src')
            for img_src in img_src_list:
                img_src = 'https:' + img_src
                img_src_content = requests.get(img_src, headers=headers).content
                # Use the second-to-last URL segment as the image name
                imgName = img_src.split('/')[-2]
                imgPath = './jianliLibs/' + imgName + '.jpg'
                # Persist to disk
                with open(imgPath, 'wb') as fp:
                    fp.write(img_src_content)
                print('Resume ' + imgName + ' downloaded!')

if __name__ == '__main__':
    try:
        values = int(input('How many listing pages to scrape? '))
        page_index(values)
    except Exception as msg:
        print('Invalid input: {}'.format(msg))
```
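One practical refinement, sketched below on the assumption that occasional timeouts are worth retrying: wrap the repeated `requests.get` calls in a small helper that retries and pauses between attempts, so one flaky response does not abort a whole page of downloads. The helper name and parameters are illustrative, not part of requests:

```python
import time
import requests

def polite_get(url, headers, retries=3, delay=1.0):
    """Hypothetical helper: retry a GET a few times, pausing between attempts."""
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()  # treat HTTP errors as failures worth retrying
            return resp
        except requests.RequestException:
            time.sleep(delay)  # back off before the next attempt
    raise RuntimeError('all {} attempts failed for {}'.format(retries, url))
```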