pip install lxml
from lxml import etree
1、将本地的html文件中的源码加载到etree对象中
etree.parse(filepath)
2、将互联网获取的源码加载到该对象中
etree.HTML(page_text)
"""Scrape second-hand housing listings from 58.com (Chongqing) and print
each listing's title, total price and per-square-meter average price."""
import requests
from lxml import etree
from fake_useragent import UserAgent

url = 'https://cq.58.com/ershoufang/'
headers = {
    'user-agent': UserAgent().random  # random UA to reduce anti-bot blocking
}
# timeout added so a stalled connection cannot hang the script forever
response = requests.get(url=url, headers=headers, timeout=10)
# fail fast on HTTP errors instead of silently parsing an error page
response.raise_for_status()
page_text = response.text
tree = etree.HTML(page_text)
# One <div> per listing card; xpaths below are relative to each card.
# NOTE(review): selectors depend on 58.com's current markup — verify if empty.
room_list = tree.xpath("//section[@class='list']/div/a/div[2]")
for room in room_list:
    title = room.xpath(".//div[@class='property-content-title']//h3/text()")
    price = room.xpath(
        ".//div[@class='property-price']//span[@class='property-price-total-num']/text()")
    avg = room.xpath(".//p[@class='property-price-average']/text()")
    # Guard against layout changes: skip cards missing any expected field
    # instead of crashing with IndexError on [0].
    if not (title and price and avg):
        continue
    print('标题: {0} 总价: {1}万元 均价: {2}元/m2'.format(title[0], price[0], avg[0]))
"""Crawl pic.netbian.com search results and save full-size image URLs to a file."""
from lxml import etree
import requests
from fake_useragent import UserAgent
import time
from urllib.parse import urljoin

# Site root; relative links scraped from result pages are resolved against it.
BASE_URL = 'https://pic.netbian.com/'
url = 'https://pic.netbian.com/e/search/result/'
ua = UserAgent()


def pic_down(page, searchid):
    """Scrape one search-result page of pic.netbian.com.

    Fetches result page ``page`` for search id ``searchid``, visits every
    picture detail page it links to, and appends each full-size image URL
    to the global file handle ``fp`` (one URL per line).

    Returns True on success, False when the search request is not HTTP 200.
    """
    param = {
        'page': page,
        'searchid': searchid
    }
    headers = {'User-Agent': ua.random}  # random UA to reduce blocking
    # timeout added so a stalled connection cannot hang the crawl
    response = requests.get(url=url, headers=headers, params=param, timeout=10)
    if response.status_code != 200:
        print("当前状态码为: ", response.status_code)
        return False
    # Collect the detail-page link of every picture on this result page.
    index_etree = etree.HTML(response.text)
    index_list = index_etree.xpath("//ul[@class='clearfix']/li/a/@href")
    for picture_index_url in index_list:
        headers = {'User-Agent': ua.random}  # fresh random UA per request
        # BUG FIX: hrefs here are site-relative (e.g. '/tupian/123.html');
        # the original requested them as-is. Resolve against the site root.
        detail_url = urljoin(BASE_URL, picture_index_url)
        pic_response = requests.get(url=detail_url, headers=headers, timeout=10)
        pic_etree = etree.HTML(pic_response.text)
        src_list = pic_etree.xpath("//a[@id='img']/img/@src")
        if not src_list:
            continue  # layout change / missing image: skip instead of crashing
        # urljoin also avoids the '//' the old string concatenation produced
        # ('https://pic.netbian.com/' + '/uploads/...').
        pic_link = urljoin(BASE_URL, src_list[0])
        fp.write(pic_link + '\n')
        print(pic_link)
    # BUG FIX: the original did print('成功爬取第 {} 页\n', page), passing the
    # page number as a second print argument instead of formatting it in.
    print('成功爬取第 {} 页\n'.format(page))
    return True


if __name__ == '__main__':
    # 'with' guarantees the file is closed even if a request raises mid-crawl.
    with open('图片链接.txt', 'w', encoding='utf-8') as fp:
        for i in range(0, 5):
            if pic_down(i, 16):
                time.sleep(3)  # be polite: pause between result pages
            else:
                print('爬取失败')
                break
"""Scrape sc.chinaz.com's free resume-template listing and print the first
download link found on each template's detail page."""
import requests
from lxml import etree
from fake_useragent import UserAgent

ua = UserAgent()
url = 'https://sc.chinaz.com/jianli/free.html'
headers = {
    'user-agent': ua.random  # random UA to reduce anti-bot blocking
}
# timeout added so a stalled connection cannot hang the script forever
response = requests.get(url=url, headers=headers, timeout=10)
# fail fast on HTTP errors instead of parsing an error page
response.raise_for_status()
# Pass raw bytes so lxml honors the document's own encoding declaration.
index_tree = etree.HTML(response.content)
# One href per template card on the listing page.
index_link = index_tree.xpath("//div[@id='container']/div/a/@href")
for link in index_link:
    # hrefs are protocol-relative ('//sc.chinaz.com/...'), so prefixing the
    # scheme alone yields the full detail-page URL.
    rel_link = "https:"+link
    detail_response = requests.get(url=rel_link, headers=headers, timeout=10)
    download_tree = etree.HTML(detail_response.content)
    download_link = download_tree.xpath(
        "//div[@class='clearfix mt20 downlist']/ul/li/a/@href")
    # Guard against detail pages with no download list instead of
    # crashing with IndexError on [0].
    if download_link:
        print(download_link[0])
版权属于:瞌学家 所有,转载请注明出处
本文链接:https://songonline.top/archives/139/
友情提示: 如果文章部分链接出现404,请留言或者联系博主修复。