Java教程

实习爬虫示例

本文主要介绍实习爬虫示例,对大家解决编程问题具有一定的参考价值,需要的程序猿们随着小编来一起学习吧!
# -*- coding: utf-8 -*-
import pandas as pd
import time
import requests
from lxml import etree
from pyquery import PyQuery as pq

# Browser-like User-Agent sent with the listing request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
}
url = 'https://www.medchemexpress.cn/mce_category.shtml?independentCateName=Inhibitory%20Antibodies'

# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the script forever.
REQUEST_TIMEOUT = 30

html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT).text
tree = etree.HTML(html)
# Each <li> under #page_table_1 is treated as one record.
div_list = tree.xpath('//*[@id="page_table_1"]/li')


def _first_text(nodes):
    """Return the pyquery-normalized text of the first node, or '' if none."""
    return pq(nodes[0]).text() if nodes else ''


# One [mulu, name, jianjie] row per <li>. A missing field becomes '' so every
# row has the same width and no IndexError is raised on incomplete entries.
list_all = []
for div in div_list:
    mulu = div.xpath('./dl/dt//text()')
    name = div.xpath('./dl/dd/table/tr[1]/th[1]/a/strong/text()')
    jianjie = div.xpath('./dl/dd/table/tr[2]/td/text()')
    # The fields are appended exactly once per entry; repeating them for
    # every text node found under <dt> would only duplicate the same values.
    list_all.append([_first_text(mulu), _first_text(name), _first_text(jianjie)])
# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""
import codecs
from lxml import etree
import csv
import re
import time
import requests
import json
from pyquery import PyQuery as pq
import pandas as pd


# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
# NOTE(review): requests documents lowercase scheme keys ('http' / 'https')
# for proxy mappings; these uppercase 'HTTP' keys likely never match the
# https URLs below, so the proxies are probably not applied. Confirm before
# relying on them.
proxies = [
    {'HTTP': '58.252.195.180:9999'},
    {'HTTP': '49.70.89.159:9999'},
    {'HTTP': '112.91.79.12:9999'},
    {'HTTP': '110.18.155.171:9999'},
    {'HTTP': '113.128.29.71:9999'},
    {'HTTP': '27.156.195.104:9999'},
    {'HTTP': '121.205.177.96:9999'},
    {'HTTP': '163.204.246.82:9999'},
    {'HTTP': '114.233.136.28:9999'},
    {'HTTP': '42.176.132.253:9999'},
]
url = 'https://www.glpbio.com/research-area/proteases/caspase.html'

# Seconds to wait for a response before giving up, so a stalled connection
# cannot hang the script forever.
REQUEST_TIMEOUT = 30

page_text = requests.get(url=url, headers=headers, proxies=proxies[1],
                         timeout=REQUEST_TIMEOUT).text
tree = etree.HTML(page_text)
# Each <li> under #products-list is one product on the listing page.
div_list = tree.xpath('//*[@id="products-list"]/li')

# Accumulates (name, guige, price, stock) tuples, one per table row of every
# product page visited.
list_all = []
for div in div_list:
    name = pq(div.xpath('./span[2]/a')).text()
    print(name)

    # hrefs of the product link; only the first one is followed.
    detail_urls = div.xpath('./span[2]/a/@href')
    if not detail_urls:
        continue

    html = requests.get(detail_urls[0], headers=headers, proxies=proxies[2],
                        timeout=REQUEST_TIMEOUT).text
    time.sleep(3)  # pause between product-page requests
    tree1 = etree.HTML(html)  # parsed HTML of this product's page

    # All three columns come from the same rows, so query them once.
    # NOTE(review): this path requires literal <tbody> elements in the served
    # HTML; browsers insert them automatically but lxml does not. Confirm
    # against the raw response.
    rows = tree1.xpath('//*[@id="super-product-table"]/tbody/tr/td[1]/table/tbody/tr')

    # A missing cell yields an empty xpath result (no exception), which
    # pyquery renders as ''.
    guige_list = [pq(row.xpath('./td[1]')).text() for row in rows]
    print(guige_list)

    price_list = [pq(row.xpath('./td[2]/div/span')).text() for row in rows]
    print(price_list)

    stock_list = [pq(row.xpath('./td[3]')).text() for row in rows]
    print(stock_list)

    # Repeat the product name on every row of its table.
    lists = [
        (name, guige, price, stock)
        for guige, price, stock in zip(guige_list, price_list, stock_list)
    ]
    list_all.extend(lists)

这篇关于实习爬虫示例的文章就介绍到这儿,希望我们推荐的文章对大家有所帮助,也希望大家多多支持为之网!