1.模拟用户操作获取数据
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/12/2 21:10
# @Author : Lhtester
# @Site :
# @File : 爬取京东商品.py
# @Software: PyCharm
"""Scrape JD.com search results for "python" with Selenium and store them in MongoDB.

The script drives a Chrome browser, walks through every result page, parses
each page's product list with lxml, and inserts one document per product
into the ``mydb.jd`` collection.
"""
import time

import pymongo
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

browser = webdriver.Chrome()
# Every explicit wait below gives up after 50 seconds.
wait = WebDriverWait(browser, 50)
# NOTE(review): host is empty and the credentials are hard-coded; consider
# loading them from the environment or a config file instead.
# ``db`` is the ``jd`` collection of the ``mydb`` database.
db = pymongo.MongoClient(host='', port=27017, username="root", password="123456")["mydb"]['jd']


def search(max_retries=3):
    """Open JD.com, search for "python", and parse the first result page.

    Args:
        max_retries: How many additional attempts to make when a wait times
            out before giving up.

    Returns:
        The text of the pager element holding the total number of pages.

    Raises:
        TimeoutException: If every attempt times out.
    """
    for attempt in range(max_retries + 1):
        browser.get('https://www.jd.com/')
        try:
            # Wait until at least one search box is present in the DOM.
            input_text = wait.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#key"))
            )
            # Wait until the search button is visible and enabled.
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#search > div > div.form > button")
                )
            )
            input_text[0].send_keys('python')
            submit.click()
            # The pager's "skip to page" area carries the total page count.
            total = wait.until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > em > b")
                )
            )
            total_pages = total[0].text
            print(total_pages)
            prase_html(browser.page_source)
            return total_pages
        except TimeoutException:
            # WebDriverWait raises selenium's TimeoutException, not the
            # builtin TimeoutError; retry from a fresh page load.
            if attempt == max_retries:
                raise


def next_page(page_number, max_retries=3):
    """Advance to result page *page_number* and parse it.

    Args:
        page_number: The page number expected to be current after clicking
            the "next" button.
        max_retries: How many additional attempts to make when a wait times
            out before giving up.

    Raises:
        TimeoutException: If every attempt times out.
    """
    for attempt in range(max_retries + 1):
        try:
            # Scroll to the bottom so the lazily loaded second half of the
            # product list (the last 30 items) is rendered.
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(10)
            # NOTE(review): a retry clicks "next" again; if the earlier click
            # already went through, this can skip a page.
            button = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#J_bottomPage > span.p-num > a.pn-next > em")
                )
            )
            button.click()
            # Wait for the 60th list item to be present in the DOM.
            wait.until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(60)")
                )
            )
            # Confirm the pager now highlights the expected page number.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, "#J_bottomPage > span.p-num >a.curr"),
                    str(page_number),
                )
            )
            prase_html(browser.page_source)
            return None
        except TimeoutException:
            if attempt == max_retries:
                raise


def _first(node, path):
    """Return the first element matched by XPath *path* under *node*, or None."""
    matches = node.xpath(path)
    return matches[0] if matches else None


def _image_url(product):
    """Return the product image URL from the lazy-load attribute or ``src``."""
    img = _first(product, './/div[@class="p-img"]//img')
    if img is None:
        return None
    lazy = img.get('data-lazy-img')
    # Once the image has loaded the lazy attribute reads "done" and the real
    # URL lives in ``src``; before that the lazy attribute holds the URL.
    if lazy and lazy != "done":
        return lazy
    return img.get('src')


def prase_html(html):
    """Parse one result page and save every product found on it.

    Args:
        html: The page source as a string.

    Each ``li.gl-item`` element yields a dict with the keys ``img``,
    ``title``, ``price`` and ``commit``. Fields are looked up relative to
    their own list item, so a product with a missing field gets ``None`` for
    it instead of shifting the values of the following products.
    """
    print('处理prase')
    tree = etree.HTML(html)
    for product in tree.xpath('//li[@class="gl-item"]'):
        title_node = _first(product, './/div[@class="p-name"]//em')
        price_node = _first(product, './/div[@class="p-price"]//i')
        commit_node = _first(product, './/div[@class="p-commit"]//a')
        item = {
            # Image URL
            "img": _image_url(product),
            # Title (full text content, including nested tags)
            "title": title_node.xpath('string(.)') if title_node is not None else None,
            # Price
            "price": price_node.text if price_node is not None else None,
            # Review-count link text
            "commit": commit_node.text if commit_node is not None else None,
        }
        save(item)


def save(item):
    """Insert *item* into the MongoDB collection, reporting failures.

    Args:
        item: The product dict to store.

    Storage is best-effort: a failed insert is printed and scraping goes on.
    """
    try:
        db.insert_one(item)
        print('插入成功')
    except Exception as exc:
        print("{}存储到MongoDB失败".format(str(item)))
        # Show the cause so a failed insert is not silently lost.
        print(repr(exc))


def main():
    """Scrape every result page, then close the browser."""
    try:
        print("第", 1, "页:")
        total = int(search())
        for i in range(2, total + 1):
            time.sleep(3)
            print("第", i, "页:")
            next_page(i)
    finally:
        # Always release the browser process, even when scraping fails.
        browser.quit()


if __name__ == "__main__":
    main()
结果如下: