The target website is the drug clinical trial registration platform (www.chinadrugtrials.org.cn).
Approach: I first tried fetching the site's response directly with the requests library, which failed because the server returns HTTP 202 and its anti-crawling JavaScript would have to be cracked; I then tried driving the page with chromedriver, which failed because the browser was identified as a malicious crawler; finally I tried the pyppeteer library, which succeeded. (A minimal sketch of the failed requests attempt follows.)
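For reference, here is a minimal sketch of that first failed attempt, under the assumption that the detail-list URL used later in the script is requested directly; the header value is only illustrative. On this site the server answers with status 202 and a JS challenge rather than the trial data:

import requests

url = "http://www.chinadrugtrials.org.cn/clinicaltrials.searchlistdetail.dhtml"
headers = {
    # A plain browser-like UA; not enough on its own, since the block is JS-based
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
}

resp = requests.get(url, headers=headers)
# The site responds with 202 and obfuscated JavaScript instead of the page body,
# so the HTML is unusable without executing (or reverse-engineering) that JS.
print(resp.status_code)
print(resp.text[:300])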
Here is the code:
import asyncio
import random

from pyppeteer import launch
from pyppeteer.network_manager import Response


class PyppeteerScript(object):
    """Crawl trial detail pages from chinadrugtrials.org.cn with pyppeteer."""

    def __init__(self):
        self.base_url = "http://www.chinadrugtrials.org.cn/clinicaltrials.searchlistdetail.dhtml"
        # Index of the first trial to crawl
        self.current_page = 13480
        # Highest trial index to crawl; the loop exits once this is exceeded
        self.page_limit = 13483
        self.config = {
            # Headless mode off: show the browser window
            "headless": False,
            # Local Chromium path; mirror download: https://npm.taobao.org/mirrors/chromium-browser-snapshots/
            "executablePath": "/Users/xxx/Downloads/chromium/Chromium.app/Contents/MacOS/Chromium",
            # Delay after each operation, to reduce the chance of detection
            "slowMo": 5,
            # Launch arguments
            "args": [
                # Hide the "browser is being controlled" automation info bar
                "--disable-infobars",
                # Disable the sandbox
                "--no-sandbox",
                # Proxy. Note: switching proxies requires relaunching the browser via launch()
                # "--proxy-server=http://ip:port",
            ]
        }

    async def intercept_response(self, res: Response):
        json_text = await res.text()
        print(json_text)

    def __save_html(self, content, name):
        # The ./html/ directory must already exist
        with open("./html/{}.html".format(name), "w+", encoding="utf-8") as f:
            f.write(content)

    async def run(self):
        """Entry point."""
        browser = await launch(**self.config)
        page = await browser.newPage()
        await page.setViewport({'width': 1920, 'height': 1080})
        await page.setUserAgent(
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
        )
        # JS injected into every new document to mask automation fingerprints and avoid detection
        js_text = """
        () => {
            Object.defineProperties(navigator, {
                webdriver: {
                    get: () => false
                }
            });
            window.navigator.chrome = {
                runtime: {},
            };
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en']
            });
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5, 6],
            });
        }
        """
        await page.evaluateOnNewDocument(js_text)
        await page.goto(url=self.base_url)
        # Wait for the page to finish loading (seconds)
        await asyncio.sleep(5)

        while True:
            if self.current_page > self.page_limit:
                break
            # Call the site's own pagination JS function
            turn_page_js = "gotopage({})".format(self.current_page)
            await page.evaluate(turn_page_js)
            await asyncio.sleep(random.randint(2, 4))
            self.current_page += 1
            # Grab the rendered page source and save it, named after the page title
            page_text = await page.content()
            page_title = await page.title()
            self.__save_html(content=page_text, name=page_title)

        # The site issues no XHR requests, so there is no JSON response to intercept directly
        # await page.setRequestInterception(True)
        # page.on('response', self.intercept_response)

        await browser.close()


if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(PyppeteerScript().run())
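Two small usage notes: the script writes each rendered detail page to ./html/<page title>.html, so the ./html/ directory needs to exist before the first run, and executablePath must point to a locally downloaded Chromium build (see the mirror linked in the config). Pagination works by calling the site's own gotopage() function inside the page context, which is why no raw HTTP requests have to be reproduced.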
PS: This code is for learning and exchange only; please do not use it for any commercial purpose.