验证码和爬虫之间的爱恨情仇?
识别验证码的操作:
注册
登录
使用接口
# Usage example for the Chaojiying captcha-recognition client.
# The software ID is generated under "User Center >> Software ID" on the site.
from chaojiying import Chaojiying_Client

chaojiying = Chaojiying_Client('用户名', '密码', '软件ID')
# Read the local captcha image file as raw bytes.
im = open('图片路径', 'rb').read()
# 1902 is the captcha type code; see "Price System" on the official website.
# PostPic returns a dict; 'pic_str' holds the recognised captcha text.
print(chaojiying.PostPic(im, 1902)['pic_str'])
编码流程
爬取某些基于用户个人数据的信息
点击登录按钮之后会发起一个 post 请求
post 请求中会携带登录之前录入的相关的登录信息
http / https 协议特性:无状态信息,即,发起第二次基于个人主页请求的时候,服务器端并不知道该次请求是基于登录状态下的请求
cookie:用来让服务器端记录客户端的相关状态
手动获取:通过抓包工具获取 cookie 值,将该值封装到 headers 中
自动获取:
cookie 来源
session 会话对象
session = requests.Session()
使用 selenium 登录,获取 cookie
from fake_useragent import UserAgent as ua  # random User-Agent strings
import requests


def get_cookies(url, name, pwd):
    """Log in through a headless Chrome session and return the cookies as a dict.

    url: login page URL; name / pwd: credentials typed into the login form.
    Returns a plain {cookie_name: cookie_value} dict usable by requests.
    """
    from selenium.webdriver import Chrome, ChromeOptions
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys

    option = ChromeOptions()
    option.add_argument("--headless")
    option.add_argument('--disable-gpu')
    option.add_argument(f'user-agent={ua().random}')
    web = Chrome(options=option)
    web.get(url=url)
    web.find_element(By.XPATH, '//*[@id="username"]').send_keys(name)
    # ENTER submits the form immediately after the password is typed.
    web.find_element(By.ID, "password").send_keys(pwd, Keys.ENTER)
    cookies = web.get_cookies()  # list of {"name": ..., "value": ...} dicts
    web.close()
    # Flatten selenium's cookie list into the shape requests expects.
    return {c["name"]: c["value"] for c in cookies}


def get_page_source(url, name, password):
    """Fetch *url* carrying the login cookies obtained via selenium.

    Returns the requests.Response object.
    """
    resp = requests.get(
        url=url,
        # BUG FIX: the original passed the undefined name `pwd` here
        # (NameError at call time); the parameter is called `password`.
        cookies=get_cookies(url, name, password),
        headers={
            "user-agent": ua().random,
        },
    )
    return resp
什么是代理:
代理的作用
构建代理 ip 池
代理 ip 类型:
from requests import get
from fake_useragent import UserAgent as ua
from pyquery import PyQuery


def test_ip(ip):
    """Search Baidu for "ip" through the given proxy and print the IP it reports.

    ip: proxy address; routed for both http and https traffic.
    """
    url = "https://www.baidu.com/s?wd=ip"
    headers = {
        "user-agent": ua().random,
    }
    # Send both schemes through the same proxy endpoint.
    proxies = {"https": ip, "http": ip}
    resp = get(url=url, headers=headers, proxies=proxies, timeout=10)
    # Extract the "IP: x.x.x.x" fragment shown on the result page.
    document = PyQuery(resp.text)
    shown_ip = document('tr > td > span[@class="c-gap-right"]').text().split(":")[-1].strip()
    print(shown_ip)
代理 ip 的匿名度
目的:在爬虫中使用异步实现高性能的数据爬取操作
同步爬虫:
from time import time
from requests import get

urls = [
    "http://kr.shanghai-jiuxin.com/file/mm/20211129/qgenlhwzyvs.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/0223/c0b455b1f25dec71d995550b2e9f898e.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/0223/8e3674af90cba4a3fcfcfce30ab9e5b3.jpg",
]


def get_content(url):
    """Download one image synchronously and save it under ./img/."""
    print("正在获取:", url)
    resp = get(url=url).content
    name = url.split("/")[-1]
    # Guard clause: empty body means the request failed.
    if not resp:
        print("请求失败")
        return
    with open(f"./img/{name}", "wb") as f:
        f.write(resp)
    print("下载完成")


sta = time()
for link in urls:
    get_content(link)
print(f"全部下载完成,用时{time() - sta}")
异步爬虫:
from time import time
from requests import get
import asyncio

urls = [
    "http://kr.shanghai-jiuxin.com/file/mm/20211129/qgenlhwzyvs.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/0223/c0b455b1f25dec71d995550b2e9f898e.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/0223/8e3674af90cba4a3fcfcfce30ab9e5b3.jpg",
]


async def get_content(url):
    """Download one image without blocking the event loop and save it to ./img/."""
    print("开始下载:", url)
    name = url.split("/")[-1]
    # requests is synchronous; run it in the default thread-pool executor so
    # the three downloads overlap instead of serialising.
    loop = asyncio.get_running_loop()
    resp = await loop.run_in_executor(None, get, url)
    if resp:
        with open(f"./img/{name}", "wb") as f:
            f.write(resp.content)
        print("下载完成")
    else:
        print("请求失败")


async def main():
    # BUG FIX: the original called asyncio.run(asyncio.wait(coroutines)) —
    # asyncio.wait() no longer accepts bare coroutine objects (removed in
    # Python 3.11), and the coroutines were created outside a running loop.
    # Gather them inside the event loop instead.
    await asyncio.gather(*(get_content(u) for u in urls))


sta = time()
asyncio.run(main())
print(f"全部下载完成,用时{time() - sta}")
多线程 ,多进程:
from time import time
from requests import get
from threading import Thread


class MyThread(Thread):
    """Daemon thread that starts itself on construction and runs target(*args, **kwargs)."""

    def __init__(self, target, args=(), kwargs=None):
        super().__init__()
        self.daemon = True  # don't block interpreter exit
        self.target = target
        self.args = args
        # BUG FIX: the original used a mutable default (kwargs={}), which is
        # shared across every instance of the class; use None as the sentinel.
        self.kwargs = {} if kwargs is None else kwargs
        self.start()  # start the thread immediately (original comment said "process")

    def run(self):
        self.target(*self.args, **self.kwargs)


urls = [
    "http://kr.shanghai-jiuxin.com/file/mm/20211129/qgenlhwzyvs.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/0223/c0b455b1f25dec71d995550b2e9f898e.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/0223/8e3674af90cba4a3fcfcfce30ab9e5b3.jpg",
]


def get_content(url):
    """Download one image and write it into ./img/."""
    print("正在获取:", url)
    resp = get(url=url).content
    name = url.split("/")[-1]
    if resp:
        with open(f"./img/{name}", "wb") as f:
            f.write(resp)
        print("下载完成")
    else:
        print("请求失败")


sta = time()
lis = []
for i in urls:
    mt = MyThread(get_content, args=(i, ))
    lis.append(mt)
# Wait for every download thread to finish before reporting the elapsed time.
for i in lis:
    i.join()
print(f"全部下载完成,用时{time() - sta}")
线程池,进程池:
from time import time
from requests import get
from concurrent.futures import ThreadPoolExecutor

urls = [
    "http://kr.shanghai-jiuxin.com/file/mm/20211129/qgenlhwzyvs.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/0223/c0b455b1f25dec71d995550b2e9f898e.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/0223/8e3674af90cba4a3fcfcfce30ab9e5b3.jpg",
]


def get_content(url):
    """Download one image and write it into ./img/."""
    print("正在获取:", url)
    resp = get(url=url).content
    name = url.split("/")[-1]
    # Guard clause: an empty body means the request failed.
    if not resp:
        print("请求失败")
        return
    with open(f"./img/{name}", "wb") as f:
        f.write(resp)
    print("下载完成")


sta = time()
# One worker per URL; the with-block waits for the pool to drain on exit.
with ThreadPoolExecutor(len(urls)) as pool:
    pool.map(get_content, urls)
print(f"全部下载完成,用时{time() - sta}")
单线程 + 协程(推荐)
import asyncio


async def request(url):
    """Toy coroutine: pretend to request *url* and return it unchanged."""
    print("正在请求:", url)
    print("请求成功")
    return url


def callback(task):
    # Done-callback: fires once the task finishes; result() is the
    # coroutine's return value.
    print(task.result())


async def main():
    # FIX: the original used asyncio.get_event_loop() / ensure_future()
    # outside a running loop — deprecated since 3.10 and an error on newer
    # versions. Create the task inside the loop driven by asyncio.run().
    task = asyncio.ensure_future(request("www.baidu.com"))
    # Bind the callback to the task; it runs when the task completes.
    task.add_done_callback(callback)
    await task


asyncio.run(main())
import asyncio

# NOTE: if synchronous blocking code appears inside a coroutine, the batch
# degrades to sequential execution — use awaitables (asyncio.sleep, aiohttp,
# ...) instead of blocking calls.


async def request(url):
    """Simulate a 3-second download of *url* and return it."""
    print("正在下载:", url)
    await asyncio.sleep(3)
    print("下载完毕:", url)
    return url


async def main():
    # BUG FIX: asyncio.wait() rejects bare coroutine objects since Python 3.11,
    # and ensure_future() needs a running loop — wrap each coroutine in a Task
    # inside the event loop before waiting on them.
    tasks = [asyncio.ensure_future(request(i)) for i in range(10)]
    done, pending = await asyncio.wait(tasks)
    print(done)


asyncio.run(main())
aiohttp:基于异步网络请求的模块
注意:在获取响应数据操作之前一定要使用 await 进行手动挂起
text():返回字符串形式的响应数据
read():返回二进制形式的响应数据
json():返回的是 JSON 对象
requests:是基于同步的,在异步协程中必须使用基于异步的网络请求模块(如 aiohttp)进行指定 url 的请求发送
selenium 基本使用
问题:selenium 模块和爬虫之间有怎么样的关系?
什么是 selenium 模块?
使用流程:
环境安装:pip install selenium
下载浏览器驱动程序
实例化浏览器对象
编写基于浏览器自动化的操作代码
如果定位的标签是存在于 iframe 标签中的则必须要进入 iframe 标签内再进行定位
# Locate the iframe first, then switch the driver's context into it —
# elements inside an iframe cannot be located from the parent document.
# FIX: find_element_by_xpath(), switch_to_frame() and
# switch_to_default_content() were all removed in Selenium 4; use
# find_element(By.XPATH, ...) and the switch_to API instead.
from selenium.webdriver.common.by import By

xf = driver.find_element(By.XPATH, '//*[@id="x-URS-iframe"]')
# Hand the located element to switch_to.frame() to enter the iframe.
driver.switch_to.frame(xf)
driver.switch_to.default_content()  # back out of the iframe to the top document
from selenium.webdriver.common.action_chains import ActionChains

# Build an action chain bound to the browser session.
act = ActionChains(web)
# Press and hold the (slider) element without releasing.
act.click_and_hold(div)
for i in range(5):
    # perform() executes the queued actions immediately; each call drags 17px.
    act.move_by_offset(17, 0).perform()
# BUG FIX: release() only *queues* the mouse-up action — without perform()
# the button is never actually released.
act.release().perform()
web.quit()