import re
import requests

# Quick proxy check: request an IP-echo page through the proxy and extract the reported IP.
url = 'https://tool.lu/ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62'
}
proxies = {
    'http': '47.243.190.108:7890',
    'https': '47.243.190.108:7890'
}
res = requests.get(url=url, headers=headers, proxies=proxies, timeout=3).text
ip = re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', res, flags=re.S).group(0)
print(ip)
scrapy startproject myspider
cd myspider
scrapy genspider -t crawl SpiderName DomainName
PS: the -t crawl flag above generates a CrawlSpider; its Rule objects help extract follow-up URLs automatically.
Example: biquge (笔趣阁), crawling the whole site.
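A minimal sketch of what such a CrawlSpider could look like. The spider name, domain, URL patterns, and CSS selector below are illustrative placeholders, not the real biquge rules:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class BookSpider(CrawlSpider):
    # Name, domain and URL patterns are assumptions for illustration only.
    name = 'SpiderName'
    allowed_domains = ['example.com']
    start_urls = ['https://example.com/']

    rules = (
        # Keep following links that look like book index pages.
        Rule(LinkExtractor(allow=r'/book/\d+/'), follow=True),
        # Chapter pages are handed to parse_item for extraction.
        Rule(LinkExtractor(allow=r'/chapter/\d+\.html'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        yield {
            'title': response.css('h1::text').get(),
            'url': response.url,
        }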
scrapy crawl SpiderName
scrapy startproject myspider
request.meta['proxy'] = 'https://' + 'ip:port'
# Run the spider from a Python script inside the project directory instead of typing the command each time.
from scrapy import cmdline

cmdline.execute('scrapy crawl xt'.split())
fake_useragent is a Python package; install it with pip (pip install fake-useragent).
from fake_useragent import UserAgent


class DouyinxingtuUserAgentDownloaderMiddleware:
    def process_request(self, request, spider):
        # Pick a random User-Agent from the local fake_useragent data file.
        agent = UserAgent(path='fake_useragent_0.1.11.json').random
        request.headers['User-Agent'] = agent
import random


class DouyinxingtuProxiesDownloaderMiddleware:
    def process_request(self, request, spider):
        # getIp() is an external helper that returns a list of proxies, e.g. ['116.208.24.72:8118', ...]
        proxy_list = getIp()
        self.proxy = random.choice(proxy_list)
        request.meta['proxy'] = 'https://' + self.proxy
        print(request.meta)

    # If the request fails, remove the bad proxy and return the request so it is retried.
    def process_exception(self, request, exception, spider):
        print('remove the proxy from the database')
        return request
The cookie string above is copied directly from the browser; the code below converts that string into a dictionary.
class DouyinxingtuCookieDownloaderMiddleware:
    def process_request(self, request, spider):
        cookie = self.get_cookie()
        # Turn "key1=value1; key2=value2" into {'key1': 'value1', 'key2': 'value2'}.
        cookies = dict([l.split("=", 1) for l in cookie.split("; ")])
        request.cookies = cookies
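For these downloader middlewares to take effect, they have to be enabled in settings.py. A minimal sketch; the module path (douyinxingtu.middlewares) and the priority numbers are assumptions based on the class names above:

# settings.py -- module path and priorities are illustrative
DOWNLOADER_MIDDLEWARES = {
    'douyinxingtu.middlewares.DouyinxingtuUserAgentDownloaderMiddleware': 543,
    'douyinxingtu.middlewares.DouyinxingtuProxiesDownloaderMiddleware': 544,
    'douyinxingtu.middlewares.DouyinxingtuCookieDownloaderMiddleware': 545,
}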
This is the class name used in pipelines.py.
# Store the data in MySQL
class DouyinxingtuPipelineMysqlSave:
    fp = None

    def open_spider(self, spider):
        print('spider started')
        # connect to the database here
        pass

    def process_item(self, item, spider):
        print(item)  # this is the item defined in items.py
        return item

    def close_spider(self, spider):
        print('spider finished')
        pass
Import the item class and assign values to it:
item = DouyinxingtuItem()
item['data'] = response['data']
items.py
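The item class used above needs a matching field definition in items.py. A minimal sketch, assuming the single 'data' field from the assignment above:

import scrapy


class DouyinxingtuItem(scrapy.Item):
    # Single field holding the parsed response data, matching item['data'] above.
    data = scrapy.Field()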
Save to the database (MySQL, raw SQL).
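A sketch of how the pipeline skeleton above could be filled in with pymysql and raw SQL. The connection parameters, database, table, and column names are assumptions, not values from the original project:

import json
import pymysql


class DouyinxingtuPipelineMysqlSave:
    def open_spider(self, spider):
        # Connection parameters are placeholders; adjust to your own MySQL setup.
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                                    password='root', database='douyinxingtu', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Store item['data'] as a JSON blob in an assumed table/column.
        sql = 'INSERT INTO xingtu_data (data) VALUES (%s)'
        self.cursor.execute(sql, (json.dumps(item['data'], ensure_ascii=False),))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

Remember to register the pipeline class in ITEM_PIPELINES in settings.py, otherwise Scrapy never calls it.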