First scraping attempt: crawling the swmt section of a girl-photo gallery site. The girls on there... let's just say they're not bad at all.
Not much to say, straight to the code. It's my first time writing a scraper, and I basically copied this expert's code. Thanks to @https://www.kancloud.cn/@noahs.
Written in Python 3.7.
There are still bugs: if an image is gone from the server, the script just fails to load it. I'm still green at this, so I'll come back and update the code once I've learned more (or keep putting it off).
# -*- coding: utf-8 -*-
import os
import re
import sys
import time

import requests
from bs4 import BeautifulSoup


class download_xiuaa():
    def __init__(self):
        # Base variables: the target site and a browser User-Agent header
        self.baseurl = "https://xiuaa.com/swmt/"
        self.head = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36 Edg/90.0.818.46"
        }
        self.title_list = []

    def ask_url(self):
        # Fetch the listing page and collect every album's title and relative url
        rq = requests.get(url=self.baseurl, headers=self.head)
        bs = BeautifulSoup(rq.text, "html.parser")
        links = bs.find('div', id='container').find_all('a', target='_blank')
        for a in links:
            title = a.get_text()
            url = a.get('href')
            if title:
                self.title_list.append({
                    'title': title,
                    'url': url
                })

    def get_maxpage(self):
        # For each album, read the page count from the pager's first link
        for thing in self.title_list:
            urls = 'https://www.xiuaa.com' + thing['url']
            rq = requests.get(url=urls, headers=self.head)
            sp = BeautifulSoup(rq.text, 'html.parser')
            pager = sp.find('div', id='pager').find_all('a')
            maxpag = pager[0].get_text()
            thing['url'] = urls
            thing['maxpag'] = int(maxpag)

    def get_ever_url(self, dic):
        # Walk every page of one album and download its big image
        print('Downloading: %s,\t pages: %s' % (dic['title'], dic['maxpag']))
        album_id = re.search(r"\d+", dic['url']).group()  # numeric id in the album url
        for i in range(0, int(dic['maxpag'])):
            page_url = 'https://www.xiuaa.com/swmt/' + album_id + '_' + str(i) + '.html'
            rq = requests.get(url=page_url, headers=self.head)
            bs = BeautifulSoup(rq.text, 'html.parser')
            img = bs.find('div', id='bigpic').find_all('img')[0]
            pic_url = img.get('src')
            name = pic_url.split('/')[-1]  # file name is the last path segment
            self.down_pic(pic_url, dic['title'], name)
            time.sleep(2)  # be polite to the server
            # Crude single-line progress bar
            sys.stdout.write("\r")
            sys.stdout.write("%s%% | %s" % (int(i / int(dic['maxpag']) * 100), i * '|'))
            sys.stdout.flush()

    def down_pic(self, pic_url, title, name):
        # Save one image into a folder named after the album title
        if not os.path.exists(title):
            os.mkdir(title)
        rq = requests.get(url=pic_url, headers=self.head)
        with open("%s/%s" % (title, name), 'wb') as f:
            f.write(rq.content)


if __name__ == '__main__':
    dx = download_xiuaa()
    dic = {'title': '紧臀蓝裙美女Lucy黑丝美腿诱惑', 'url': 'https://www.xiuaa.com/swmt/1823.html', 'maxpag': '20'}
    dx.get_ever_url(dic)
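As a stopgap for the dead-image problem I mentioned above, here is a minimal sketch of a more defensive down_pic you could swap in. It assumes the same class layout as the code above; the 10-second timeout and the skip-and-print behavior are my own choices, not from the original code.

    def down_pic(self, pic_url, title, name):
        # Sketch: drop-in replacement for down_pic above; assumes self.head
        # exists as in the class above. Timeout value is an assumption.
        if not os.path.exists(title):
            os.mkdir(title)
        try:
            rq = requests.get(url=pic_url, headers=self.head, timeout=10)
            rq.raise_for_status()  # a dead image link usually comes back as 404
        except requests.RequestException as e:
            print('skipped %s (%s)' % (pic_url, e))
            return
        with open("%s/%s" % (title, name), 'wb') as f:
            f.write(rq.content)

And if you want to crawl the whole swmt listing instead of one hard-coded album, the three methods already chain together; something like this should work:

    dx = download_xiuaa()
    dx.ask_url()       # collect every album's title and relative url
    dx.get_maxpage()   # resolve absolute urls and page counts
    for dic in dx.title_list:
        dx.get_ever_url(dic)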