这里只是代码展示,复制后不能直接运行,需要先完成相关配置才行,具体请查看下方链接介绍:
Python爬取 | 唯美女生图片
import os
import re
import time
import winreg
from time import sleep

import requests
from fake_useragent import UserAgent
from pyquery import PyQuery as pq
from selenium import webdriver

# Request headers shared by every HTTP call; the User-Agent is picked once at import time.
header = {
    'User-Agent': UserAgent().random
}

# Seconds to wait for any single HTTP request before treating it as failed.
REQUEST_TIMEOUT = 15

# Stop the download loop after this many consecutive cycles that fetched nothing new.
MAX_STALLED_CYCLES = 3

# First run of digits in the last URL segment is taken as the post ID.
_ID_PATTERN = re.compile(r'\d+')

# Characters that Windows refuses in file and directory names.
_ILLEGAL_NAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def _safe_name(text):
    """Return *text* with characters that are illegal in Windows file names replaced by '_'."""
    return _ILLEGAL_NAME_CHARS.sub('_', text)


def _image_links(doc, selector, attr):
    """Collect 'https:'-prefixed values of *attr* from nodes matching *selector*.

    Nodes that lack the attribute, or whose value contains no '.', are skipped.
    """
    values = (node.attr(attr) for node in doc(selector).items())
    return ['https:' + value for value in values if value and '.' in value]


def html_id(id_url):
    """Fetch and parse the page of a single post.

    Args:
        id_url: Full URL of the post page.

    Returns:
        ``[category, title, image_links]`` on success, or ``0`` when the page
        could not be fetched or its category element was not found (the caller
        retries such IDs in a later cycle).
    """
    try:
        r = requests.get(id_url, headers=header, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # A network error is treated like a failed parse so the ID is retried later.
        r = None
    time.sleep(0.3)

    if r is not None:
        doc = pq(r.text)
        classical = doc('.d-md-inline-block').eq(0).children('a').text()  # category of the post
        if classical:  # a non-empty category means the page source was fetched correctly
            name = doc('.post-title').text()  # title of the post
            links = _image_links(doc, '.nc-light-gallery a', 'href')
            if not links:
                # Older posts keep their images in <img src> instead of <a href>.
                links = _image_links(doc, '.nc-light-gallery img', 'src')
            return [classical, name, links]

    d = id_url.split('/')[-1].split('.')[0]  # the ID whose page could not be fetched
    print(f'{d} 获取失败,等待下一次循环')
    return 0


def download(id, con, path, path3):
    """Download every image of one post and record the post as done.

    Args:
        id: Post ID as a string.
        con: ``[category, title, image_links]`` as returned by ``html_id``.
        path: Root download directory; images go into ``path/<category>``.
        path3: Path of the text file listing already-downloaded posts; one
            ``category,title,id`` line is appended per call.
    """
    classical, name, links = con
    print(f'{id} {classical} {name} 下载中...', end=' ')

    img_path = os.path.join(path, _safe_name(classical))  # one folder per category
    os.makedirs(img_path, exist_ok=True)

    print(f'共{len(links)}张 ——> ', end='')
    file_stem = _safe_name(name)
    for num, link in enumerate(links, start=1):
        file_name = os.path.join(img_path, f'{file_stem}{num}{os.path.splitext(link)[1]}')
        if 't.cdn.ink' not in link:
            # Some links lack the CDN host; insert it right after the 'https:' prefix.
            link = link[:6] + '//t.cdn.ink/' + link[6:]
        try:
            # Fetch first so that a failed request does not leave an empty file behind.
            content = requests.get(link, headers=header, timeout=REQUEST_TIMEOUT).content
            with open(file_name, 'wb') as f:
                f.write(content)
            print(f'{num} ', end='')
        except (requests.RequestException, OSError) as e:
            print(f'\n第{num}张下载错误,错误来自:{e} ')

    # Record the post in the "already downloaded" file.
    with open(path3, 'a+', encoding='utf-8') as f:
        f.write(f'{classical},{name},{id}\n')
    print('下载完成!!!')


def txt_id(path):
    """Read post IDs from one of the ID text files.

    Args:
        path: File to read. A file whose name contains ``'haven'`` is parsed as
            ``category,title,id`` records; any other file is parsed as one
            integer ID per line.

    Returns:
        For the "haven" file: the IDs in file order (empty list when the file
        does not exist). Otherwise: the IDs as strings, sorted numerically in
        descending order.
    """
    # Test only the file name so that a parent directory containing 'haven' cannot mislead us.
    if 'haven' in os.path.basename(path):
        if not os.path.exists(path):
            return []
        # ISO-8859-1 never fails to decode; only the trailing ASCII ID field is used.
        with open(path, 'r', encoding="ISO-8859-1") as f:
            return [line.split(',')[-1].strip() for line in f if line.strip()]

    with open(path, 'r') as f:
        numbers = sorted((int(line) for line in f if line.strip()), reverse=True)
    return [str(number) for number in numbers]


def get_id(html, path):
    """Save the listing page source and extract every post ID from it.

    Args:
        html: Page source of the fully loaded listing page.
        path: Root directory; the source is written to
            ``path/html源代码/vm_girls.html`` and the IDs to ``path/ID_all.txt``
            and ``path/ID_not.txt`` (ascending, one per line).
    """
    path_html = os.path.join(path, 'html源代码')
    os.makedirs(path_html, exist_ok=True)
    with open(os.path.join(path_html, 'vm_girls.html'), 'w', encoding='utf-8') as f:
        f.write(html)

    # Each post ID lives in the href of an <a> inside '.media-3x2'.
    doc = pq(html)
    found = set()  # a set, because several anchors may point to the same post
    for anchor in doc('.media-3x2 a'):
        url = pq(anchor).attr('href')
        if not url:
            continue
        match = _ID_PATTERN.search(url.split('/')[-1])
        if match:
            found.add(int(match.group()))

    content = '\n'.join(str(number) for number in sorted(found))
    for file_name in ('ID_all.txt', 'ID_not.txt'):
        with open(os.path.join(path, file_name), 'w') as f:
            f.write(content)


def get_html(url, chromedriver_path):
    """Load *url* in Chrome, keep clicking "load more", and return the page source.

    Loading stops when the load-more element reports that there is no more
    content, or when the user presses any key inside the browser window.

    Args:
        url: Page to open.
        chromedriver_path: Path to the chromedriver executable.

    Returns:
        The page source after loading has stopped.
    """
    # NOTE(review): executable_path / find_element_by_class_name are the legacy
    # Selenium API this script was written against; confirm the installed version.
    wb = webdriver.Chrome(executable_path=chromedriver_path)
    try:
        wb.implicitly_wait(5)
        wb.get(url)
        start_time = time.time()
        # wb.find_element_by_class_name('nwmb-vdprs-close').click()  # closes the ad popup shown on first load

        # Any key press in the page rewrites the button text so the loop below stops.
        wb.execute_script('''
            document.body.addEventListener("keypress", function(){
                document.getElementsByClassName('dposts-ajax-load')[0].innerText='加载终止';
            });
        ''')
        while True:
            try:
                button = wb.find_element_by_class_name('dposts-ajax-load')
                end = button.text
                if end in ['没有更多内容', '加载终止']:
                    print(end)
                    break
                button.click()
            except Exception:
                # Button not ready or not clickable yet: wait and retry.
                # (Not a bare except, so Ctrl+C still interrupts the loop.)
                sleep(1)
            finally:
                # The 1532 offset may need adjusting for different window sizes.
                wb.execute_script("window.scrollTo(0, document.body.scrollHeight-1532)")
        html = wb.page_source
        print(wb.title)
    finally:
        wb.quit()  # always release the browser, even when loading fails

    end_time = time.time()
    times = end_time - start_time
    print(f'加载内容总耗时{times // 60:.0f}分{times % 60:.2f}秒!')
    return html


def get_desktop():
    """Return the current user's desktop path, read from the Windows registry."""
    with winreg.OpenKey(
            winreg.HKEY_CURRENT_USER,
            r'Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders') as key:
        return str(winreg.QueryValueEx(key, "Desktop")[0])


def main():
    """Scrape the listing page (optionally) and download every post not yet downloaded."""
    url = 'https://www.vmgirls.com/'
    desktop = get_desktop()
    path = os.path.join(desktop, 'vmGirls')
    os.makedirs(path, exist_ok=True)
    chromedriver_path = os.path.join(desktop, 'chromedriver.exe')

    judge = True
    if os.path.exists(os.path.join(path, 'html源代码', 'vm_girls.html')):
        # Only the exact answer '否' skips reloading the listing page.
        judge = input('html源代码已存在,是否需要重新加载:') != '否'
    if judge:
        html = get_html(url, chromedriver_path)
        get_id(html, path)

    path1 = os.path.join(path, 'ID_all.txt')    # every ID found on the listing page
    path2 = os.path.join(path, 'ID_not.txt')    # IDs not downloaded yet
    path3 = os.path.join(path, 'ID_haven.txt')  # IDs already downloaded

    id_not = txt_id(path2)
    cycle = 0    # number of passes over the ID list
    stalled = 0  # consecutive passes that downloaded nothing new
    start_time = time.time()
    while len(id_not) > 5 and stalled < MAX_STALLED_CYCLES:
        cycle += 1
        id_all = list(dict.fromkeys(txt_id(path1)))  # drop duplicates, keep order
        # Re-read the finished IDs every cycle so earlier downloads are not repeated.
        done_before = set(txt_id(path3))

        for i in id_all:
            if i in done_before:
                continue
            con = html_id(url + i + '.html')
            if con:  # page fetched and parsed successfully
                download(i, con, path, path3)

        done = set(txt_id(path3))
        # Updating id_not here is what lets the while-condition eventually become false.
        id_not = [i for i in id_all if i not in done]
        remain = len(id_not)
        print(f'第{cycle}次循环,还剩下{remain}个ID未下载!')
        with open(path2, 'w') as f:
            f.write('\n'.join(id_not))

        stalled = stalled + 1 if len(done) == len(done_before) else 0
        time.sleep(2)

    if len(id_not) > 5:
        print('连续多次循环没有新的下载,提前结束')
    print('结束')
    end_time = time.time()
    times = end_time - start_time
    print(f'下载总耗时{times // 60:.0f}分{times % 60:.2f}秒!')


if __name__ == '__main__':
    main()