可以使用 pip 安装,或者直接在 PyCharm 中安装所需的第三方库。
from urllib.request import urlopen from bs4 import BeautifulSoup import datetime import random import pymysql import re
# Connect to the local MySQL server (replace passwd with your own password).
conn = pymysql.connect(
    host='127.0.0.1',
    user='root',
    passwd='自己的密码',
    db='mysql',
    charset='utf8',
)
cur = conn.cursor()

# One-time table creation, kept here for reference:
# sql = '''CREATE TABLE pages (
#     id BIGINT(7) NOT NULL AUTO_INCREMENT,
#     title VARCHAR(200),
#     content VARCHAR(10000),
#     createed TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
#     PRIMARY KEY(id))'''
# cur.execute(sql)

cur.execute("USE mysql")
def store(title, content):
    """Insert one scraped article into the `pages` table and commit.

    Args:
        title: article headline.
        content: full article body as a single string.
    """
    # Placeholders must be bare %s: pymysql quotes and escapes the bound
    # values itself, so wrapping them in "" stored double-quoted text.
    cur.execute(
        'INSERT INTO pages (title, content) VALUES (%s, %s)',
        (title, content),
    )
    cur.connection.commit()


def getContent(articleUrl):
    """Fetch a Sina news article and persist its title and body.

    Args:
        articleUrl: URL of the article page to scrape.
    """
    html = urlopen(articleUrl)
    try:
        bs = BeautifulSoup(html, 'html.parser')
    finally:
        html.close()  # urlopen responses must be closed explicitly
    title = bs.find('h1').get_text()
    # Join the paragraphs into one string; the original passed a Python
    # list to the SQL layer, which cannot be stored in a VARCHAR column.
    paragraphs = [p.text.strip() for p in bs.select('.article p')]
    store(title, '\n'.join(paragraphs))
# Scrape a single sample article and store it in MySQL.
ARTICLE_URL = 'https://news.sina.com.cn/c/2022-07-30/doc-imizirav6039897.shtml'
getContent(ARTICLE_URL)
以壁纸网站https://wallhaven.cc为例
原理:因为该壁纸网站是分页的,所以先获取每一页上所有壁纸的 URL,再统一进行下载。
通过class为:preview的a标签获得壁纸的网页链接,
获得壁纸的网页链接后,再通过id为:wallpaper的img标签获取到src进行下载。
代码如下:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import eventlet
import datetime
import random
import pymysql
import re
import requests
import time

# Browser-like headers so wallhaven.cc does not reject the scraper.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36 Edg/103.0.1264.77' , 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6'}


def getImageUrl(response):
    """Extract full-size wallpaper URLs from one search-results page.

    For every `a.preview` link on the page, fetch the detail page and
    read the src of the `img#wallpaper` element.

    Args:
        response: requests.Response for a wallhaven search page.

    Returns:
        List of direct image URLs found on this page.
    """
    bs = BeautifulSoup(response.text, 'html.parser')
    pagesrcs = []
    for link in bs.find_all("a", {"class": "preview"}):
        time.sleep(1.0)  # throttle: one detail-page request per second
        try:
            # requests' own timeout replaces the original eventlet hack,
            # which monkey-patched the stdlib on EVERY loop iteration and,
            # on timeout, silently left response2 unbound (NameError).
            detail = requests.get(link.get('href'), headers=headers, timeout=2)
        except requests.RequestException:
            continue  # skip detail pages that time out or fail
        detail_bs = BeautifulSoup(detail.text, 'html.parser')
        img = detail_bs.find("img", {'id': "wallpaper"})
        if img is not None:
            print(img.get("src"))
            pagesrcs.append(img.get("src"))
    return pagesrcs


def downloadImage(srcs, save_dir='D:\\Samurai+Champloo\\'):
    """Download each image URL into save_dir as 1.jpg, 2.jpg, ...

    Args:
        srcs: iterable of direct image URLs.
        save_dir: target directory; defaults to the original hard-coded
            path for backward compatibility.
    """
    i = 1
    for src in srcs:
        time.sleep(1.0)  # throttle downloads
        try:
            # 20-second timeout (the original comment claiming "2秒" was
            # wrong); on failure just move on to the next image.
            r = requests.get(src, stream=True, headers=headers, timeout=20)
        except requests.RequestException:
            continue
        if r.status_code == 200:
            # Context manager guarantees the file handle is closed.
            with open(save_dir + str(i) + '.jpg', 'wb') as f:
                f.write(r.content)
            i = i + 1
            print("下载成功")


def searchPaging(startUrl, end=2):
    """Walk the paginated search results and collect all image URLs.

    Args:
        startUrl: URL of the first search-results page.
        end: one past the last page number to fetch; defaults to 2
            (only page 1), matching the original behaviour.

    Returns:
        List of wallpaper image URLs gathered across the fetched pages.
    """
    allsrcs = []
    for page in range(1, end):
        if page == 1:
            url = startUrl  # page 1 is the bare search URL
        else:
            url = startUrl + "&page=" + str(page)
            print(url)
        response = requests.get(url, headers=headers)
        allsrcs += getImageUrl(response)
    return allsrcs


allsrcs = searchPaging('https://wallhaven.cc/search?q=Samurai+Champloo')
downloadImage(allsrcs)