res.text is used for text content (e.g. HTML or plain text).
res.content is used for binary content such as images, video, and audio.
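A minimal sketch of the difference, using a placeholder URL: `.content` is the raw bytes of the response body, while `.text` is those bytes decoded to a string with the encoding requests infers.

```python
import requests

res = requests.get('https://www.example.com/')  # placeholder URL for illustration
print(type(res.content))  # <class 'bytes'> — the raw response body
print(type(res.text))     # <class 'str'>  — the body decoded to text
print(res.encoding)       # the encoding requests inferred from the response headers
```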
```python
import requests

# Send an HTTP GET request
res = requests.get('https://apiv3.shanbay.com/codetime/articles/mnvdu')
# Check the response status
print('HTTP status code: %s' % res.status_code)

print('Fetching the article...')
# Write the string form of the response body to a file
with open('鲁迅文章.txt', 'w', encoding='utf-8') as file:
    file.write(res.text)
```
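One possible refinement (not in the original code): verify that the request succeeded before writing the file. requests provides `raise_for_status()`, which raises an `HTTPError` for 4xx/5xx responses. A sketch using the same URL:

```python
import requests

res = requests.get('https://apiv3.shanbay.com/codetime/articles/mnvdu')
res.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses
# Only reached when the request succeeded
with open('鲁迅文章.txt', 'w', encoding='utf-8') as file:
    file.write(res.text)
```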
```python
import requests

# Send an HTTP request to download an image
res = requests.get('https://www.www.zyiz.net/i/ll/?i=20210424184053989.PNG')
# Open a file named datawhale.png in binary write mode
with open('datawhale.png', 'wb') as ff:
    # Write the binary form of the response body to the file
    ff.write(res.content)
```
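For large binary files such as video or audio, reading the whole body into memory via `res.content` can be wasteful. A sketch of a chunked download using requests' `stream=True` and `iter_content`, applied to the same image URL:

```python
import requests

# stream=True defers downloading the body until it is iterated
res = requests.get('https://www.www.zyiz.net/i/ll/?i=20210424184053989.PNG', stream=True)
with open('datawhale.png', 'wb') as ff:
    # Write the body in 8 KB chunks instead of holding it all in memory
    for chunk in res.iter_content(chunk_size=8192):
        ff.write(chunk)
```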
Crawling the university website's first-level, second-level, and third-level pages
```python
import json
import requests
from bs4 import BeautifulSoup

# Collect the links found on one page
def getURLs(url):
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    url_list = []
    for link in soup.find_all('a'):
        url_list.append(link.get("href"))
    # Complete relative paths
    url_str = list(map(str, url_list))
    for i in range(len(url_str)):
        if len(url_str[i]) == 0:
            continue
        if url_str[i][0] == "/":
            url_str[i] = url + url_str[i][1:]
    # Keep only FST pages ending with "/" and remove duplicates
    url_fst = list(set(filter(lambda x: "https://www.fst" in x, url_str)))
    url_fst = list(set(filter(lambda x: x[-1] == "/", url_fst)))
    return url_fst

# Save one URL per line
def save(data):
    conv = json.dumps(data)
    with open(r"C:\Users\Sandra\Desktop\url\url.txt", "a", encoding='UTF-8') as f:
        f.write(conv + "\n")

# Homepage
print("Homepage")
result1 = getURLs('https://www.fst.um.edu.mo/')

# Subpages
print("Subpages")
result2 = []
for i in result1:
    if i != "https://www.fst.um.edu.mo/":
        result2 = result2 + getURLs(i)

# Keep only 2nd-layer links not already found on the 1st layer
uniq_result2 = list(set(result2) - set(result1))

# Sub-subpages
print("Subsubpages")
result3 = []
for j in uniq_result2:
    result3 = result3 + getURLs(j)

fst_Urls = list(set(result1 + uniq_result2 + result3))
for item in fst_Urls:
    save(item)
print(len(fst_Urls))
```
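The relative-path completion above only handles hrefs that start with "/". The standard library's `urllib.parse.urljoin` also resolves forms like "page.html" or "../page/", so a more robust link collector could look like the sketch below (same filtering idea, offered as an alternative rather than the original code):

```python
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def get_urls(url):
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    found = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href:
            # urljoin resolves '/a', 'a', '../a' and leaves absolute URLs intact
            found.append(urljoin(url, href))
    # Keep only FST pages ending with '/', de-duplicated
    return list({u for u in found if 'https://www.fst' in u and u.endswith('/')})
```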