# Fetch the raw source of each web page with requests,
# then extract the useful information from it with re.
import re

import requests
# HTTP request headers sent with every request: a single User-Agent entry
# carrying an Android / Chrome mobile browser identification string.
# NOTE(review): real Chrome User-Agent strings read "(KHTML, like Gecko)";
# this one says "(HTML, ..." and ends with a trailing space. Both are kept
# byte-for-byte because the value is sent on the wire -- confirm whether the
# spelling is intentional before changing it.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) "
        "AppleWebKit/537.36 (HTML, like Gecko) "
        "Chrome/90.0.4430.93 Mobile Safari/537.36 "
    ),
}
# Pre-compile every regular expression once, before any page is processed.
# All patterns use re.S so that "." also matches newlines.

# One <li> entry of a listing page. Named groups:
#   content_url -- the href of the article link
#   title       -- the link's title attribute
#   time        -- the text inside the trailing <span>
sub = re.compile(
    r'<li id=".*?"><i class=".*?"></i>'
    r'<a href="(?P<content_url>.*?)" target=".*?" title="(?P<title>.*?)">.*?</a>'
    r'<span>(?P<time>.*?)</span></li>',
    re.S,
)

# Article body captured from the opening of the "vsb_content_501" container
# up to the "div_vote_id" element. This pattern used to be bound to
# ``detail_1`` and was then immediately overwritten by the assignment below,
# so it never took effect; it now has its own name so the alternative is not
# silently lost.
detail_full = re.compile(
    r'<div id="vsb_content_501">(?P<detail>.*?)<div id="div_vote_id">',
    re.S,
)

# The pattern actually in effect under the name ``detail_1``: the body is
# captured from just after an <img border="0" ...></p> paragraph up to the
# "div_vote_id" element. Because ".*" is greedy, the capture starts after the
# LAST '"></p>' that is still followed by the vote div. A page without such
# an <img> tag yields no match at all.
detail_1 = re.compile(
    r'<img border="0".*"></p>(?P<detail>.*?)<div id="div_vote_id">',
    re.S,
)
# File that collects every scraped article (always opened in append mode).
OUTPUT_PATH = "科院新闻.txt"

# Highest listing-page index; pages are requested from this index down to 1.
LAST_PAGE = 389

# Single-character clean-up table: deletes a few markup symbols and every
# lowercase ASCII letter in one pass. Deleting single characters is
# order-independent, so this equals removing them one at a time.
_DROPPED_CHARS = str.maketrans("", "", "&<>/abcdefghijklmnopqrstuvwxyz")

# Multi-character leftovers, removed one after another in exactly this order.
# '=":' is listed twice; the second pass is not redundant, because removing
# ';-:' can bring '=' and '":' together and create a new occurrence.
_DROPPED_FRAGMENTS = ('=":', ';-:', '=":', ': "', '="-', '="_"', '\r\n')


def _clean_detail(raw):
    """Return *raw* (an HTML fragment) with markup residue stripped out.

    The clean-up is deliberately crude: it first removes the literal
    '(0, 0, 0)', then deletes the characters ``& < > /`` and all lowercase
    ASCII letters, and finally removes a fixed list of leftover fragments
    and CRLF line breaks. Deleting the lowercase letters removes tag and
    attribute names, but it also removes any lowercase Latin text that was
    part of the article itself.
    """
    text = raw.replace('(0, 0, 0)', '').translate(_DROPPED_CHARS)
    for fragment in _DROPPED_FRAGMENTS:
        text = text.replace(fragment, '')
    return text


for i in range(LAST_PAGE, 0, -1):
    # The listing index counts down while the page label that is written to
    # the file and the console counts up from 1.
    page_no = LAST_PAGE + 1 - i

    # Write a separator so each listing page's articles can be told apart in
    # the output file. The with-block closes the file even if a write fails.
    with open(OUTPUT_PATH, "a", encoding='utf-8') as news:
        news.write("*" * 10 + "第" + str(page_no) + "页" + "*" * 10 + "\n" * 2)

    # Report progress on the console.
    print("第" + str(page_no) + "页新闻下载中")

    # Download the source of this listing page.
    url = "https://news.hist.edu.cn/kyyw/" + str(i) + ".htm"
    resp = requests.get(url, headers=headers)
    listing_html = resp.content.decode()

    # Visit every article linked from the listing page.
    for item in sub.finditer(listing_html):
        title = item.group("title")
        published = item.group("time")

        # The captured href is appended to the site root unchanged.
        article_url = "https://news.hist.edu.cn/" + item.group("content_url")

        # Download the source of the article page.
        article = requests.get(article_url, headers=headers)
        article_html = article.content.decode()

        # Append mode ("a") adds to the end of the file. Buffered data is
        # only guaranteed to reach the disk once the file is closed; the
        # with-block does that per article, even when an exception is raised
        # while the article is being processed.
        with open(OUTPUT_PATH, "a", encoding='utf-8') as news:
            for match in detail_1.finditer(article_html):
                body = _clean_detail(match.group("detail"))
                # Record layout: title in 《》, body, time, three blank lines.
                news.write(
                    "《" + title + "》" + "\n"
                    + body + "\n"
                    + published + "\n" * 3
                )
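# I typed this up myself after watching a tutorial video, so a lot of it may
# be clumsy -- suggestions and criticism in the comments are welcome.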