报错的原始方法:
1)使用request.Request,出现上述错误。html无法爬取
from urllib import request


def get_html(self, url):
    """Fetch ``url`` with urllib, write the decoded page to disk, then parse it.

    This is the original version that fails; the defect is left in place
    and flagged below.
    """
    print(url)
    ua_headers = {'User-Agent': random.choice(ua_list)}
    req = request.Request(url=url, headers=ua_headers)
    res = request.urlopen(req)
    # NOTE(review): ``req`` is the Request object, not the response returned
    # by urlopen(); Request has no read() method, so this line raises
    # AttributeError. The read() call belongs on ``res``.
    html = req.read().decode("gbk", 'ignore')
    # ``filename`` is not defined in this snippet -- presumably set elsewhere.
    with open(filename, 'w') as f:
        f.write(html)
    self.parse_html(html)
解决方法:
1)将 urllib.request 换成 requests 库。requests 是第三方库,需要先安装(pip install requests)。
2)具体原因,我也不清楚。不过原始代码中有一处明显的问题:它对 Request 对象 req 调用了 read(),而 read() 应该在 urlopen 返回的响应对象 res 上调用,这可能是报错的原因之一。
import requests


def get_html(self, url, timeout=10):
    """Download ``url`` with requests and hand the page text to the parser.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, requests can block indefinitely on a stalled server.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    print(url)
    # Pick a random User-Agent per request; ``random`` and ``ua_list`` are
    # expected to be provided elsewhere in the file.
    headers = {'User-Agent': random.choice(ua_list)}
    resp = requests.get(url=url, headers=headers, timeout=timeout)
    # Force UTF-8 when decoding the body instead of relying on the
    # encoding requests guesses from the response headers.
    resp.encoding = 'utf-8'
    # Pass the decoded text straight to the parser.
    self.parse_html(resp.text)