# -*- coding: utf-8 -*-
# Crawl article links from http://weixin.sogou.com/
import re
import urllib.request
import urllib.error
import time  # sleep() is used to back off after errors


# Fetch a URL through the given proxy server
def use_proxy(proxy_addr, url):
    # Exception handling around the whole request
    try:
        req = urllib.request.Request(url)
        # Pretend to be a regular browser
        req.add_header("User-Agent",
                       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                       "(KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 "
                       "SE 2.X MetaSr 1.0")
        # Route the request through the proxy server
        proxy = urllib.request.ProxyHandler({'http': proxy_addr})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)
        data = urllib.request.urlopen(req).read()
        return data
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # On a URLError, wait 10 s before continuing
        time.sleep(10)
    except Exception as e:
        print("exception:" + str(e))
        # On any other exception, wait 1 s before continuing
        time.sleep(1)


# Search keyword
key = "Python"
# Proxy server; it may stop working, in which case replace it with a valid one
proxy = "127.0.0.1:8888"
# URL-encode the keyword once, outside the loop, to avoid double encoding
key = urllib.request.quote(key)
# Crawl the first 10 result pages
for i in range(0, 10):
    thisPageUrl = "http://weixin.sogou.com/weixin?type=2&query=" + key + "&page=" + str(i)
    thisPageData = use_proxy(proxy, thisPageUrl)
    print(len(str(thisPageData)))
    # Extract the href of every link on the result page
    pat1 = '<a href="(.*?)"'
    rs1 = re.compile(pat1, re.S).findall(str(thisPageData))
    if len(rs1) == 0:
        print("Page " + str(i) + " failed: no links found")
        continue
    for j in range(0, len(rs1)):
        thisUrl = rs1[j]
        # Undo the HTML entity encoding of "&" in the extracted URL
        thisUrl = thisUrl.replace("amp;", "")
        file = "F:/爬虫信息/result/第" + str(i) + "页第" + str(j) + "篇文章.html"
        thisData = use_proxy(proxy, thisUrl)
        if thisData is None:
            print("Page " + str(i) + ", article " + str(j) + " failed: no data returned")
            continue
        try:
            with open(file, "wb") as fh:
                fh.write(thisData)
            print("Page " + str(i) + ", article " + str(j) + " saved!")
        except Exception as e:
            print(e)
            print("Page " + str(i) + ", article " + str(j) + " failed!")
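Before running the full crawl, it can help to confirm the proxy actually works. Below is a minimal sketch using the use_proxy function defined above; http://httpbin.org/ip is only an example endpoint, and 127.0.0.1:8888 assumes a proxy is really listening locally, so substitute a working proxy address as needed.

# Quick proxy sanity check (a minimal sketch, not part of the original script)
test_data = use_proxy("127.0.0.1:8888", "http://httpbin.org/ip")
if test_data is None:
    print("Proxy test failed; update the proxy address before crawling.")
else:
    print(test_data.decode("utf-8", errors="replace"))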
The problems are as follows: