import urllib.request
import urllib.parse
import re
import os
# Request headers: the Referer is required by the image search site, and the
# User-Agent makes the request look like it comes from a regular browser.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "referer": "http://image.baidu.com",
}

# Search URL. The keyword is hard-coded, already percent-encoded, in the
# `word` and `oq` parameters. To search for something else, quote the keyword
# with urllib.parse.quote(keyword, "utf-8") and substitute it there.
url = "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E4%B9%94%E5%B8%83%E6%96%AF&oq=%E4%B9%94%E5%B8%83%E6%96%AF&rsp=-1"

PAGE_SIZE = 30          # step between result offsets
MAX_RESULTS = 3000      # upper bound on the result offset that is requested
REQUEST_TIMEOUT = 10    # seconds; keeps a stalled connection from hanging the run
IMAGE_DIR = "D://text_pic"
URL_LOG = "testpic.txt"

# Compiled once at import time instead of once per page.
THUMB_URL_PATTERN = re.compile(r'thumbURL":"(.*?\.jpg)')


def build_page_url(base_url, offset):
    """Return *base_url* with its ``pn`` query parameter set to *offset*.

    Any existing ``pn`` parameter is replaced; all other parameters
    (including blank ones) are kept in their original order.
    """
    # NOTE(review): `pn` is assumed to be the result-offset parameter of the
    # search endpoint -- confirm against the live site. If the server ignores
    # it, main() stops as soon as a page yields no new image URLs.
    parts = urllib.parse.urlsplit(base_url)
    query = [
        (key, value)
        for key, value in urllib.parse.parse_qsl(parts.query, keep_blank_values=True)
        if key != "pn"
    ]
    query.append(("pn", str(offset)))
    return urllib.parse.urlunsplit(parts._replace(query=urllib.parse.urlencode(query)))


def extract_thumb_urls(html):
    """Return every ``thumbURL`` value ending in ``.jpg`` found in *html*, in order."""
    return THUMB_URL_PATTERN.findall(html)


def fetch_bytes(target_url, headers):
    """Download *target_url* with *headers* and return the raw response body.

    Raises:
        OSError: on network or HTTP errors (URLError is a subclass of OSError).
        ValueError: if *target_url* is not a valid URL.
    """
    request = urllib.request.Request(target_url, headers=headers)
    with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT) as response:
        return response.read()


def main():
    """Crawl the search result pages and save each thumbnail exactly once.

    Every new image URL is printed and appended to ``testpic.txt``; the image
    itself is stored as ``pic<N>.jpg`` under ``D://text_pic``. A failing page
    or image is reported and skipped instead of aborting the whole run.
    """
    os.makedirs(IMAGE_DIR, exist_ok=True)
    seen = set()    # image URLs already handled, so nothing is saved twice
    saved = 0

    with open(URL_LOG, "a", encoding="utf-8") as log:
        for offset in range(0, MAX_RESULTS, PAGE_SIZE):
            try:
                html = fetch_bytes(build_page_url(url, offset), header).decode("utf-8")
            except (OSError, ValueError):
                # Covers network failures and undecodable pages
                # (UnicodeDecodeError is a ValueError).
                print("出错了")
                continue

            new_on_page = 0
            for image_url in extract_thumb_urls(html):
                if image_url in seen:
                    continue
                seen.add(image_url)
                new_on_page += 1
                print(image_url)
                log.write(image_url + "\n")
                try:
                    # Send the same headers as for the search page; read the
                    # whole body first so a failed download leaves no
                    # truncated file behind.
                    data = fetch_bytes(image_url, header)
                except (OSError, ValueError):
                    print("出错了")
                    continue
                with open("D://text_pic/pic{num}.jpg".format(num=saved), "wb") as out:
                    out.write(data)
                saved += 1

            if new_on_page == 0:
                # Nothing new on this page: the results are exhausted (or the
                # offset is being ignored), so further requests are pointless.
                break

    print("总共爬取的图片数为:" + str(saved))


if __name__ == "__main__":
    main()
`import urllib.request` `import urllib.parse` `import re` `import os` \#添加header,referer是必须的User-Agent用来伪装浏览器 `header=\` `{` `'User-Agert':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',` `"referer":"http://image.baidu.com"` `}` \#keword=input("请输入搜索关键字") `url="https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E4%B9%94%E5%B8%83%E6%96%AF&oq=%E4%B9%94%E5%B8%83%E6%96%AF&rsp=-1"##.format(word=keword) [http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={word}](http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%7bword%7d)` \#转码 `\#keword=urllib.parse.quote(keword,"utf-8")` `n=0` `j=0` `while n<3000:` `error=0` `n+=30` #获取请求 `rep=urllib.request.Request(url,headers=header)` #打开网页 `rep=urllib.request.urlopen(rep)` #获取网页内容 `try:` `html=rep.read().decode("utf-8")` `#print(html)` `except:` `print("出错了")` `error=1` #正则匹配 `p=re.compile(r"thumbURL.*?\.jpg")` #获取正则匹配到的结果,返回list `s=p.findall(html)` `if os.path.isdir("D://text_pic") !=True:` `os.makedirs("D://text_pic")` `with open("testpic.txt","a") as f:` #获取图片 `for i in s:` `i=i.replace('thumbURL":"',"")` `print(i)` `f.write(i)` `f.write("\n")` #保存图片 `urllib.request.urlretrieve(i,"D://text_pic/pic{num}.jpg".format(num=j)` `j+=1` `f.close()` `print("总共爬取的图片数为:"+str(j))`