熟悉Ajax的使用
以今日头条为例来尝试通过分析Ajax请求来抓取网页数据的方法。这次要抓取的目标是今日头条的街拍美图,抓取完成之后,将每组图片分文件夹下载到本地并保存下来。
(1)打开今日头条首页https://www.toutiao.com/,搜索框里输入“街拍”
(2)得到如下搜索结果
(3)转到图片,这时打开开发者工具,选择网络——>XHR,查看URL的构成,
https://so.toutiao.com/search?keyword=%E8%A1%97%E6%8B%8D&pd=atlas&source=search_subtab_switch&dvpf=pc&aid=4916&page_num=1&rawJSON=1&search_id=202204271935490101501350305E56B428
通过分析,我们发现,这里变化的只有page_num.
我们从这里也能找到user-agent,cookies等信息,可以构成请求头。
(5)打开预览选项卡查看,我们要爬取的图片信息就在该json文件的rawData中,而其中的img_url就是我们要爬取的图片链接。
(1)导入需要的包
import requests,os #os用于创建文件
from urllib.parse import urlencode #解决编码问题
import urllib.parse
from hashlib import md5 #检测是否有重复文件
(2)设置请求头
headers={
'host':'so.toutiao.com',
'Referer':'https://so.toutiao.com/search?keyword=%E8%A1 \
%97%E6%8B%8D&pd=atlas&source=search_subtab_switch&dvpf=pc&aid=4916&page_num=0',
'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Ne \
xus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like \
Gecko) Chrome/100.0.4896.127 Mobile Safari/537.36 Edg/100.0.1185.50',
'X-Requested-With':'XMLHttpRequest', # 利用Ajax爬取需要手动设置这一参数
'Cookie':'msToken=-tbIPWGnxn9IkPce9TkVKQOOCbS996FTCPCv4ZyEEMq31aG6VVx9v7 \
uhqUjdE9VmR_7OJSKeN8M-Mk4bLG4BPdr8T-WNwYnzJoM7A8ucM0Ko; tt_webid=7089706 \
220605261320; _S_IPAD=0; MONITOR_WEB_ID=7089706220605261320; PIXIEL_RATI \
O=2.0000000298023224; FRM=new; ttcid=4c3ed44c1a63414dba951d23262dc98040 \
; WIN_WH=320_658; tt_scid=-HSOw6HU9KYC0P6N6Hty6tR6HH6XDaCuaXSnNvT29cl65Q \
iFXXhHZdUDwp1aKBH77ccb; ttwid=1%7CUvC_j34tZR9J0iHJqed1wxypB7iXOAO3MBUcKQ \
1guHs%7C1651059027%7Cabb8d559145a8f3e00a3b83a5e8e2150db22aa339e83af34baa \
ef712abb04117; _S_WIN_WH=1536_746; _S_DPR=1.25'
}
(3)获取网页
def get_page(page_num):
    """Fetch one page of the "街拍" image search and return it as JSON.

    Args:
        page_num: zero-based index of the result page to request.

    Returns:
        The decoded JSON payload on HTTP 200, otherwise None.
    """
    params = {  # the URL is long, so build the query string from a dict
        # keyword is stored percent-encoded; unquote here, urlencode below
        # re-encodes it safely into the final URL.
        'keyword': urllib.parse.unquote('%E8%A1%97%E6%8B%8D'),
        'pd': 'atlas',
        'source': 'search_subtab_switch',
        'dvpf': 'pc',
        'aid': 4916,
        'page_num': page_num,
        'rawJSON': 1,
        # NOTE(review): search_id is session-bound and may expire; refresh
        # it from a live browser session if requests start failing.
        'search_id': '202204271935490101501350305E56B428',
    }
    base_url = 'https://so.toutiao.com/search?'
    url = base_url + urlencode(params)  # assemble the complete URL
    try:
        # A timeout keeps the crawler from hanging forever on a dead link.
        resp = requests.get(url, headers=headers, timeout=10)
        if resp.status_code == 200:  # on success, return the parsed JSON
            return resp.json()
    except requests.ConnectionError as e:  # on failure, report the error
        print('error:', e)
    return None  # non-200 response or connection error
(4)获取图片信息
def get_images(json):
    """Yield one record per image in a search-result payload.

    Args:
        json: parsed response from get_page(), or None on failure.

    Yields:
        dicts with 'image' (the raw item), 'title' (its caption text,
        previously discarded) and 'link' (the downloadable img_url).
    """
    if not json:
        return  # get_page() failed — nothing to yield
    # Tolerate a missing/empty rawData section instead of raising.
    images = (json.get('rawData') or {}).get('data') or []
    for image in images:
        yield {
            'image': image,
            'title': image.get('text'),
            'link': image.get('img_url'),
        }
(5)处理文件目录
def Mulu(image):
    """Derive a directory name from an image record's 'link' value.

    Keeps only letters and digits from the URL, producing a
    filesystem-safe folder name unique to that link.
    """
    link = image.get('link')
    # Keep the exact isdigit()/isalpha() predicate (isalnum() would also
    # admit characters like numeric fractions and change the output).
    return ''.join(ch for ch in link if ch.isdigit() or ch.isalpha())
(6)保存图片
def save_image(im, item):
    """Download item['link'] and store it inside directory `im`.

    The file is named after the MD5 of its bytes, so downloading the
    same image twice overwrites one identical file instead of
    duplicating it.
    """
    # makedirs(exist_ok=True) is race-free, unlike exists()+mkdir().
    os.makedirs(im, exist_ok=True)
    try:
        resp = requests.get(item.get('link'), timeout=10)
        if resp.status_code != 200:
            # Don't write error pages to disk as fake .jpg files.
            print('Failed to Save Image')
            return
        content = resp.content
        file_path = '{0}/{1}.{2}'.format(im, md5(content).hexdigest(), 'jpg')
        with open(file_path, 'wb') as f:
            f.write(content)
    except requests.ConnectionError:
        print('Failed to Save Image')
(7)主函数
def main(page):
    """Crawl one search-result page and save every image it lists."""
    data = get_page(page)  # avoid shadowing the conventional name `json`
    if data is None:
        # get_page() failed; without this guard get_images() would
        # raise AttributeError on None.
        print('Failed to fetch page', page)
        return
    for item in get_images(data):
        folder = Mulu(item)
        save_image(folder, item)
    print('图片保存完毕')
if __name__ == "__main__":
    # Crawl the first two result pages (page_num 0 and 1).
    for page_num in range(2):
        main(page_num)
(8)完整代码
import requests,os #os用于创建文件
from urllib.parse import urlencode #解决编码问题
import urllib.parse
from hashlib import md5 #检测是否有重复文件
# Request headers: identify the client and mark the request as Ajax.
# The long values are split with implicit string-literal concatenation so
# no stray spaces leak into the header values (a backslash line
# continuation inside a quoted string keeps the space before it).
# NOTE(review): Cookie and msToken are session-bound and will expire;
# capture fresh values from the browser's developer tools when reusing.
headers = {
    'host': 'so.toutiao.com',
    # keyword=%E8%A1%97%E6%8B%8D is "街拍" percent-encoded.
    'Referer': ('https://so.toutiao.com/search?keyword=%E8%A1%97%E6%8B%8D'
                '&pd=atlas&source=search_subtab_switch&dvpf=pc&aid=4916'
                '&page_num=0'),
    'User-Agent': ('Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/100.0.4896.127 Mobile Safari/537.36 '
                   'Edg/100.0.1185.50'),
    # Marks the request as Ajax so the server answers with JSON.
    'X-Requested-With': 'XMLHttpRequest',
    'Cookie': ('msToken=-tbIPWGnxn9IkPce9TkVKQOOCbS996FTCPCv4ZyEEMq31aG6VVx9v7'
               'uhqUjdE9VmR_7OJSKeN8M-Mk4bLG4BPdr8T-WNwYnzJoM7A8ucM0Ko; '
               'tt_webid=7089706220605261320; _S_IPAD=0; '
               'MONITOR_WEB_ID=7089706220605261320; '
               'PIXIEL_RATIO=2.0000000298023224; FRM=new; '
               'ttcid=4c3ed44c1a63414dba951d23262dc98040; WIN_WH=320_658; '
               'tt_scid=-HSOw6HU9KYC0P6N6Hty6tR6HH6XDaCuaXSnNvT29cl65Q'
               'iFXXhHZdUDwp1aKBH77ccb; '
               'ttwid=1%7CUvC_j34tZR9J0iHJqed1wxypB7iXOAO3MBUcKQ'
               '1guHs%7C1651059027%7Cabb8d559145a8f3e00a3b83a5e8e2150db22aa339e'
               '83af34baaef712abb04117; _S_WIN_WH=1536_746; _S_DPR=1.25'),
}
def get_page(page_num):
    """Fetch one page of the "街拍" image search and return it as JSON.

    Args:
        page_num: zero-based index of the result page to request.

    Returns:
        The decoded JSON payload on HTTP 200, otherwise None.
    """
    params = {  # the URL is long, so build the query string from a dict
        # keyword is stored percent-encoded; unquote here, urlencode below
        # re-encodes it safely into the final URL.
        'keyword': urllib.parse.unquote('%E8%A1%97%E6%8B%8D'),
        'pd': 'atlas',
        'source': 'search_subtab_switch',
        'dvpf': 'pc',
        'aid': 4916,
        'page_num': page_num,
        'rawJSON': 1,
        # NOTE(review): search_id is session-bound and may expire; refresh
        # it from a live browser session if requests start failing.
        'search_id': '202204271935490101501350305E56B428',
    }
    base_url = 'https://so.toutiao.com/search?'
    url = base_url + urlencode(params)  # assemble the complete URL
    try:
        # A timeout keeps the crawler from hanging forever on a dead link.
        resp = requests.get(url, headers=headers, timeout=10)
        if resp.status_code == 200:  # on success, return the parsed JSON
            return resp.json()
    except requests.ConnectionError as e:  # on failure, report the error
        print('error:', e)
    return None  # non-200 response or connection error
def get_images(json):
    """Yield one record per image in a search-result payload.

    Args:
        json: parsed response from get_page(), or None on failure.

    Yields:
        dicts with 'image' (the raw item), 'title' (its caption text,
        previously discarded) and 'link' (the downloadable img_url).
    """
    if not json:
        return  # get_page() failed — nothing to yield
    # Tolerate a missing/empty rawData section instead of raising.
    images = (json.get('rawData') or {}).get('data') or []
    for image in images:
        yield {
            'image': image,
            'title': image.get('text'),
            'link': image.get('img_url'),
        }
def Mulu(image):
    """Derive a directory name from an image record's 'link' value.

    Keeps only letters and digits from the URL, producing a
    filesystem-safe folder name unique to that link.
    """
    link = image.get('link')
    # Keep the exact isdigit()/isalpha() predicate (isalnum() would also
    # admit characters like numeric fractions and change the output).
    return ''.join(ch for ch in link if ch.isdigit() or ch.isalpha())
def save_image(im, item):
    """Download item['link'] and store it inside directory `im`.

    The file is named after the MD5 of its bytes, so downloading the
    same image twice overwrites one identical file instead of
    duplicating it.
    """
    # makedirs(exist_ok=True) is race-free, unlike exists()+mkdir().
    os.makedirs(im, exist_ok=True)
    try:
        resp = requests.get(item.get('link'), timeout=10)
        if resp.status_code != 200:
            # Don't write error pages to disk as fake .jpg files.
            print('Failed to Save Image')
            return
        content = resp.content
        file_path = '{0}/{1}.{2}'.format(im, md5(content).hexdigest(), 'jpg')
        with open(file_path, 'wb') as f:
            f.write(content)
    except requests.ConnectionError:
        print('Failed to Save Image')
def main(page):
    """Crawl one search-result page and save every image it lists."""
    data = get_page(page)  # avoid shadowing the conventional name `json`
    if data is None:
        # get_page() failed; without this guard get_images() would
        # raise AttributeError on None.
        print('Failed to fetch page', page)
        return
    for item in get_images(data):
        folder = Mulu(item)
        save_image(folder, item)
    print('图片保存完毕')
if __name__ == "__main__":
    # Crawl the first two result pages (page_num 0 and 1).
    for page_num in range(2):
        main(page_num)