"""Scrape paper metadata from the ECVA paper listing page."""
import json
import re
import time
import traceback
from urllib.parse import urljoin

import lxml
import requests
from bs4 import BeautifulSoup
from lxml import etree

# Site root; relative hrefs found on the listing page are resolved against it.
BASE_URL = 'https://www.ecva.net/'
# Listing page that is fetched and parsed.
PAPERS_URL = 'https://www.ecva.net/papers.php'
REQUEST_HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    ),
}
# Seconds to wait for the server; without a timeout requests may block forever.
REQUEST_TIMEOUT = 30

# Example paper pages noted by the original author:
#   https://www.ecva.net/papers/eccv_2020/papers_ECCV/html/267_ECCV_2020_paper.php
#   https://www.ecva.net/papers/eccv_2020/papers_ECCV/html/283_ECCV_2020_paper.php
#   https://www.ecva.net/papers/eccv_2020/papers_ECCV/html/343_ECCV_2020_paper.php


def _first_href(tag):
    """Return the ``href`` of the first ``<a>`` inside *tag*, or ``None``."""
    if tag is None:
        return None
    anchor = tag.find('a')
    if anchor is None:
        return None
    return anchor.get('href')


def _parse_titles(soup):
    """Return one ``[title, source_link]`` record per ``<dt class="ptitle">``."""
    records = []
    for dt in soup.find_all('dt', class_='ptitle'):
        anchor = dt.find('a')
        # Keep a record even when the anchor is missing so that records stay
        # positionally aligned with the <dd> entries parsed separately.
        title = anchor.text if anchor is not None else dt.get_text()
        href = anchor.get('href') if anchor is not None else None
        source_link = urljoin(BASE_URL, href) if href else None
        records.append([title, source_link])
    return records


def _parse_details(soup):
    """Return ``(authors, downloads, pdfinfos)`` parsed from the ``<dd>`` tags.

    The ``<dd>`` elements under ``div#content`` are consumed in alternation:
    even positions are treated as author entries, odd positions as link
    entries. Within a link entry the ``div.link2`` elements alternate between
    the download link and the pdf-info link. Missing pieces become ``None``.
    """
    content = soup.find('div', id='content')
    dd_tags = content.find_all('dd') if content is not None else []

    authors = []
    downloads = []
    pdfinfos = []
    for position, dd in enumerate(dd_tags):
        if position % 2 == 0:
            # Store the author text rather than the raw Tag object.
            authors.append(dd.get_text(strip=True))
            continue

        # Reset per entry so one entry's links can never leak into the next.
        download_div = None
        pdfinfo_div = None
        for slot, div in enumerate(dd.find_all('div', class_='link2')):
            if slot % 2 == 0:
                download_div = div
            else:
                pdfinfo_div = div

        download_href = _first_href(download_div)
        downloads.append(
            urljoin(BASE_URL, download_href) if download_href else None
        )
        # The pdf-info href is stored exactly as it appears in the page.
        pdfinfos.append(_first_href(pdfinfo_div))
    return authors, downloads, pdfinfos


def get_paper():
    """Fetch the paper listing and collect metadata for every paper.

    Prints a few progress counters and the first record, as the original
    script did.

    Returns:
        A list with one record per paper, each of the form
        ``[title, source_link, author, download_link, pdfinfo_link]``.
        Fields that could not be found in the page are ``None``.

    Raises:
        requests.RequestException: on network failure, timeout, or a
            4xx/5xx response (via ``raise_for_status``).
    """
    # ``headers`` must be passed by keyword: the second positional parameter
    # of ``requests.get`` is ``params`` (query string), not the HTTP headers.
    response = requests.get(
        PAPERS_URL, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')

    # Final result set: starts as [title, source_link] per paper.
    res = _parse_titles(soup)
    # Authors and the download / pdf-info links.
    authors, downloads, pdfinfos = _parse_details(soup)

    # One download/pdfinfo pair is produced per link entry, so the number of
    # link entries equals len(downloads).
    print(len(downloads))
    print(len(downloads))
    print(len(pdfinfos))
    print("------------------------------")

    # Attach each paper's own author and links (not the first paper's).
    # Padding with None keeps the merge safe if the page yields fewer
    # detail entries than titles.
    padding = [None] * len(res)
    merged = zip(
        res,
        authors + padding,
        downloads + padding,
        pdfinfos + padding,
    )
    for record, author, download, pdfinfo in merged:
        record.extend((author, download, pdfinfo))

    # Show the first record of the final result set, if there is one.
    if res:
        print(res[0])
    return res


if __name__ == '__main__':
    get_paper()