4.3.2 Selenium的实践案例
获取第一条评论数据
from selenium import webdriver

# Launch Chrome through the locally installed chromedriver (Selenium 3 style API).
driver = webdriver.Chrome(executable_path=r'D:\Anaconda\Lib\site-packages\selenium\webdriver\chrome\chromedriver.exe')
driver.get("http://www.santostang.com/2018/07/04/hello-world/")

# Set the implicit wait BEFORE the first element lookup; the original set it
# after locating the iframe, so that lookup ran with no wait and could fail
# if the comment widget had not finished loading yet.
driver.implicitly_wait(20)

# The livere comment widget lives inside an iframe; switch into it so its
# elements become reachable to find_element calls.
driver.switch_to.frame(driver.find_element_by_css_selector('iframe[title="livere-comment"]'))

# Grab the first comment container and print its paragraph text.
comment = driver.find_element_by_css_selector('div.reply-content')
content = comment.find_element_by_tag_name('p')
print(content.text)
4.3.3 Selenium 获取某一页的所有评论
from selenium import webdriver

driver = webdriver.Chrome(executable_path=r'D:\Anaconda\Lib\site-packages\selenium\webdriver\chrome\chromedriver.exe')
driver.get("http://www.santostang.com/2018/07/04/hello-world/")

# Set the implicit wait BEFORE any lookup; the original set it after the
# iframe search, which therefore ran without a wait and could race the
# page load. (No scroll-to-bottom is needed for this page to render.)
driver.implicitly_wait(30)

# Comments live inside the livere iframe; switch into it first.
driver.switch_to.frame(driver.find_element_by_css_selector('iframe[title="livere-comment"]'))

# find_elements (plural) returns a list of ALL matching nodes on the
# current page, unlike find_element which returns only the first.
comments = driver.find_elements_by_css_selector('div.reply-content')
for item in comments:
    comment = item.find_element_by_tag_name('p')
    print(comment.text)
driver.find_elements_by_css_selector 返回一个列表。注意这里用的是 elements(复数形式),目的是一次性获取当前页面中所有匹配的元素;此时 comments 为 list 类型。
获取所有分页的所有评论:
代码中没有使用“下滑到页面底部”这个操作,即没有:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
但这并不影响网页内容的爬取。
原书中有这行下滑代码;但在切换进 iframe 之后,这行针对外层页面的脚本就无法正常执行了,因此需要先用 driver.switch_to.default_content() 从 iframe 切换回外层的默认文档,再执行下滑操作。
但是下面代码中没有这个下滑操作,所以同时转回iframe的代码也不需要了。
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time

# Paths kept as in the original environment (Windows + Anaconda).
CHROMEDRIVER_PATH = r'D:\Anaconda\Lib\site-packages\selenium\webdriver\chrome\chromedriver.exe'
OUTPUT_PATH = r'C:\Users\rumin\Desktop\comment.txt'

driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
driver.get("http://www.santostang.com/2018/07/04/hello-world/")

# Set the implicit wait BEFORE the first lookup; the original set it after
# locating the iframe, so that lookup ran with no wait.
driver.implicitly_wait(30)
driver.switch_to.frame(driver.find_element_by_css_selector('iframe[title="livere-comment"]'))

j = 0  # number of "next screen" jumps; each screen shows up to 10 page buttons
while True:

    # Page-number buttons visible on the current screen (list).
    buttons = driver.find_elements_by_css_selector('button.page-btn')
    b_size = len(buttons)

    for i in range(b_size):
        # Open in GBK with unencodable characters ignored: one explicit
        # declaration replaces the original per-line
        # encode('GBK','ignore').decode('GBk') round-trip (note the 'GBk'
        # casing typo there), and `with` guarantees the handle is closed
        # even if a Selenium call raises mid-page.
        with open(OUTPUT_PATH, 'a+', encoding='gbk', errors='ignore') as file:
            file.write('\n')
            file.write('=====================第%d页评论========================\n' % (i + 1 + j * 10))

            # All comment containers on the current comment page.
            comments = driver.find_elements_by_css_selector('div.reply-content')
            for item in comments:
                comment = item.find_element_by_tag_name('p')
                print(comment.text)
                file.write(comment.text + '\n')
        time.sleep(2)

        if i < b_size - 1:
            # Re-fetch the buttons: after the page content refreshes, the old
            # element references go stale ("stale element reference").
            buttons = driver.find_elements_by_css_selector('button.page-btn')
            # buttons[1]..buttons[b_size-1] correspond to pages 2..b_size.
            buttons[i + 1].click()
            driver.implicitly_wait(30)

    try:
        # Jump to the next screen of page buttons; the button is absent on
        # the last screen, which is our termination signal. Catch only
        # NoSuchElementException — the original bare `except:` would also
        # silently swallow genuine errors (network failures, KeyboardInterrupt).
        next_page = driver.find_element_by_class_name("page-last-btn")
        next_page.click()
        driver.implicitly_wait(30)
        j += 1
    except NoSuchElementException:
        print()
        print("爬取结束!")
        break