对于python+selenium抓取网页新闻列表元素,保存到excel文档中_huan_128的专栏-CSDN博客
进行的优化
注意:以下代码仅为个人学习时所写,写法比较随意;注重编码规范的读者请忽略。
#!/usr/local/bin/python3
# !python3.9
# -*- coding: UTF-8 -*-
"""Scrape a paginated news list and store titles and links in an Excel file.

@File:xwzx1.py
@Time:2021-09-09 18:32
@Author:

Each news entry's title and link address are read from the page with
Selenium and written to an ``.xls`` workbook with xlwt.
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from os.path import join
import os
import sys
from time import strftime
from time import sleep
import xlrd
import xlwt

# Directory containing this script; the output folder is created below it.
excel_dir = os.path.dirname(os.path.abspath(__file__))
# chromedriver is looked up in a sibling "drivers" directory of the script's parent.
chromdriver = join(os.path.split(excel_dir)[0], 'drivers', 'chromedriver')
# Timestamp (minute resolution) used as the output file name.
now_time = strftime("%Y-%m-%d-%H-%M")
# Fixed pause after every navigation so the page can finish rendering.
PAGE_LOAD_WAIT_SECONDS = 10


class text_xwzx():
    """Scraper for the news-centre list pages under ``self.url``."""

    def __init__(self):
        self.url = "http://www.sinopecgroup.com/group/xwzx/"
        self.filename = now_time + ".xls"
        self.path = join(excel_dir, 'data/')

    def getUTitle(self, driver, excel, row=0, col=0, rowlen=0):
        """Write the titles and links of the current page into the sheet.

        Locates ``ul.w_newslistpage_list`` and, inside it, every
        ``li>span.title>a`` anchor. For each anchor the visible text goes
        to column ``col`` and its ``href`` to column ``col + 1``, one
        anchor per row, starting at ``row``.

        Args:
            driver: Selenium WebDriver already showing a list page.
            excel: Sheet-like object exposing ``write(row, col, value)``.
            row: First sheet row to write to.
            col: Column for the title; the link goes in the next column.
            rowlen: Unused; kept so existing callers keep working.

        Returns:
            The index of the first row after the ones written, i.e. the
            row where the next page should start.
        """
        news_list = driver.find_element(By.CSS_SELECTOR, "ul.w_newslistpage_list")
        links = news_list.find_elements(By.CSS_SELECTOR, "li>span.title>a")
        for offset, link in enumerate(links):
            excel.write(row + offset, col, link.text)
            excel.write(row + offset, col + 1, link.get_attribute("href"))
        return row + len(links)

    def nextPage(self, driver, modular, sh, xwzx, indexnum, rowlen):
        """Visit up to ``indexnum`` list pages and append each to the sheet.

        The total page count is parsed from the text of
        ``#pagingIndex>span>b`` (the part after ``/``). The first page is
        ``default.shtml``; the following pages are requested as
        ``default_<n>.shtml`` with ``n`` counting down from the total
        page count minus one.

        Args:
            driver: Selenium WebDriver currently showing the first page.
            modular: News module path segment appended to ``self.url``.
            sh: Sheet-like object passed through to ``getUTitle``.
            xwzx: Object whose ``getUTitle`` is used to scrape each page.
            indexnum: Maximum number of pages to scrape.
            rowlen: Sheet row at which to start writing.

        Returns:
            The next free sheet row after all scraped pages.
        """
        paging_text = driver.find_element(By.CSS_SELECTOR, '#pagingIndex>span>b').text
        total_pages = int(str(paging_text).split("/")[1])

        # Never request more pages than exist: counting below 1 would
        # produce URLs such as default_0.shtml.
        pages_to_fetch = min(indexnum, total_pages)
        for visited in range(pages_to_fetch):
            page_no = total_pages - visited
            if page_no == total_pages:
                nurl = self.url + modular + '/default.shtml'
            else:
                nurl = self.url + modular + '/default' + '_' + str(page_no) + '.shtml'
            print(nurl)
            driver.get(nurl)
            sleep(PAGE_LOAD_WAIT_SECONDS)
            # Always continue from the row where the previous page ended,
            # regardless of how many entries that page contained.
            rowlen = xwzx.getUTitle(driver, sh, rowlen, 0, rowlen)
        return rowlen

    def getData(self, modular, indexnum, rowlen=0):
        """Scrape ``indexnum`` pages of a news module and save them as .xls.

        Args:
            modular: News module path segment; also used as sheet name.
            indexnum: Number of list pages to scrape.
            rowlen: Sheet row at which to start writing.
        """
        # NOTE(review): passing the driver path positionally is what the
        # original script does; newer Selenium releases may expect a
        # Service object instead - confirm against the installed version.
        driver = webdriver.Chrome(chromdriver)
        try:
            driver.get(self.url + modular + '/default.shtml')
            sleep(PAGE_LOAD_WAIT_SECONDS)
            book = xlwt.Workbook(encoding='utf-8')
            sh = book.add_sheet(modular)
            self.nextPage(driver, modular, sh, self, indexnum, rowlen)
        finally:
            # Close the browser even when scraping fails part-way.
            driver.quit()

        # Saving fails if the target directory does not exist yet.
        os.makedirs(self.path, exist_ok=True)
        path_xls = self.path + self.filename
        book.save(path_xls)


if __name__ == '__main__':
    data = text_xwzx()
    data.getData('dqgz', 5)