本文主要是介绍python爬取豆瓣250,对大家解决编程问题具有一定的参考价值,需要的程序猿们随着小编来一起学习吧!
import urllib.request
import ssl
import re
import xlwt
import DBUtils
import xlrd
from xlutils.copy import copy
def getContent(ye):
    """Fetch one page of the Douban Top-250 list and return its HTML as text.

    ye: the "start" offset used for pagination (0, 25, 50, ...).
    Returns the decoded UTF-8 page body.
    """
    # Browser-like headers so the site serves the normal page.
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        "Connection": "keep-alive",
    }
    page_url = "https://movie.douban.com/top250?start=%s&filter=" % ye
    request = urllib.request.Request(page_url, headers=request_headers)
    # Download and decode the response body.
    raw = urllib.request.urlopen(request).read()
    return raw.decode("utf-8")
def getItem(content):
    """Extract movie titles from one Top-250 page.

    Titles appear as the alt="..." attribute of images in the page markup.
    The last alt match on each page is dropped (presumably a non-title
    image — confirm against the current page markup).

    content: the page HTML as a string.
    Returns a list of title strings (possibly empty).
    """
    pattern = re.compile(r'alt="(.*?)"')
    titles = re.findall(pattern, content)
    # FIX: the original unconditionally called titles.pop(), which raises
    # IndexError when the page yields no matches; only drop when non-empty.
    if titles:
        titles.pop()
    return titles
def saveExcel():
    """Create the workbook 豆瓣.xls with one sheet and a single header row."""
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("豆瓣250")
    # Write the header cells across row 0.
    for col, title in enumerate(["书名"]):
        sheet.write(0, col, title)
    workbook.save("豆瓣.xls")
def wb(list, x):
    """Append the given titles to 豆瓣.xls, one per row, starting at row *x*.

    xlwt cannot edit an existing file in place, so the workbook is opened
    with xlrd, duplicated with xlutils.copy, written to, and saved back
    under the same name (overwriting the old file).

    list: sequence of title strings to write into column 0.
    x: the first row index to write to.
    """
    # NOTE(review): the parameter name `list` shadows the builtin; kept
    # unchanged for interface compatibility with existing callers.
    book = xlrd.open_workbook("豆瓣.xls")
    writable = copy(book)
    sheet = writable.get_sheet(0)
    # FIX: the original enumerate() index was never used; iterate values.
    for title in list:
        sheet.write(x, 0, title)
        x += 1
    writable.save("豆瓣.xls")
def ye():
    """Crawl all ten Top-250 pages, saving titles to Excel and to MySQL.

    Returns the literal string "完成" when every page has been processed.
    """
    # FIX: the original local `ye` shadowed this function itself, and
    # `list` shadowed the builtin; renamed for clarity.
    offset = 0  # Douban paginates in steps of 25 via the start= parameter.
    row = 1     # Row 0 of the sheet holds the header, so data starts at 1.
    saveExcel()
    while offset < 250:
        content = getContent(offset)
        titles = getItem(content)
        wb(titles, row)
        for title in titles:
            # SECURITY NOTE: the title is interpolated straight into the SQL
            # string — a quote in a title breaks (or injects into) the
            # statement. DBUtils.insertData only accepts a raw SQL string;
            # switch it to parameterized queries to fix this properly.
            sql = "insert into tb_use(name) values ('%s');" % title
            DBUtils.insertData(sql)
        offset += 25
        row += 25
    return "完成"
# Run the full crawl and report its completion status.
result = ye()
print(result)
import pymysql.cursors
def getConnect():
    """Open and return a new MySQL connection to the `pymysql` database."""
    # NOTE(review): host is an empty string — confirm the intended MySQL
    # server address before deploying.
    return pymysql.connect(
        host="",
        user="root",
        password="123",
        database="pymysql",
        charset="utf8",
    )
def closeConnect(cursor, conn):
    """Close the cursor and then the connection, skipping any falsy handle."""
    for resource in (cursor, conn):
        if resource:
            resource.close()
def insertData(sql):
    """Execute an INSERT statement and commit it.

    sql: a complete SQL string. The caller is responsible for escaping —
    prefer parameterized queries where the calling interface allows it.
    Returns True if at least one row was affected, otherwise False.
    """
    conn = getConnect()
    cursor = conn.cursor()
    try:
        cursor.execute(sql)
        conn.commit()
        # FIX: read rowcount BEFORE closing — the original read it after
        # closeConnect(), relying on the closed cursor still exposing the
        # attribute.
        count = cursor.rowcount
    finally:
        # FIX: close in a finally so a failed execute cannot leak the
        # connection (original leaked it on any exception).
        closeConnect(cursor, conn)
    return count > 0
这篇关于python爬取豆瓣250的文章就介绍到这儿,希望我们推荐的文章对大家有所帮助,也希望大家多多支持为之网!