本文主要介绍 Python 数据挖掘实战 11:爬取豆瓣电影 Top250 的电影数据,对大家解决编程问题具有一定的参考价值,需要的程序员们随着小编一起来学习吧!
代码实例
# coding:utf8
# 1. 调用相关库
import requests, re
from bs4 import BeautifulSoup
from openpyxl import Workbook
# 2. Create the workbook that receives the scraped rows and name its sheet
dest_filename = '豆瓣电影top250.xlsx'
wb = Workbook()
ws1 = wb.active
ws1.title = "豆瓣电影top250"
# 3. Fetch page content: base URL of the Douban Top 250 listing
douban_url = 'http://movie.douban.com/top250/'
def download_page(url, timeout=10):
    """Download *url* and return the raw response body.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        The undecoded response body (``Response.content``).

    Raises:
        requests.RequestException: On a connection failure, a timeout, or
            an HTTP 4xx/5xx status.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'}
    # Without an explicit timeout requests waits indefinitely on a stalled
    # server, which would hang the whole crawl.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail here on an error status instead of handing an error page to the
    # HTML parser downstream.
    response.raise_for_status()
    return response.content
# 4. 电影数据信息分类、清洗
def get_li(doc):
    """Parse one listing page into parallel lists of movie fields.

    Args:
        doc: HTML of a listing page, in any form ``BeautifulSoup`` accepts.

    Returns:
        A 5-tuple ``(name, star_con, score, info_list, next_url)``:
        ``name`` holds the text of each entry's first ``span.title``,
        ``star_con`` the text node containing '评价' inside ``div.star``
        (``None`` when no such node exists), ``score`` the text of
        ``span.rating_num``, and ``info_list`` the text of ``span.inq`` or
        '无' when the entry has none. ``next_url`` is ``douban_url`` plus
        the ``href`` of the next-page link, or ``None`` when the page has
        no such link.

    Raises:
        ValueError: If the page contains no ``<ol class="grid_view">``.
    """
    soup = BeautifulSoup(doc, 'html.parser')
    ol = soup.find('ol', class_='grid_view')
    if ol is None:
        # Previously this surfaced as an AttributeError on None further down.
        raise ValueError('no <ol class="grid_view"> element found in page')

    name = []  # movie titles
    star_con = []  # rating-count text
    score = []  # rating values
    info_list = []  # one-line quotes
    vote_pattern = re.compile('评价')  # compiled once, reused for every entry

    for item in ol.find_all('li'):
        detail = item.find('div', attrs={'class': 'hd'})
        movie_name = detail.find('span', attrs={'class': 'title'}).get_text()
        level_star = item.find('span', attrs={'class': 'rating_num'}).get_text()
        star = item.find('div', attrs={'class': 'star'})
        # `string=` is the current name of the deprecated `text=` argument.
        star_num = star.find(string=vote_pattern)
        info = item.find('span', attrs={'class': 'inq'})

        info_list.append(info.get_text() if info is not None else '无')
        score.append(level_star)
        name.append(movie_name)
        # Store a plain str: a NavigableString keeps a reference to the
        # whole parse tree alive for as long as it is held.
        star_con.append(None if star_num is None else str(star_num))

    # The next-page <span> may be missing entirely, or present without a link.
    next_span = soup.find('span', attrs={'class': 'next'})
    next_link = next_span.find('a') if next_span is not None else None
    if next_link is not None:
        return name, star_con, score, info_list, douban_url + next_link['href']
    return name, star_con, score, info_list, None
# 5. 将电影数据录入表格
def main():
    """Crawl every listing page and save the collected rows to the workbook.

    Starts at ``douban_url`` and keeps following the next-page URL returned
    by ``get_li`` until it is ``None``. Each movie is then written to one
    row of ``ws1`` (column A: title, B: rating-count text, C: rating,
    D: quote) and the workbook is saved to ``dest_filename``.
    """
    url = douban_url
    name = []
    star_con = []
    score = []
    info = []
    while url:
        doc = download_page(url)
        movie, star, level_num, info_list, url = get_li(doc)
        name.extend(movie)
        star_con.extend(star)
        score.extend(level_num)
        info.extend(info_list)

    # Number rows by position. Deriving the row from name.index(title) made
    # movies that share a title overwrite the same row (leaving gaps) and
    # cost a linear search per movie.
    records = zip(name, star_con, score, info)
    for row, (title, votes, rating, quote) in enumerate(records, start=1):
        ws1['A%s' % row] = title
        ws1['B%s' % row] = votes
        ws1['C%s' % row] = rating
        ws1['D%s' % row] = quote
    wb.save(filename=dest_filename)
# 6. Run the scraper only when this file is executed as a script
if __name__ == '__main__':
    main()
运行结果:程序运行结束后,会在当前工作目录下生成"豆瓣电影top250.xlsx"文件,其中 A、B、C、D 四列依次为电影名称、评价人数、评分和短评。
这篇关于Python数据挖掘实战11:爬取豆瓣电影top250的电影数据的文章就介绍到这儿,希望我们推荐的文章对大家有所帮助,也希望大家多多支持为之网!