目标网站 https://movie.douban.com/top250?start=
调用urllib库
URL(Uniform Resource Locator):统一资源定位符,是网页资源存放位置的地址
head:请求头,用来告诉要访问的网站这个请求是由谁发出的,也就是我们的浏览器/设备信息
Google浏览器->快捷键F12(开发者模式)
请求头内容在下图蓝色方框所示位置
import urllib.request, urllib.error  # build the request URL and fetch the page


def askURL(url):
    """Fetch *url* and return its body decoded as UTF-8.

    Returns an empty string on failure; HTTP/URL errors are printed
    instead of being raised to the caller.
    """
    # BUG FIX: the original `head = {" "}` was a *set* literal, not a dict.
    # urllib.request.Request(headers=...) requires a mapping (it calls
    # headers.items()), so the set crashed with AttributeError.
    head = {
        "User-Agent": ""  # paste your browser's User-Agent string here (F12 -> Network)
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    # Error handling: report and fall through, returning "" to the caller.
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)    # HTTP status code, when the server answered
        if hasattr(e, "reason"):
            print(e.reason)  # transport-level failure reason
    return html
- 调用BeautifulSoup库和re库
- BeautifulSoup:“靓汤”,把我们爬取的信息资源熬成靓汤
- re(regular expression):正则表达式,能够根据关键词或句式,对文本信息进行搜索匹配
from bs4 import BeautifulSoup  # HTML parsing: split the page into tags
import re  # regular expressions: extract the interesting fields


def getData(baseurl):
    """Scrape all 10 pages (25 movies each) of the Douban Top-250 list.

    Relies on the module-level compiled patterns (findLink, findImgSrc,
    findTitle, findRating, findJudge, findInq, findBd) and on askURL()
    for the raw HTML.

    Returns a list of 8-element records per movie:
    [link, img_src, chinese_title, other_title, rating, vote_count,
     one_line_summary, crew/year/country info].
    """
    datalist = []
    for i in range(0, 10):
        url = baseurl + str(i * 25)  # pages are ?start=0, 25, 50, ...
        html = askURL(url)           # raw page source
        # Parse one movie per <div class="item">.
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            data = []
            item = str(item)
            link = re.findall(findLink, item)[0]       # detail-page link
            data.append(link)
            ImgSrc = re.findall(findImgSrc, item)[0]   # poster image URL
            data.append(ImgSrc)
            titles = re.findall(findTitle, item)       # movie title(s)
            if len(titles) == 2:
                ctitle = titles[0]
                otitle = titles[1].replace("/", "")
                otitle = re.sub(r'\xa0', "", otitle)   # drop non-breaking spaces
                data.append(ctitle)
                data.append(otitle)
            else:
                ctitle = titles[0]
                data.append(ctitle)
                data.append(' ')                       # no foreign title: placeholder
            rating = re.findall(findRating, item)[0]   # average score
            data.append(rating)
            judgeNum = re.findall(findJudge, item)[0]  # number of ratings
            data.append(judgeNum)
            inq = re.findall(findInq, item)            # one-line summary
            if len(inq) != 0:
                inq = inq[0].replace("。", "")
                data.append(inq)
            else:
                data.append(" ")
            bd = re.findall(findBd, item)[0]
            # BUG FIX: the <br/> pattern is now a raw string, so \s is a
            # regex whitespace class rather than an invalid string escape
            # (DeprecationWarning today, SyntaxError in future Pythons).
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)
            bd = re.sub('/', " ", bd)
            bd = re.sub(r'\xa0', "", bd)
            data.append(bd.strip())                    # strip leading/trailing blanks
            datalist.append(data)
    return datalist
- 调出获取到的html文件,根据文本特点,设计正则表达式
# --- Regular expressions, designed against the page structure below ---
#
# Each movie sits in a <div class="item"> that contains (abridged):
#
#   <div class="pic">
#     <a href="https://movie.douban.com/subject/1292052/">
#       <img alt="肖申克的救赎" src=".../p480747492.jpg" width="100"/>
#     </a>
#   </div>
#   <div class="info">
#     <span class="title">肖申克的救赎</span>
#     <span class="title"> / The Shawshank Redemption</span>
#     <span class="other"> / 月黑高飞(港) / 刺激1995(台)</span>
#     <div class="bd">
#       <p class=""> 导演: 弗兰克·德拉邦特 ... <br/> 1994 / 美国 / 犯罪 剧情 </p>
#       <span class="rating_num" property="v:average">9.7</span>
#       <span>2591411人评价</span>
#       <p class="quote"><span class="inq">希望让人自由。</span></p>
#     </div>
#   </div>
#
# (In the original notebook this sample was pasted as bare HTML, which is
# not valid Python; it is kept here as comments for reference.)

findLink = re.compile(r'<a href="(.*?)">')            # detail-page link; .*? is non-greedy
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)   # poster URL; re.S lets . match newlines
findTitle = re.compile(r'<span class="title">(.*)</span>')  # title span(s)
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')  # score
findJudge = re.compile(r'<span>(\d*)人评价</span>')   # number of ratings
findInq = re.compile(r'<span class="inq">(.*)</span>')  # one-line summary
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)   # crew / year / country block
调用sqlite3数据库
import sqlite3  # SQLite database access


# Create the database table.
def init_db(dbpath):
    """Create the movie250 table in the SQLite file at *dbpath* (idempotent)."""
    # BUG FIX: the original embedded a Python-style `# ...` comment inside
    # the SQL string; '#' is not a comment marker in SQLite, so execute()
    # raised sqlite3.OperationalError.  The comment now lives outside.
    sql = '''
        create table if not exists movie250
        (
            id integer primary key autoincrement,
            info_link text,
            pic_link text,
            cname varchar,
            ename varchar,
            score numeric,
            rated numeric,
            instroduction text,
            info text
        )
    '''  # note: "instroduction" typo kept so saveData2DB's column list still matches
    conn = sqlite3.connect(dbpath)  # opens, or creates, the database file
    cursor = conn.cursor()          # get a cursor
    cursor.execute(sql)             # run the DDL statement
    conn.commit()                   # commit the transaction
    conn.close()                    # release the connection
# Save the scraped data.
def saveData2DB(datalist, dbpath):
    """Insert every record of *datalist* into the movie250 table at *dbpath*.

    Each record is the 8-element list produced by getData():
    [info_link, pic_link, cname, ename, score, rated, instroduction, info].
    """
    init_db(dbpath)  # make sure the table exists
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    # SECURITY/BUG FIX: the original built the statement with
    # `values(%s)' % ",".join(data)` after wrapping values in double quotes.
    # That breaks on any value containing a quote and is SQL injection by
    # construction.  Parameterized `?` placeholders let sqlite3 do the
    # escaping, and the per-record quote-wrapping loop disappears.
    sql = '''
        insert into movie250
            (info_link, pic_link, cname, ename, score, rated, instroduction, info)
        values (?, ?, ?, ?, ?, ?, ?, ?)
    '''
    for data in datalist:
        cur.execute(sql, data)
    conn.commit()
    cur.close()
    conn.close()
# -*- coding: utf-8 -*-
# @Time : 2022-04-06 00:15
# @Author : SYSUer
# @File : crawler.py
# @Software : PyCharm
# Tutorial followed: https://www.bilibili.com/video/BV12E411A7ZQ?p=15
"""Scrape the Douban movie Top-250 list and store it in a SQLite database."""

from bs4 import BeautifulSoup  # HTML parsing: split the page into tags
import re                      # regular expressions: extract the fields
import urllib.request, urllib.error  # build the request URL, fetch pages
import sqlite3                 # SQLite database access


def main():
    """Crawl all ten result pages and persist the records to movie.db."""
    baseurl = "https://movie.douban.com/top250?start="
    # 1. crawl and parse the pages
    datalist = getData(baseurl)
    # savepath = ".\\豆瓣电影Top250.xls"  # (unused Excel variant kept from the tutorial)
    dbpath = "movie.db"
    # 2. save the data
    # saveData(savepath)
    saveData2DB(datalist, dbpath)


# Compiled patterns used by getData(); see the per-pattern comments.
findLink = re.compile(r'<a href="(.*?)">')            # detail-page link; .*? is non-greedy
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)   # poster URL; re.S lets . match newlines
findTitle = re.compile(r'<span class="title">(.*)</span>')  # title span(s)
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')  # score
findJudge = re.compile(r'<span>(\d*)人评价</span>')   # number of ratings
findInq = re.compile(r'<span class="inq">(.*)</span>')  # one-line summary
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)   # crew / year / country block


# Crawl and parse the pages.
def getData(baseurl):
    """Return a list of 8-element records, one per movie, for all 10 pages."""
    datalist = []
    for i in range(0, 10):
        url = baseurl + str(i * 25)  # pages are ?start=0, 25, 50, ...
        html = askURL(url)           # raw page source
        # Parse one movie per <div class="item">.
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            data = []
            item = str(item)
            link = re.findall(findLink, item)[0]       # detail-page link
            data.append(link)
            ImgSrc = re.findall(findImgSrc, item)[0]   # poster image URL
            data.append(ImgSrc)
            titles = re.findall(findTitle, item)       # movie title(s)
            if len(titles) == 2:
                ctitle = titles[0]
                otitle = titles[1].replace("/", "")
                otitle = re.sub(r'\xa0', "", otitle)   # drop non-breaking spaces
                data.append(ctitle)
                data.append(otitle)
            else:
                ctitle = titles[0]
                data.append(ctitle)
                data.append(' ')                       # no foreign title: placeholder
            rating = re.findall(findRating, item)[0]   # average score
            data.append(rating)
            judgeNum = re.findall(findJudge, item)[0]  # number of ratings
            data.append(judgeNum)
            inq = re.findall(findInq, item)            # one-line summary
            if len(inq) != 0:
                inq = inq[0].replace("。", "")
                data.append(inq)
            else:
                data.append(" ")
            bd = re.findall(findBd, item)[0]
            # BUG FIX: raw string so \s is a regex class, not an invalid escape.
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)
            bd = re.sub('/', " ", bd)
            bd = re.sub(r'\xa0', "", bd)
            data.append(bd.strip())                    # strip leading/trailing blanks
            datalist.append(data)
    return datalist


def askURL(url):
    """Fetch *url* and return its body decoded as UTF-8 ("" on failure)."""
    # BUG FIX: the original `head = {" "}` was a set literal; Request()
    # needs a mapping for headers.
    head = {
        "User-Agent": ""  # paste your browser's User-Agent string here
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)    # HTTP status code, when the server answered
        if hasattr(e, "reason"):
            print(e.reason)  # transport-level failure reason
    return html


# Save the scraped data.
def saveData2DB(datalist, dbpath):
    """Insert every record of *datalist* into the movie250 table at *dbpath*."""
    init_db(dbpath)  # make sure the table exists
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    # SECURITY/BUG FIX: parameterized placeholders instead of string-built
    # SQL — the old `values(%s)' % ",".join(data)` broke on embedded quotes
    # and was injection-prone.
    sql = '''
        insert into movie250
            (info_link, pic_link, cname, ename, score, rated, instroduction, info)
        values (?, ?, ?, ?, ?, ?, ?, ?)
    '''
    for data in datalist:
        cur.execute(sql, data)
    conn.commit()
    cur.close()
    conn.close()


# Create the database table.
def init_db(dbpath):
    """Create the movie250 table in the SQLite file at *dbpath* (idempotent)."""
    sql = '''
        create table if not exists movie250
        (
            id integer primary key autoincrement,
            info_link text,
            pic_link text,
            cname varchar,
            ename varchar,
            score numeric,
            rated numeric,
            instroduction text,
            info text
        )
    '''  # note: "instroduction" typo kept to match saveData2DB's column list
    conn = sqlite3.connect(dbpath)  # opens, or creates, the database file
    cursor = conn.cursor()          # get a cursor
    cursor.execute(sql)             # run the DDL statement
    conn.commit()                   # commit the transaction
    conn.close()                    # release the connection


if __name__ == '__main__':  # run only when executed as a script
    init_db("movietest.db")
    main()
Python爬虫编程基础5天速成(2021全新合集)Python入门+数据分析_哔哩哔哩_bilibili