I. Introduction to Python Web Crawlers
1. What is a web crawler?
A web crawler is a program or script that automatically fetches information from the Internet according to certain rules. Because data on the Internet is so varied and crawling resources are limited, the mainstream strategy today is to fetch and analyze only the pages that are relevant to the user's needs.
2. What crawlers are used for:
Grabbing images from the web, downloading videos you want to watch; in principle, any data you can reach through a browser can also be fetched by a crawler.
3. The essence of a crawler:
Simulate a browser opening a web page, then extract the part of the page data we actually want.
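That one-sentence description already maps onto a handful of lines of code. A minimal sketch (example.com is only a placeholder URL, and the title regex is just for illustration):

import re
import urllib.request

# Pretend to be a browser, download the page, then keep only the piece we care about
req = urllib.request.Request(
    "http://example.com",
    headers={"User-Agent": "Mozilla/5.0"},   # browser-like User-Agent
)
html = urllib.request.urlopen(req).read().decode("utf-8")
print(re.findall(r"<title>(.*?)</title>", html))   # the part of the page we want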
II. Fetching Data
1. Using the urllib module
import urllib.request
import urllib.parse
import urllib.error

# GET request: fetch the Baidu homepage and decode the source as UTF-8
response = urllib.request.urlopen("http://www.baidu.com")
print(response.read().decode("utf-8"))

# POST request: wrap the form data as bytes, decode the response as UTF-8
data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding="utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(response.read().decode("utf-8"))

# Timeout handling
try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=1)
    print(response.read().decode("utf-8"))
except urllib.error.URLError as e:
    print("time out")

# Status code and response headers
response = urllib.request.urlopen("http://www.baidu.com")
print(response.status)
print(response.getheaders())

# Fetch Douban with a browser User-Agent (the site rejects urllib's default UA)
url = "http://www.douban.com"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
}
#data = bytes(urllib.parse.urlencode({"name":"eric"}),encoding="utf-8")
req = urllib.request.Request(url=url, headers=headers)   # plain GET; pass data= for a POST
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
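The same urlencode helper can also build the query string of a GET request. A small sketch against the httpbin test endpoint already used above (the q/page parameters are made up):

import urllib.parse
import urllib.request

# Encode query parameters into the URL itself for a GET request
params = urllib.parse.urlencode({"q": "python", "page": 1})
with urllib.request.urlopen("http://httpbin.org/get?" + params, timeout=5) as resp:
    print(resp.status)
    print(resp.read().decode("utf-8"))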
2. Example: fetching the page data
import urllib.request
import urllib.error

# Fetch the HTML of a given URL, sending a browser User-Agent
def askURl(url):
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"}
    request = urllib.request.Request(url, headers=head)
    html = ""                                   # default so the function still returns a string if the request fails
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        #print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
    return html
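A quick usage sketch, assuming the masked Top250 base URL that the complete script at the end of this article uses (each page holds 25 entries):

baseurl = "https://movie.xxxxxx.com/top250?start="   # domain masked, as in the full script below
html = askURl(baseurl + "0")                          # first page, items 0-24
print(len(html))                                      # how much HTML came back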
III. Parsing Data
1. The BeautifulSoup module
#!/usr/bin/python3
# @DESC: BeautifulSoup4 turns a complex HTML document into a tree of Python objects.
#        Every node belongs to one of four kinds: Tag, NavigableString, BeautifulSoup, Comment.
import re
from bs4 import BeautifulSoup

file = open("./baidu.html", "rb")
html = file.read().decode("utf-8")
bs = BeautifulSoup(html, "html.parser")

# 1. Tag: a tag and its contents; attribute access returns the first match
print(bs.title)   # the <title> tag
print(bs.a)       # the first <a> tag
print(bs.head)    # the <head> tag

# 2. NavigableString: the text inside a tag
print(bs.title.string)   # text of <title>
print(bs.a.attrs)        # all attributes of the first <a> tag
print(bs.a.string)       # text of the first <a> tag

# 3. BeautifulSoup: the document as a whole
print(bs.name)
print(bs.attrs)
print(bs)

# 4. Comment: a special NavigableString whose printed value omits the comment markers

# 5. Traversing the tree
print(bs.head.contents)
print(bs.head.contents[0])

# 6. Searching the tree
# 6.1 find_all() with a string: matches tags whose name equals the string exactly
t_list = bs.find_all("a")   # every <a> tag
print(t_list)

# 6.2 Regular expression: tag names are matched with re's search() semantics
t_list = bs.find_all(re.compile("a"))
print(t_list)

# 6.3 A function: keeps the tags for which the function returns True
def name_is_exists(tag):
    return tag.has_attr("name")

t_list = bs.find_all(name_is_exists)
for item in t_list:
    print(item)

# 6.4 Keyword arguments
t_list = bs.find_all(id="head", name=True, limit=3)
#t_list = bs.find_all(text="贴吧")
for item in t_list:
    print(item)

# 6.5 CSS selectors
t_list = bs.select('title')          # by tag name
t_list = bs.select('#u1')            # by id
t_list = bs.select(".mnav")          # by class name
t_list = bs.select("a[class]")       # by attribute
t_list = bs.select(".mnav ~ .bri")   # siblings of .mnav
for item in t_list:
    print(item)
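The fields that the later example extracts with regular expressions can also be read straight from the parsed tags. A minimal sketch, assuming html holds a Top250 list page in which every film sits in a div with class "item" that contains an <a>, an <img> and a span with class "title" (the same structure the regexes below assume):

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, "html.parser")
for item in soup.find_all("div", class_="item"):            # one block per film
    link = item.a["href"]                                   # first <a> inside the block
    img = item.find("img")["src"]                           # poster image URL
    title = item.find("span", class_="title").get_text()    # first title span
    print(link, img, title)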
2. The re module
import re

# Compiled pattern object + search()
pat = re.compile("AA")        # "AA" is the regular expression used to check other strings
m = pat.search("CBA")         # the argument of search() is the string being checked
m = pat.search("ABCAA")
m = pat.search("BAACABCAA")
print(m)                      # match object for the first match; the span is half-open [start, end)

# search() without a compiled pattern
m = re.search("asd", "Aasd")  # first argument is the pattern, second the string being checked
#print(m)

# findall(): return every match as a list
print(re.findall("a", "ASDaDEFGAa"))        # first argument is the pattern, second the string being checked
print(re.findall("[A-Z]", "ASDaDEFGAa"))    # every uppercase letter
print(re.findall("[A-Z]+", "ASDaDEFGAa"))   # runs of uppercase letters, one item per run

# Non-greedy group (.*?): capture whatever sits between two anchors
print(re.findall("AS(.*?)Aa", "ASDaDEFGAa"))

# sub(): replace by pattern
print(re.sub("a", "A", "abcdcasd"))      # replace every "a" with "A" in the third argument
print(re.sub("\n", "", "ab\ndca\nsd"))   # strip newlines

# Tip: write patterns as raw strings (r"...") so backslashes need no extra escaping
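A tiny illustration of that raw-string tip, with a made-up pattern that pulls out digits:

import re

# r"\d+" needs no doubled backslashes; "\\d+" would be the non-raw equivalent
print(re.findall(r"\d+", "room 42, floor 7"))   # ['42', '7']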
3. Example: parsing the data
import re
from bs4 import BeautifulSoup

# Compiled regular expressions describing the fields to extract
findLink = re.compile(r'<a href="(.*?)">')                                              # film detail link
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)                                     # poster image link; re.S lets . match newlines
findTitle = re.compile(r'<span class=".*">(.*?)</span>')                                # film title
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')  # rating
fingCommentNum = re.compile(r'<span>(\d*?)人评价</span>')                                # number of ratings
findInq = re.compile(r'<span class="inq">(.*?)</span>')                                 # one-line summary
findBD = re.compile(r'<p class="">(.*?)</p>', re.S)                                     # other details

# Crawl the pages and parse them one by one
def getData(baseurl):
    datalist = []
    for i in range(10):                      # 10 pages, 25 entries per page
        url = baseurl + str(i * 25)
        html = askURl(url)                   # fetch one page (askURl is defined in the previous section)
        # Parse the page
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all("div", class_="item"):   # every block that matches, returned as a list
            #print(item)                     # inspect the raw item while debugging
            data = []                        # all fields of one film
            item = str(item)
            # Use re to pull each field out of the item's HTML
            link = re.findall(findLink, item)[0]              # detail link
            data.append(link)
            imgSrc = re.findall(findImgSrc, item)[0]          # image URL
            data.append(imgSrc)
            titles = re.findall(findTitle, item)              # titles; there may be several
            for t in range(3):                                # keep three title columns, padded with ""
                res = titles[t].replace("/", "").replace(" ", "").replace("\xa0", "") if t < len(titles) else ""
                data.append(res)
            rating = re.findall(findRating, item)[0]          # rating
            data.append(rating)
            commentNum = re.findall(fingCommentNum, item)[0]  # number of ratings
            data.append(commentNum)
            inq = re.findall(findInq, item)                   # one-line summary
            if len(inq) != 0:
                inq = inq[0].replace(".", "").replace(" ", "").replace("。", "")   # strip stray punctuation
                data.append(inq)
            else:
                data.append("")
            bd = re.findall(findBD, item)[0]                  # other details
            bd = re.sub(r'<br(\s+)?/>(\s+)?', "", bd)         # drop <br/>
            bd = re.sub('/', "", bd)
            bd = re.sub('\xa0', "", bd)
            bd = re.sub(' ', "", bd)
            data.append(bd.strip())
            datalist.append(data)            # one fully parsed film
    #print(datalist)
    return datalist
IV. Saving Data
1. The xlwt module
import xlwt

workbook = xlwt.Workbook(encoding="utf-8")   # create a workbook object
worksheet = workbook.add_sheet('sheet1')     # create a worksheet
worksheet.write(0, 0, 'hello')               # write a cell: first argument is the row, second the column, third the content
workbook.save('student.xls')                 # save the workbook
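xlwt can also attach a style when writing a cell; a minimal sketch using easyxf to make a header bold (the file name and cell contents are just examples):

import xlwt

workbook = xlwt.Workbook(encoding="utf-8")
sheet = workbook.add_sheet('sheet1')
bold = xlwt.easyxf('font: bold on')     # a simple style: bold font
sheet.write(0, 0, 'header', bold)       # header cell rendered in bold
sheet.write(1, 0, 'value')              # ordinary cell
workbook.save('styled.xls')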
2. The sqlite3 module
import sqlite3

# 1. Open (or create) the database file
conn = sqlite3.connect("test.db")   # in PyCharm, install the Database Navigator plugin and restart to browse the file
print("Opened database successfully")
c = conn.cursor()                   # get a cursor

# 2. Create the table
sql_creatTabel = '''
    create table if not exists company
        (id int primary key not null,
         name text not null,
         age int not null,
         address char(50),
         salary real);
'''
c.execute(sql_creatTabel)           # run the SQL statement
conn.commit()                       # commit the change
#conn.close()                       # close the connection
print("Create table successfully")

# 3. Insert rows
sql_insertData1 = '''
    insert into company(id,name,age,address,salary)
    values(1,'张三',35,'南京',10000);
'''
sql_insertData2 = '''
    insert into company(id,name,age,address,salary)
    values(2,'李四',27,'北京',15000);
'''
c.execute(sql_insertData1)
c.execute(sql_insertData2)
conn.commit()                       # commit the change
print("Insert Data successfully")

# 4. Query
sql_queryData = 'select * from company'
cursor = c.execute(sql_queryData)
for row in cursor:
    print("id=", row[0], end=" ")
    print("name=", row[1], end=" ")
    print("address=", row[3], end=" ")
    print("salary=", row[4], end="\n")
print("Query Data successfully")
conn.close()
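The inserts above splice the values directly into the SQL text. sqlite3 also accepts ? placeholders, which sidesteps quoting problems and SQL injection. A small sketch reusing the company table from above, with made-up rows:

import sqlite3

# Example rows; the values are made up
rows = [
    (3, '王五', 30, '上海', 12000),
    (4, '赵六', 41, '广州', 9000),
]

conn = sqlite3.connect("test.db")
with conn:   # the with-block commits automatically if no exception is raised
    conn.executemany(
        "insert into company(id,name,age,address,salary) values(?,?,?,?,?)",
        rows,
    )
conn.close()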
3. Example: saving the data to XLS
import xlwt

# Save the parsed data to an XLS file
def saveData(datalist, savepath):
    print("save......")
    book = xlwt.Workbook(encoding="utf8", style_compression=0)
    sheet = book.add_sheet("豆瓣电影Top250", cell_overwrite_ok=True)
    col = ('电影详情链接', "图片链接", "名片1", "名片2", "名片3", "评分", "评价数", "概括", "相关信息")
    for i in range(9):
        sheet.write(0, i, col[i])            # header row
    for i in range(len(datalist)):           # one row per film (250 for the full Top250)
        print("第%d条" % (i + 1))
        data = datalist[i]
        for j in range(9):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)                      # save the workbook
4. Example: saving the data to the DB
import sqlite3

# Initialize the database: create the movie250 table if it does not exist
def init_db(dbpath):
    sql = '''
        create table if not exists movie250(
        id integer primary key autoincrement,
        info_link text,
        pic_link text,
        name1 varchar,
        name2 varchar,
        name3 varchar,
        score numeric,
        rated numeric,
        instroduction text,
        info text
        )
    '''
    conn = sqlite3.connect(dbpath)
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    conn.close()

# Save the parsed data into the SQLite database
def saveData2DB(datalist, dbpath):
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    for data in datalist:
        for index in range(len(data)):
            if index == 5 or index == 6:              # score and rated stay unquoted (numeric columns)
                continue
            data[index] = '"' + data[index] + '"'     # wrap text fields in double quotes for the SQL string
        sql = '''
            insert into movie250(
            info_link,pic_link,name1,name2,name3,score,rated,instroduction,info)
            values(%s)''' % ",".join(data)
        #print(sql)
        cur.execute(sql)
        conn.commit()
    cur.close()
    conn.close()
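A note on the design: building the INSERT statement by wrapping each text field in double quotes works for this dataset, but it breaks as soon as a field itself contains a double quote, and string-spliced SQL is open to injection in general. The ? placeholder style shown in the sqlite3 section above is the more robust alternative, and it would also make the special-casing of indexes 5 and 6 unnecessary.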
V. Complete Source Code
#!/usr/bin/python3
# -*- coding:utf-8 -*-
# @Time:2021/8/21 11:43
# @author: Mrwhite
# @File:spiderdouban250.py
# @DESC:

from bs4 import BeautifulSoup        # parse pages and extract data
import re                            # regular expressions for text matching
import urllib.request, urllib.error  # build requests and fetch pages
import xlwt                          # Excel output
import sqlite3                       # SQLite output


def main():
    # Base URL of the Top250 list
    baseurl = "https://movie.xxxxxx.com/top250?start="
    # 1-2. Crawl and parse the pages
    datalist = getData(baseurl)
    savepath = "xx电影Top250.xls"
    dbpath = "movie.db"
    # 3. Save the data
    #saveData(datalist, savepath)
    saveData2DB(datalist, dbpath)


# Compiled regular expressions describing the fields to extract
findLink = re.compile(r'<a href="(.*?)">')                                              # film detail link
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)                                     # poster image link; re.S lets . match newlines
findTitle = re.compile(r'<span class=".*">(.*?)</span>')                                # film title
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')  # rating
fingCommentNum = re.compile(r'<span>(\d*?)人评价</span>')                                # number of ratings
findInq = re.compile(r'<span class="inq">(.*?)</span>')                                 # one-line summary
findBD = re.compile(r'<p class="">(.*?)</p>', re.S)                                     # other details


# Crawl the pages and parse them one by one
def getData(baseurl):
    datalist = []
    for i in range(10):                      # 10 pages, 25 entries per page
        url = baseurl + str(i * 25)
        html = askURl(url)
        # Parse the page
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all("div", class_="item"):   # every block that matches, returned as a list
            #print(item)                     # inspect the raw item while debugging
            data = []                        # all fields of one film
            item = str(item)
            # Use re to pull each field out of the item's HTML
            link = re.findall(findLink, item)[0]              # detail link
            data.append(link)
            imgSrc = re.findall(findImgSrc, item)[0]          # image URL
            data.append(imgSrc)
            titles = re.findall(findTitle, item)              # titles; there may be several
            for t in range(3):                                # keep three title columns, padded with ""
                res = titles[t].replace("/", "").replace(" ", "").replace("\xa0", "") if t < len(titles) else ""
                data.append(res)
            rating = re.findall(findRating, item)[0]          # rating
            data.append(rating)
            commentNum = re.findall(fingCommentNum, item)[0]  # number of ratings
            data.append(commentNum)
            inq = re.findall(findInq, item)                   # one-line summary
            if len(inq) != 0:
                inq = inq[0].replace(".", "").replace(" ", "").replace("。", "")
                data.append(inq)
            else:
                data.append("")
            bd = re.findall(findBD, item)[0]                  # other details
            bd = re.sub(r'<br(\s+)?/>(\s+)?', "", bd)         # drop <br/>
            bd = re.sub('/', "", bd)
            bd = re.sub('\xa0', "", bd)
            bd = re.sub(' ', "", bd)
            data.append(bd.strip())
            datalist.append(data)            # one fully parsed film
    #print(datalist)
    return datalist


# Fetch the HTML of a given URL, sending a browser User-Agent
def askURl(url):
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"}
    request = urllib.request.Request(url, headers=head)
    html = ""                                # default so the function still returns a string if the request fails
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        #print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
    return html


# Save the parsed data to an XLS file
def saveData(datalist, savepath):
    print("save......")
    book = xlwt.Workbook(encoding="utf8", style_compression=0)
    sheet = book.add_sheet("豆瓣电影Top250", cell_overwrite_ok=True)
    col = ('电影详情链接', "图片链接", "名片1", "名片2", "名片3", "评分", "评价数", "概括", "相关信息")
    for i in range(9):
        sheet.write(0, i, col[i])            # header row
    for i in range(len(datalist)):
        print("第%d条" % (i + 1))
        data = datalist[i]
        for j in range(9):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)                      # save the workbook


# Initialize the database: create the movie250 table if it does not exist
def init_db(dbpath):
    sql = '''
        create table if not exists movie250(
        id integer primary key autoincrement,
        info_link text,
        pic_link text,
        name1 varchar,
        name2 varchar,
        name3 varchar,
        score numeric,
        rated numeric,
        instroduction text,
        info text
        )
    '''
    conn = sqlite3.connect(dbpath)
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    conn.close()


# Save the parsed data into the SQLite database
def saveData2DB(datalist, dbpath):
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    for data in datalist:
        for index in range(len(data)):
            if index == 5 or index == 6:              # score and rated stay unquoted (numeric columns)
                continue
            data[index] = '"' + data[index] + '"'     # wrap text fields in double quotes for the SQL string
        sql = '''
            insert into movie250(
            info_link,pic_link,name1,name2,name3,score,rated,instroduction,info)
            values(%s)''' % ",".join(data)
        #print(sql)
        cur.execute(sql)
        conn.commit()
    cur.close()
    conn.close()


if __name__ == "__main__":   # entry point when the script is run directly
    main()
    print("爬取完毕")