This article walks through the simplest workflow for saving data scraped by a Python web crawler, using two complete code examples to show how the scraped records end up in a data store: the first example writes the news into MySQL (via pymysql), the second into MongoDB (via pymongo).
The first example creates a MySQL database and two tables, then crawls a news list page and the body of each news item, writing both into MySQL:

# coding=utf-8
import time
import urllib.request

import pymysql
import requests
from bs4 import BeautifulSoup

mysql_engine = {
    "host": "localhost",
    "database": "sampledb",
    "user": "root",
    "password": "admin",
    "charset": "utf8",
}


# Create the database
def create_database_sampledb():
    config_root = {
        "host": "localhost",
        "user": "root",
        "password": "admin",
    }
    sql = "Create Database If Not Exists sampledb CHARSET=utf8"
    conn = pymysql.connect(**config_root)  # open the database connection
    try:
        with conn.cursor() as cursor:  # the cursor is closed automatically when the block ends
            cursor.execute(sql)        # run the SQL statement
        conn.commit()                  # commit
    finally:
        conn.close()


# Create the news list table
def createNewsTable():
    createTbSql = (
        "Create Table If Not Exists News( "
        "id int primary key auto_increment, "
        "title varchar(100), "
        "url varchar(100), "
        "date date)")
    conn = pymysql.connect(**mysql_engine)
    try:
        with conn.cursor() as cursor:
            cursor.execute(createTbSql)
        conn.commit()
    finally:
        conn.close()


# Create the news body table
def createNewsBody():
    createTbNews = (
        "Create Table If Not Exists NewsBody( "
        "id INT PRIMARY KEY, "
        "text text, "
        "FOREIGN KEY(id) REFERENCES News(id))")
    conn = pymysql.connect(**mysql_engine)
    try:
        with conn.cursor() as cursor:
            cursor.execute(createTbNews)
        conn.commit()
    finally:
        conn.close()


# Start crawling the news list
def start_crawler():
    page_num = 1
    while page_num <= 2:
        url = "http://www.cctd.com.cn/list-107-{}.html".format(page_num)
        print(url)
        page_num += 1
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(req)
        content = response.read().decode('gbk')
        one_page = get_page_news(content)
        time.sleep(1)
        if one_page:
            to_mysql(one_page)
            time.sleep(1)
        else:
            break
    print('News list crawling finished')


# Write one page of headlines into MySQL
def to_mysql(one_page):
    print(one_page)
    sql = "insert into News(id,title,url,date) values(Null,%s,%s,%s)"
    conn = pymysql.connect(**mysql_engine)
    try:
        with conn.cursor() as cursor:
            cursor.executemany(sql, one_page)
        conn.commit()
    finally:
        conn.close()


def get_page_news(content):
    soup = BeautifulSoup(content, 'lxml')
    one_page = []
    lptable = soup.find('table', id='new_table')
    for i in lptable.find_all('tr'):
        # Skip links that were already stored for the same date
        if i.a['href'] in get_exist(i.find('td', width='20%').string):
            continue
        one_page.append(
            (i.a.string,
             i.a['href'],
             i.find('td', width='20%').string))
    return one_page


def get_exist(date):
    sql = "select url from News where date='{}'".format(date)
    conn = pymysql.connect(**mysql_engine)
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql)
            result = cursor.fetchall()
    finally:
        conn.close()
    return set(*zip(*result))


# Crawl the full text of each news item
def get_new_body():
    link_list = get_news_linksfrom_database()
    for id, url in link_list:
        news_body = get_news_text(url)
        # Write the body into the database
        writer_news_body_to_database(id, news_body)
    print("News bodies finished!")


def get_news_linksfrom_database():
    # Select the news records that do not have a body yet
    pymysql_select_newslink = """
        select News.id, News.url
        from News left join NewsBody
        on News.id = NewsBody.id
        where NewsBody.id is null;
    """
    conn = pymysql.connect(**mysql_engine)
    try:
        with conn.cursor() as cursor:
            cursor.execute(pymysql_select_newslink)
            result = cursor.fetchall()
    finally:
        conn.close()
    return result if result else []


def get_news_text(url):
    html = requests.get(url)
    html.encoding = html.apparent_encoding
    soup = BeautifulSoup(html.text, 'html.parser')
    try:
        return soup.find('div', {'id': 'Zoom'}).text
    except AttributeError:
        return None


def writer_news_body_to_database(id, news_body):
    print("INFO: Writing News ID:{} To Database...".format(id))
    pymysql_writer_to_table = """
        insert into NewsBody(id, text) values(%s, %s)
    """
    conn = pymysql.connect(**mysql_engine)
    try:
        with conn.cursor() as cursor:
            cursor.execute(pymysql_writer_to_table, (id, news_body))
        conn.commit()
    finally:
        conn.close()


if __name__ == '__main__':
    create_database_sampledb()  # create the database
    createNewsTable()           # create the news list table
    createNewsBody()            # create the news body table
    '''Crawl the news headlines'''
    start_crawler()
    '''Crawl the full news bodies'''
    get_new_body()
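Before moving on, it can help to confirm that both tables were actually populated. The following is a minimal verification sketch, not part of the original example: it reuses the connection settings from mysql_engine and pulls the joined News/NewsBody rows into a pandas DataFrame (assuming pandas is installed; the table and column names come from the CREATE TABLE statements above).

# Minimal verification sketch, assuming the sampledb database and the
# News / NewsBody tables created by the example above already exist.
import pandas as pd
import pymysql

conn = pymysql.connect(host="localhost", user="root", password="admin",
                       database="sampledb", charset="utf8")
try:
    # Join headlines with their bodies and load the result into a DataFrame.
    df = pd.read_sql(
        "select News.id, News.title, News.date, NewsBody.text "
        "from News left join NewsBody on News.id = NewsBody.id",
        conn)
    print(df.head())
finally:
    conn.close()

A query like this also makes it easy to spot headlines whose body could not be fetched, since their text column stays NULL.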
The second example stores the same kind of records in MongoDB instead of MySQL; each headline becomes a document, and the news body is later added to that document with an update:

# coding=utf-8
import time

import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup

# Get a MongoClient object
client = pymongo.MongoClient("localhost", 27017)
# Get the database object to use
db = client.news


# Start crawling the news list
def start_crawler():
    page_num = 1
    while page_num <= 1:
        url = "http://www.cctd.com.cn/list-13-{}.html".format(page_num)
        print(url)
        page_num += 1
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        response = requests.get(url, headers=headers)
        content = response.text
        one_page = get_page_news(content)
        time.sleep(1)
        if one_page:
            print_one_page(one_page)
            time.sleep(1)
        else:
            break
    print('News headline crawling finished')


# The documents are inserted in get_page_news(); this helper only prints the page for inspection
def print_one_page(one_page):
    print(one_page)


# Save title, date and url of each headline into MongoDB
def get_page_news(content):
    soup = BeautifulSoup(content, 'lxml')
    one_page_list = []
    lptable = soup.find('table', id='new_table')
    for i in lptable.find_all('tr'):
        title = i.a.string
        url = i.a['href']
        date = i.find('td', width='20%').string
        one_page = {'title': title, 'url': url, 'date': date,
                    'type': 'analysis', 'label': 'www.cctd.com.cn'}
        db.news.insert_one(one_page)
        one_page_list.append((title, url, date))
    return one_page_list


# Crawl the full text of each news item
def get_new_body():
    link_list = get_news_linksfrom_database()
    for url in link_list['url']:
        news_body = get_news_text(url)
        # Write the body back into the same document
        db.news.update_one({'url': url}, {"$set": {'newsbody': news_body}})
    print("News body crawling finished!")


def get_news_linksfrom_database():
    result = []
    for item in db.news.find({'label': 'www.cctd.com.cn', 'type': 'analysis'},
                             {'url': 1, '_id': 1}):
        result.append(item)
    result = pd.DataFrame(result, columns=['url', '_id'])
    return result


def get_news_text(url):
    html = requests.get(url)
    html.encoding = html.apparent_encoding
    soup = BeautifulSoup(html.text, 'html.parser')
    try:
        newsBody = str(soup.find('div', {'id': 'Zoom'}))
        print("Record crawled")
        return newsBody
    except Exception:
        print("error")
        return None


if __name__ == '__main__':
    '''Crawl the news headlines'''
    start_crawler()
    '''Crawl the full news bodies'''
    get_new_body()
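The same kind of check works for the MongoDB version. The sketch below is again an addition rather than part of the original code: it reads the documents written by start_crawler() back from the news database and loads them into a DataFrame, filtering on the label and type fields the example stored.

# Minimal read-back sketch, assuming the local mongod instance and the
# documents written by the MongoDB example above.
import pandas as pd
import pymongo

client = pymongo.MongoClient("localhost", 27017)
db = client.news

# Project only the fields stored by the crawler; drop _id for readability.
cursor = db.news.find({'label': 'www.cctd.com.cn', 'type': 'analysis'},
                      {'_id': 0, 'title': 1, 'date': 1, 'url': 1, 'newsbody': 1})
df = pd.DataFrame(list(cursor))
print(df.head())

Note that, unlike the MySQL example, the MongoDB version does not deduplicate by URL, so rerunning start_crawler() inserts duplicate documents; creating a unique index on url, for example db.news.create_index('url', unique=True), is one way to guard against that.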