自动创建数据库和数据库表,如果已存在就不会创建。
特别要注意创建数据库和创建表时,设置默认字符集为utf8。
python代码
import re import requests import os import sys import time import random from xpinyin import Pinyin import numpy as np import pymysql #定义数据库类 class class_database: def __init__(self, databaseName): self.databaseName=databaseName self.host='xxx.xxx.xxx.xxx' #这里填自己服务器的公网IP self.user='xxx' #这里填服务器上mysql数据库的用户名 self.password='****' #这里填服务器上mysql数据库的密码 self.port=3306 #数据库端口 self.createDB() #创建数据库 def createDB(self): mydb = pymysql.connect( host=self.host, user=self.user, passwd=self.password ) mycursor = mydb.cursor() mycursor.execute("CREATE DATABASE IF NOT EXISTS "+self.databaseName) #连接数据库 mysql def connectDB(self): #此处添加charset='utf8'是为了在数据库中显示中文,此编码必须与数据库的编码一致 db = pymysql.connect(host=self.host,user=self.user,password=self.password, db=self.databaseName,port=self.port,charset='utf8') return db #创建表,SQL语言。CREATE TABLE IF NOT EXISTS 表示:表不存在时就创建 def createTable(self, tableName, sql): DB_create=self.connectDB() cursor_create=DB_create.cursor() cursor_create.execute(sql) DB_create.close() print('creat table '+tableName+' successfully') #数据插入表中 def inserttable(self,tableName,sql,zone,count,tuple): #print('params:'+str(params[0])+',end') #print('params:'+str(params)+',end') DB_insert=self.connectDB() cursor_insert=DB_insert.cursor() t = [count,zone,tuple[0],tuple[1],tuple[2],tuple[3],int(tuple[4]),int(tuple[5]),tuple[6],tuple[7],tuple[8],tuple[9]] cursor_insert.execute(sql,t) #cursor_insert.execute(sql,(count,zone,tuple[0],tuple[1],tuple[2],tuple[3],int(tuple[4]),int(tuple[5]),tuple[6],tuple[7],tuple[8],tuple[9])) DB_insert.commit() DB_insert.close() print('inert contents to '+tableName+' successfully') #end class class_database #定义爬虫类 class douban_local_activity(object): def __init__(self, city, activity_tag_lists): self.division_list = [] self.city = city p = Pinyin() self.city_pinyin = p.get_pinyin(city, '') self.activity_tag_lists = activity_tag_lists #Some User Agents self.hds=[{'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'},\ {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},\ {'User-Agent':'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},\ {'User-Agent':'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'}] self.cookies = {'Cookie':'bid=STBb9aHzI4Y; __yadk_uid=ZT5U34Z2UfJW17TSG3EJu33qSJqmh021; _vwo_uuid_v2=D47938034A35EBAE2398C927CAFC2B7E2|559ed3be181cf150f83a1c5e30a23c87; douban-fav-remind=1; gr_user_id=8bee0390-ec32-4d85-9f7d-f19acf2bba2c; ps=y; _ga=GA1.2.1764404011.1524626232; __utmc=30149280; __utmz=30149280.1537237849.21.13.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ct=y; viewed="1016567_6754416_1962350_1775691_30270830_27608239_27094706_6082808"; ap_v=0,6.0; loc-last-index-location-id="118282"; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1537345408%2C%22https%3A%2F%2Fbeijing.douban.com%2Fevents%2Ffuture-all-128519%3Fstart%3D20%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.1764404011.1524626232.1537339546.1537345409.29; _gid=GA1.2.1200789924.1537345415; dbcl2="183532457:8zqxbVkIaFI"; ck=WAZC; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18353; ll="118159"; __utmb=30149280.6.10.1537345409; _pk_id.100001.8cb4=48e28a093264a847.1524626231.29.1537346192.1537342847.'} print('开始爬取内容') #获取单页链接html文本 def getSource(self,url): html = requests.get(url, headers=(self.hds[np.random.randint(0,len(self.hds))]), cookies=self.cookies) time.sleep(random.random()) html.encoding = 'utf-8' return html.text #获取单项链接html文本,返回“类型” def getDetail(self,url): html = requests.get(url, headers=(self.hds[np.random.randint(0,len(self.hds))]), cookies=self.cookies) time.sleep(random.random()) html.encoding = 'utf-8' source = html.text try: type_info = re.findall('<div class="event-detail">\s+<span class="pl">类型.*itemprop="eventType">(.*?)</a>',source,re.S)[0] except: type_info = '' return str(type_info) #抓取单页所有的活动信息 def getAllEvents(self,source): try: biggerHtml = re.search('<ul class="events-list(.*?)<div class="paginator">', source, re.S).group(1) events = re.findall('(<li class="list-entry".*?</p>\s+</div>\s+</li>)', biggerHtml, re.S) except: events=[] return events #获取总页数 def getTotalPage(self,source): try: total_page = re.findall('<span class="thispage" data-total-page="(.*?)">',source,re.S)[0] except: total_page = 1 return total_page #获取所有行政区域 def getAllDivision(self,source): div_list = [] try: biggerHtml = re.search('<div class="events-nav-item events-filter-loc">(.*?)<div class="events-nav-item">', source, re.S).group(1) all_division = re.findall('(<li><a href=".*?</a></li>)', biggerHtml, re.S) for item in all_division: d_name = re.search('<li><a href=".*">\s+(.*?)\s+</a></li>',item,re.S).group(1) d_url = re.search('<li><a href="(.*?)">',item,re.S).group(1) print('d_name:'+d_name+', d_url:'+d_url) if "全部" != d_name: div_list.append([d_name, d_url]) except: pass return div_list #获取每个活动的详细信息 def getEntity(self,event,division): entity_list =[] title = re.search('<span itemprop="summary">(.*?)</span>',event,re.S).group(1) time = re.search('时间:</span>\s+(.*?)\s+<time',event,re.S).group(1) position = re.search('<li title="(.*?)">\s+<span',event,re.S).group(1) fee = re.search('<strong>(.*?)</strong>',event,re.S).group(1) #发起方 owner = re.search('<a target="db-event-owner" href=".*">(.*?)</a>',event,re.S).group(1) #参加人数 感兴趣人数 区域 join = re.search('<p class="counts">\s+<span>(.*?)人参加</span>',event,re.S).group(1) interested = re.search('<span class="pipe"></span> <span>(.*?)人感兴趣</span>',event,re.S).group(1) #url detail_url = re.search('<div class="pic">\s+<a href="(.*?)">\s+<img data-lazy="',event,re.S).group(1) url = detail_url type_info = self.getDetail(detail_url) entity_list.append([title,fee,type_info,time,join,interested,division,position,owner,detail_url]) return entity_list #保存到服务器的mysql数据库中 def save2mysql(self,activity_lists,activity_tag_lists): db_name='douban_activity' table_name=self.city_pinyin #实例化class_database()对象,创建数据库 cls_db=class_database(db_name) sql_create_table = str('create table if not exists ')+str('%s' % table_name)+\ str('(count INT PRIMARY KEY NOT NULL,zone CHAR(32),title CHAR(128),fee CHAR(32),type_info CHAR(32),time CHAR(64),join_cnt INT,interested INT,division CHAR(16),position CHAR(64),owner CHAR(32),detail_url CHAR(64)) DEFAULT CHARSET=utf8;') #创建表 cls_db.createTable(table_name, sql_create_table) p = Pinyin() #插入数据 for i in range(len(activity_tag_lists)): for j in range(len(self.division_list)): count=1 div = self.division_list[j] for bl in activity_lists[j]: sql_insert = str('insert into ')+str('%s' % table_name)+\ str('(count,zone,title,fee,type_info,time,join_cnt,interested,division,position,owner,detail_url)values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);') cls_db.inserttable(table_name, sql_insert, p.get_pinyin(div[0], ''),count,bl) count+=1 #主函数 def main(self): event_lists=[] #第一次请求url,获取行政区域的url,删除“全部”项 #url = 'https://shenzhen.douban.com/events/future-all' url = 'https://www.douban.com/location/'+self.city_pinyin+'/events/future-all' print('第一次请求url:' + url) html = self.getSource(url) self.division_list = self.getAllDivision(html) #分行政区域进行处理 for div in self.division_list: division_events = [] d_name = div[0] url = div[1] for tag in self.activity_tag_lists: #截取替换"类型",默认为“全部” tag_url=url tag_url = tag_url.split("-")[0]+str("-"+tag[1]+"-")+tag_url.split("-")[-1] #print('tag_url:%s'%tag_url) first = 1 total_page = 1 i=0 while i<total_page: link = tag_url+str('?start=%s' % (i*10)) print('正在处理页面:' + link) html = self.getSource(link) if 1 == first: total_page = (int)(self.getTotalPage(html)) print('总页数:%d'%(total_page)) first = 0 #保存数据 allEvents = self.getAllEvents(html) for item in allEvents: entity = self.getEntity(item,d_name) division_events.extend(entity) i=i+1 print('tag=%s, division_events len=%d'%(tag[0],len(division_events))) if len(division_events)>0: division_events=sorted(division_events,key=lambda x:x[4],reverse=True) event_lists.append(division_events) #保存到服务器的mysql数据库中 self.save2mysql(event_lists,self.activity_tag_lists) #end class douban_local_activity(object) #--------------------------------------------- if __name__ == '__main__': activity_tag_lists=[] activity_tag_lists.append(["全部", "all"]) ''' activity_tag_lists.append(["音乐", "music"]) activity_tag_lists.append(["戏剧", "drama"]) activity_tag_lists.append(["讲座", "salon"]) activity_tag_lists.append(["聚会", "party"]) activity_tag_lists.append(["电影", "film"]) activity_tag_lists.append(["展览", "exhibition"]) activity_tag_lists.append(["运动", "sports"]) activity_tag_lists.append(["公益", "commonweal"]) activity_tag_lists.append(["旅行", "travel"]) activity_tag_lists.append(["赛事", "competition"]) activity_tag_lists.append(["课程", "course"]) activity_tag_lists.append(["亲子", "kids"]) activity_tag_lists.append(["其他", "others"]) ''' city_lists = ['温州']#深圳 梧州 兰州 福州 武汉 南京 上海 温州 for city in city_lists: activity = douban_local_activity(city, activity_tag_lists) activity.main()
前提:已创建test数据库和dongman表结构
python代码
# -*- coding: utf-8 -*- import os,sys import requests import bs4 import pymysql#import MySQLdb #-------------- #前提:已创建test数据库和dongman表结构 #-------------- #连接MYSQL数据库 db = pymysql.connect(host='x',user='rootxx.xxx.xxx.xxx',password='root***',db='test',port=3306,charset='utf8') #db = MySQLdb.connect('127.0.0.1','root','mysql','test',coon.set_character_set('utf8')) print('连接数据库成功!') conn = db.cursor() #获取指针以操作数据库 conn.execute('set names utf8') html = 'https://www.dongmanmanhua.cn/dailySchedule?weekday=MONDAY' result = requests.get(html) texts = result.text data = bs4.BeautifulSoup(texts,'html.parser'); lidata = data.select('div#dailyList ul.daily_card li') #print(lidata) arr = {} for x in lidata: did = x.get('data-title-no') print(did) name = x.select('p.subj') name1 = name[0].get_text() url = x.a.get('href') #print(url) story = x.a.p story1 = story.string user = x.select('p.author') user1 = user[0].get_text() like = x.select('em.grade_num') like1 = like[0].get_text() #写入MYSQL数据库 t = [did,name1,url,story1,user1,like1] sql = u"INSERT INTO dongman(did,name,url,story,user,likes) VALUES(%s,%s,%s,%s,%s,%s)" conn.execute(sql,t) # t1 = (did,name1,url,story1,user1,like1) # sql1 = u'''insert into dongman(did,name,url,story,user,likes) values (%d,'%s','%s','%s','%s','%s')''' % t1 # conn.execute(sql1) db.commit() #提交操作 print('插入数据成功!') #关闭MySQL连接 conn.close() db.close()
报错问题解决方法
1.pymysql.err.DataError: (1366, “Incorrect string value: ‘\\xE7\\x9A\\x84\\xE5\\x8F\\x91...‘ for colum
对数据库和表进行字符集编辑
2.pymysql.err.IntegrityError: (1062, “Duplicate entry ‘ ‘ for key ‘PRIMARY‘“)
主键重复导致,可以修改数据库表不设置主键。
通过navicat for mysql客户端查看成功写入数据库表中的数据: