# Search script: queries the index in viewsdu.db (built by search_engine_build-2.py below)
# and ranks matching pages by TF-IDF.
from urllib import request
from bs4 import BeautifulSoup
import lxml  # needed for BeautifulSoup's 'lxml' parser
import sqlite3
import jieba
import math

conn = sqlite3.connect("viewsdu.db")
c = conn.cursor()
c.execute('select count(*) from doc')
N = 1 + c.fetchall()[0][0]  # +1 keeps idf positive even for terms that appear in every document
target = input("Enter search keywords: ")
seggen = jieba.cut_for_search(target)  # segment the query with jieba
score = {}
for word in seggen:
    print('query term: ', word)
    tf = {}
    c.execute('select list from word where term=?', (word,))
    result = c.fetchall()
    if len(result) > 0:
        doclist = result[0][0]
        doclist = doclist.split(' ')
        doclist = [int(x) for x in doclist]
        df = len(set(doclist))   # document frequency: number of distinct docs containing the term
        idf = math.log(N / df)
        print('idf: ', idf)
        for num in doclist:      # repeated doc ids encode term frequency
            if num in tf:
                tf[num] = tf[num] + 1
            else:
                tf[num] = 1
        for num in tf:
            if num in score:
                score[num] = score[num] + tf[num] * idf
            else:
                score[num] = tf[num] * idf
sortedlist = sorted(score.items(), key=lambda d: d[1], reverse=True)
print('score list', sortedlist)
cnt = 0
for num, docscore in sortedlist:
    cnt = cnt + 1
    c.execute('select link from doc where id=?', (num,))
    url = c.fetchall()[0][0]
    print(url, 'score: ', docscore)
    try:
        response = request.urlopen(url)
        content = response.read().decode('utf-8')
    except Exception:
        print("failed to fetch page")
        continue
    soup = BeautifulSoup(content, 'lxml')
    title = soup.title
    if title is None:
        print('No title.')
    else:
        title = title.text
        print(title)
    if cnt > 10:  # only show the top results
        break
if cnt == 0:
    print("no results found")
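For reference, the ranking above is plain TF-IDF: a page's score for a query term is tf * log(N/df), summed over all query terms. Below is a minimal, self-contained sketch of that computation on made-up posting lists; the terms, doc ids, and counts are invented for illustration and are not read from viewsdu.db.

import math

# Hypothetical posting lists: term -> doc ids, one entry per occurrence
# (same encoding as the 'list' column above, just hard-coded here).
postings = {
    '招生': [3, 3, 7, 12],      # twice in doc 3, once each in docs 7 and 12
    '简章': [3, 12, 12, 12],
}
N = 101  # 1 + number of indexed documents, as in the script above

score = {}
for term, doclist in postings.items():
    df = len(set(doclist))            # distinct docs containing the term
    idf = math.log(N / df)
    tf = {}
    for doc_id in doclist:            # term frequency per document
        tf[doc_id] = tf.get(doc_id, 0) + 1
    for doc_id, freq in tf.items():
        score[doc_id] = score.get(doc_id, 0) + freq * idf

print(sorted(score.items(), key=lambda kv: kv[1], reverse=True))
# Docs 12 and 3 outrank doc 7 because they match both terms and match more often.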
# search_engine_build-2.py (crawl pages and build the inverted index)
from collections import deque
from urllib import request
import re
from bs4 import BeautifulSoup
import lxml  # needed for BeautifulSoup's 'lxml' parser
import sqlite3
import jieba

url = 'https://www.fjut.edu.cn/561/list.htm'  # crawl entry point (e.g. 'http://www.zut.edu.cn')
unvisited = deque()  # queue of links still to crawl (breadth-first search)
visited = set()      # set of links already visited
unvisited.append(url)

conn = sqlite3.connect("viewsdu.db")
c = conn.cursor()
# If the tables are left over from a previous run, drop them first so they can be recreated.
# c.execute('drop table doc')
c.execute('create table doc (id int primary key,link text)')
# c.execute('drop table word')
c.execute('create table word (term varchar(25) primary key,list text)')
conn.commit()
conn.close()

print('*************** start *****************************')
cnt = 0
while unvisited:
    url = unvisited.popleft()
    visited.add(url)
    cnt += 1
    print('crawling link', cnt, ':', url)

    # fetch the page
    try:
        response = request.urlopen(url)
        content = response.read().decode('utf-8')
    except Exception:
        continue

    # Collect further links to crawl. The crawl stays inside the site, so only
    # site-internal links are accepted; the exact pattern depends on the target site.
    soup = BeautifulSoup(content, 'lxml')
    all_a = soup.find_all('a', {'target': "_blank"})  # all news links <a> on this page
    for a in all_a:
        x = a.attrs['href']  # link URL
        if not re.match(r'^/', x):
            continue
        x = 'https://www.fjut.edu.cn' + x
        if (x not in visited) and (x not in unvisited):
            unvisited.append(x)
    a = soup.find('a', {'class': "next"})  # 'next page' link <a>
    if a is not None:
        x = a.attrs['href']
        x = 'https://www.fjut.edu.cn/' + x
        if (x not in visited) and (x not in unvisited):
            unvisited.append(x)

    # Extract the title and body text; the selectors are tailored to this site's page layout.
    title = soup.title
    article = soup.find('div', class_="Article_Content")
    if article and article.find_all(re.compile("^p")):
        all_p = article.find_all(re.compile("^p"))
        article = ""
        for p in all_p:
            p_str = p.get_text("", strip=True)
            p_str = ''.join(p_str.split())
            article += p_str
        print(article)
    elif article and article.find_all(re.compile("^div")):
        all_p = article.find_all(re.compile("^div"))
        article = ""
        for p in all_p:
            p_str = p.get_text("", strip=True)
            p_str = ''.join(p_str.split())
            article += p_str
        print(article)
    else:
        article = ''
    if title is None:
        print('page with no content.')
        continue
    else:
        title = title.text
        title = ''.join(title.split())
        print('page title:', title)

    # Segment the extracted title and article text with jieba.
    seggen = jieba.cut_for_search(title)
    seglist = list(seggen)
    seggen = jieba.cut_for_search(article)
    seglist += list(seggen)

    # Store the document and update the inverted index.
    conn = sqlite3.connect("viewsdu.db")
    c = conn.cursor()
    c.execute('insert into doc values(?,?)', (cnt, url))
    for word in seglist:
        # check whether the term is already in the database
        c.execute('select list from word where term=?', (word,))
        result = c.fetchall()
        if len(result) == 0:
            # not indexed yet: start a new posting list
            docliststr = str(cnt)
            c.execute('insert into word values(?,?)', (word, docliststr))
        else:
            # already indexed: append this doc id to the posting list
            docliststr = result[0][0]
            docliststr += ' ' + str(cnt)
            c.execute('update word set list=? where term=?', (docliststr, word))
    conn.commit()
    conn.close()
print('index built')
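Run this build script before the search script above, since both read and write viewsdu.db. Each row of the word table holds a term and a space-separated posting list of doc ids; a doc id repeated k times means the term occurred k times on that page. A quick way to sanity-check the index is to open viewsdu.db and print a few rows, as in the sketch below (the table and column names come from the script above; which terms appear depends on what was crawled).

import sqlite3

conn = sqlite3.connect("viewsdu.db")
c = conn.cursor()

# How many pages and how many distinct terms were indexed?
c.execute('select count(*) from doc')
print('documents indexed:', c.fetchone()[0])
c.execute('select count(*) from word')
print('distinct terms:', c.fetchone()[0])

# Peek at a few posting lists: repeated doc ids encode term frequency.
c.execute('select term, list from word limit 5')
for term, doclist in c.fetchall():
    print(term, '->', doclist)

conn.close()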