今天学习了一些简单的爬虫知识,并应用这些知识撸了一个爬取古诗的程序
主要使用的第三方库:requests,bs4
直接上代码:
spider.py:
# -*- coding:utf-8 -*-
# spider.py
"""Interactive console crawler for classical Chinese poems on so.gushiwen.cn.

Fetches one listing page at a time, extracts the poem texts embedded in the
page's <textarea> tags, and offers a small numeric menu to view a poem, move
to the next page, re-crawl the current page, or quit.
"""
import re
import sys

import bs4
import requests

from poem import Poem

# Seconds to wait for the server before giving up; without a timeout
# requests.get() may block indefinitely.
_REQUEST_TIMEOUT = 10

# Matches one poem's text inside the stringified <textarea> tag list.
# NOTE(review): `[^w]` excludes only the literal letter "w"; confirm this was
# not meant to be `\w`. The pattern is kept exactly as the author wrote it.
_POEM_TEXT_PTN = re.compile(r'>(.[^w]*?《[\u4e00-\u9fa5·]+?》.+?aspx)<')


def getPoem(poemText):
    """Wrap a raw poem text in a ``Poem`` object.

    Args:
        poemText: one string captured by the poem-text regex.

    Returns:
        The ``Poem`` built from ``poemText`` (class defined in ``poem.py``).
    """
    return Poem(poemText)


def page_turning():
    """Point ``poem_url`` at page number ``page`` and advance ``page`` by one."""
    global poem_url_0, page, poem_url
    poem_url = poem_url_0 + '&page=' + str(page)
    page += 1


# Base listing URL; the xstr value is the URL-encoded character "诗".
poem_url_0 = 'https://so.gushiwen.cn/shiwens/default.aspx?tstr=&astr=&cstr=&xstr=%e8%af%97'
page = 1
poem_url = ''
page_turning()  # initialise poem_url to the first page


def _fetch_poems(url):
    """Download ``url`` and return the list of ``Poem`` objects found on it.

    Returns an empty list (after printing an error) when the request fails,
    so the caller can keep the interactive session alive.
    """
    try:
        poem_html = requests.get(url, timeout=_REQUEST_TIMEOUT)  # raw HTML document
        poem_html.raise_for_status()
    except requests.RequestException as err:
        print('错误:网页请求失败: %s' % err)
        return []
    soup = bs4.BeautifulSoup(poem_html.text, 'html.parser')  # parse the HTML
    textarea_tags = soup.find_all('textarea')  # tags that carry the poem text
    # The regex runs over the string form of the whole tag list.
    poem_texts = _POEM_TEXT_PTN.findall(str(textarea_tags))
    return [getPoem(text) for text in poem_texts]


def _print_listing(poem_lst):
    """Print the crawled poems (if any) followed by the menu options."""
    if poem_lst:
        # poemCout / index / title come from Poem (poem.py, not shown here);
        # presumably poemCout is the number of poems created -- verify there.
        print('共爬取到 %d 首古诗词:' % poem_lst[0].poemCout)
        for p in poem_lst:
            print('#%2d %s' % (p.index, p.title))
    else:
        # Previously an empty result crashed with IndexError on poem_lst[0].
        print('未爬取到任何古诗词.')
    print('#0 爬取下一页')
    print('#-1 重新爬取')
    print('#-2 退出')


def spider():
    """Run the interactive crawl/menu session until the user chooses to exit.

    Reads the module-level ``poem_url`` for each crawl. Menu choices:
    a number from 1 to the number of listed poems shows that poem, ``0``
    crawls the next page, ``-1`` re-crawls the current page, and ``-2`` exits
    via ``sys.exit()``. Any other input prints an error and prompts again.
    """
    # An outer loop replaces the original self-recursion, which never returned
    # and therefore deepened the call stack on every page turn or re-crawl.
    while True:
        poem_lst = _fetch_poems(poem_url)
        _print_listing(poem_lst)
        while True:
            try:
                num = int(input('输入编号查看对应详细内容或进行其他操作:'))
            except ValueError:
                # Non-numeric input is treated like any other invalid number.
                print('错误:输入的号码有误.')
                continue
            if 1 <= num <= len(poem_lst):
                poem_lst[num - 1].showPoem()
            elif num == 0:
                page_turning()
                break  # crawl the next page
            elif num == -1:
                break  # re-crawl the current page
            elif num == -2:
                sys.exit()
            else:
                print('错误:输入的号码有误.')


# Script entry point, kept at module level as in the original.
spider()