Note: first collect the URL of every chapter and store each chapter title together with its URL in a .csv file; then walk through the .csv file, scrape the content of each chapter, and save it to a .txt file.
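This two-step split can be sketched independently of the actual site. In the snippet below the file name catalog.csv and the two chapter entries are placeholders, not real data:

import pandas as pd

# Hypothetical catalogue: chapter title -> chapter URL
chapter_data = {'第一章': 'http://example.com/1.html',
                '第二章': 'http://example.com/2.html'}

# Step 1: persist the catalogue as "title,url" lines
with open('catalog.csv', 'w', encoding='utf-8') as fp:
    for title, url in chapter_data.items():
        fp.write('{0},{1}\n'.format(title, url))

# Step 2: read the catalogue back and rebuild the dict,
# so the chapter pages can be fetched in a later, separate run
df = pd.read_csv('catalog.csv', header=None, encoding='utf-8')
restored = dict(zip(df.iloc[:, 0], df.iloc[:, 1]))
print(restored)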
Notes from the scraping step (a short illustration follows this list):
- tree.xpath() returns a list of lxml.etree._Element objects; it supports ordinary list operations and contains every matching element under the selected node.
- Use an if check on an XPath result before indexing into it, so an empty result does not cause an error.
- with open(path, 'mode') as fp closes the file automatically.
- Take extra care when the text itself contains English commas, because the comma is the field separator of a .csv file.
- Symbols that are not allowed in file names have to be replaced (the full-width Chinese "?" may appear in a file name).
- iloc[m, n] selects row m, column n of a DataFrame.
- dict(zip(key, value)) builds a dictionary from two parallel sequences.
- if __name__ == '__main__': guards the script's entry point.
- os.listdir(dirpath) lists the file names in a directory.
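A small self-contained illustration of the XPath and if-check points above; the HTML snippet is invented for the example and is not the novel site's real markup:

from lxml import etree

html = '<ul><li><a href="/ch1">Chapter 1</a></li><li><span>no link</span></li></ul>'
tree = etree.HTML(html)

# xpath() returns a plain Python list of lxml.etree._Element objects
li_list = tree.xpath('//li')
print(type(li_list), len(li_list))      # <class 'list'> 2

for li in li_list:
    # an empty result list is falsy, so the if check avoids an IndexError
    if li.xpath('./a/text()') and li.xpath('./a/@href'):
        name = li.xpath('./a/text()')[0]
        href = li.xpath('./a/@href')[0]
        print(name, href)               # Chapter 1 /ch1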
Notes from the word-frequency and word-cloud step (a short sketch follows this list):
- The jieba library handles Chinese word segmentation (jieba.lcut() returns a list of words).
- dict.get(key, default=None) looks a key up and falls back to the default when it is missing, which makes it easy to update the value stored for that key.
- list(dict.items()) turns the dictionary's key-value pairs into a list; each pair becomes a tuple.
- list.sort(key=lambda x: x[1], reverse=True) sorts the list in descending order, where x is each element of the list.
- '{:<5}'.format() left-aligns a value in a field five characters wide.
- '/' can be used with join() to link the words into a single string.
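A short sketch of how these pieces combine in the word-frequency step; the sample sentence is arbitrary:

import jieba

text = '今天天气很好,天气预报说明天天气也很好。'
counts = {}
for word in jieba.lcut(text):               # jieba.lcut() returns a list of words
    if len(word) < 2:                        # skip single characters and punctuation
        continue
    counts[word] = counts.get(word, 0) + 1   # dict.get with default 0 for unseen words

items = list(counts.items())                 # key-value pairs as a list of tuples
items.sort(key=lambda x: x[1], reverse=True) # most frequent words first

for i, (word, count) in enumerate(items, start=1):
    print('{0:<5}{1:<8}{2:>5}'.format(i, word, count))  # aligned columns

print('/'.join(word for word, _ in items))   # join the words with '/' for WordCloud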
Data Scraping
from lxml import etree
import requests
import pandas as pd
import os

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
}

def getChapter(url):
    # Fetch the table-of-contents page and build a {title: url} dict
    req = requests.get(url=url, headers=headers)
    req.encoding = 'utf-8'
    tree = etree.HTML(req.text)
    chapter_data = {}
    # The chapter lists sit in a fixed range of div blocks on the index page
    partList = tree.xpath('/html/body/div')[8:20]
    for part in partList:
        liList = part.xpath('./div/ul/li')
        for li in liList:
            # Only keep entries whose title and link are both present
            if li.xpath('./span/a/text()') and li.xpath('./span/a/@href'):
                section_name = li.xpath('./span/a/text()')[0]
                section_url = li.xpath('./span/a/@href')[0]
                chapter_data[section_name] = section_url
    csv_path = 'Src/盗墓笔记小说/盗墓笔记目录.csv'
    with open(csv_path, 'a') as fp:
        for title, url in chapter_data.items():
            fp.write('{0},{1}\n'.format(title, url))
    print("Successfully got the chapter data!")
    return chapter_data

def getContent(chapter_data):
    if not os.path.exists('Src/盗墓笔记小说/盗墓笔记'):
        os.makedirs('Src/盗墓笔记小说/盗墓笔记')
    for title, url in chapter_data.items():
        req = requests.get(url=url, headers=headers)
        req.encoding = 'utf-8'
        tree = etree.HTML(req.text)
        # Keep only the body paragraphs (skip the first one and the last four)
        p_list = tree.xpath('/html/body/div[7]/div[@class="content"]/p')[1:-4]
        text_path = 'Src/盗墓笔记小说/盗墓笔记/' + title + '.txt'
        with open(text_path, 'a') as fp:
            fp.write(title + '\n')
            for p in p_list:
                if p.xpath('./text()'):
                    fp.write(p.xpath('./text()')[0] + '\n')
        print(title + ' saved!')
    print("Successfully got the content!")

def main():
    # root_url = 'http://www.daomubiji.org/'
    # chapter_data = getChapter(root_url)
    df = pd.read_csv('Src/盗墓笔记小说/盗墓笔记目录.csv', header=None, encoding='gbk')
    key = []
    value = []
    # The row range was narrowed to avoid accidentally re-crawling everything;
    # use df.iloc[:, 0] and df.iloc[:, 1] to rebuild the full chapter list.
    for i in df.iloc[1:2, 0]:
        key.append(i)
    for j in df.iloc[1:2, 1]:
        value.append(j)
    chapter_data = dict(zip(key, value))
    getContent(chapter_data)

if __name__ == '__main__':
    main()
Word Cloud Creation
import jieba as jb
from wordcloud import WordCloud as wc
import matplotlib.pyplot as plt
import os

root_path = 'Src/盗墓笔记小说/盗墓笔记'

def getData():
    # Count word frequencies over all saved chapter files
    counts = {}
    if os.path.exists(root_path):
        for file_name in os.listdir(root_path):
            data_path = root_path + '/' + file_name
            with open(data_path, 'r', encoding='gbk') as fp:
                text = fp.read()
            for word in jb.lcut(text):
                if len(word) < 2:          # skip single characters and punctuation
                    continue
                counts[word] = counts.get(word, 0) + 1
    items = list(counts.items())
    items.sort(key=lambda x: x[1], reverse=True)
    # print("{0:<5}{1:<8}{2:<5}".format('序号', '词语', '频率'))
    # for i in range(20):
    #     word, count = items[i]
    #     print("{0:<5}{1:<8}{2:>5}".format(i + 1, word, count))
    return items

def getCloud(items):
    # Keep only words that appear more than 100 times and join them with '/'
    frequent_words = [word for word, count in items if count > 100]
    text_cut = '/'.join(frequent_words)
    cloud = wc(
        background_color='white',
        font_path='Src/盗墓笔记小说/fonts/simsun.ttc',  # a font with Chinese glyphs is required
        max_font_size=150,
        relative_scaling=0.6,
        width=1000,
        height=860,
        margin=2).generate(text_cut)
    plt.imshow(cloud)
    plt.axis('off')
    plt.show()

def main():
    items = getData()
    getCloud(items)

if __name__ == "__main__":
    main()
References:
[1] https://zhuanlan.zhihu.com/p/265100275
[2] https://zhuanlan.zhihu.com/p/138356932