目标: 获取CSDN首页直播内容
实现获取内容的类 Crowller
superagent 插件:获取页面信息
fs.writeFileSync:将爬取到的内容写入文件
根据以上两点实现的类:
import fs from 'fs';
import path from 'path';
import superagent from 'superagent';
import DellAnalyzer from './dellAnalyzer';

/**
 * Contract between the crawler and a page-specific analyzer.
 *
 * `analyze` receives the raw HTML of the fetched page together with the path
 * of the output file and returns the complete text to write to that file.
 */
export interface Analyzer {
  analyze: (html: string, filePath: string) => string;
}

/**
 * Fetches one page, lets the injected analyzer turn its HTML into file
 * content, and writes that content to `../data/live.json` (relative to the
 * compiled file). The crawl starts as soon as an instance is constructed.
 */
class Crowller {
  // Location of the output file.
  private readonly filePath = path.resolve(__dirname, '../data/live.json');

  /**
   * @param url - Address of the page to crawl.
   * @param analyzer - Strategy that converts the page HTML into file content.
   */
  constructor(
    private readonly url: string,
    private readonly analyzer: Analyzer
  ) {
    // A constructor cannot await, so the crawl is fire-and-forget. Attach a
    // handler so a failed request or write is reported with context instead
    // of surfacing as an unhandled promise rejection.
    this.initSpiderProcess().catch((error: unknown) => {
      console.error(`Failed to crawl ${this.url}:`, error);
      process.exitCode = 1;
    });
  }

  /** Downloads the page and returns its body as text. */
  private async getRawHtml(): Promise<string> {
    const result = await superagent.get(this.url);
    return result.text;
  }

  /** Writes the crawled content to the output file, replacing its previous content. */
  private writeFile(content: string): void {
    // The data directory may not exist on a first run; writeFileSync would
    // otherwise fail with ENOENT.
    fs.mkdirSync(path.dirname(this.filePath), { recursive: true });
    fs.writeFileSync(this.filePath, content);
  }

  /** Runs the whole pipeline: fetch, analyze, persist. */
  private async initSpiderProcess(): Promise<void> {
    const html = await this.getRawHtml();
    // The analyzer decides which parts of the page are kept.
    const fileContent = this.analyzer.analyze(html, this.filePath);
    this.writeFile(fileContent);
  }
}

// CSDN home page, which carries the live-stream information to collect.
const url = `https://www.csdn.net/?spm=3001.4476`;
// The analyzer is a singleton, so it is obtained through getInstance().
const analyzer = DellAnalyzer.getInstance();
// Constructing the crawler starts the crawl.
new Crowller(url, analyzer);
cheerio
插件,以类似jQuery的方式获取节点信息,以储存需要的内容

import fs from 'fs';
import cheerio from 'cheerio'; // jQuery-like querying of the fetched HTML
import { Analyzer } from './crowller';

/** One live-stream entry scraped from the page. */
interface Live {
  title: string;
  time: string;
}

/** The entries found in a single crawl, stamped with the crawl time. */
interface LiveResult {
  time: number;
  data: Live[];
}

/** Shape of the output file: crawl timestamp -> entries found in that crawl. */
interface Content {
  [propName: number]: Live[];
}

/**
 * Analyzer for the live-stream items on the crawled page.
 *
 * Implemented as a singleton: the constructor is private and instances are
 * obtained through `DellAnalyzer.getInstance()`.
 */
export default class DellAnalyzer implements Analyzer {
  private static instance?: DellAnalyzer;

  private constructor() {}

  /** Returns the shared instance, creating it on first use. */
  static getInstance(): DellAnalyzer {
    if (!DellAnalyzer.instance) {
      DellAnalyzer.instance = new DellAnalyzer();
    }
    return DellAnalyzer.instance;
  }

  /**
   * Extracts the title and time text of every `.www_live_item` element.
   *
   * @param html - Raw HTML of the crawled page.
   * @returns The extracted entries together with the current timestamp in ms.
   */
  private getLiveInfo(html: string): LiveResult {
    const $ = cheerio.load(html);
    const courseInfos: Live[] = [];

    // `each` rather than `map`: the callback is run only for its side effect
    // of collecting entries, no mapped collection is needed.
    $('.www_live_item').each((_index, element) => {
      const item = $(element);
      courseInfos.push({
        title: item.find('h3').text(),
        time: item.find('.text').text(),
      });
    });

    return {
      time: new Date().getTime(),
      data: courseInfos,
    };
  }

  /**
   * Loads what earlier crawls stored in the output file.
   *
   * @returns An empty record when the file is missing or empty.
   * @throws SyntaxError when the file holds invalid JSON.
   * @throws Error when the JSON root is not a plain object.
   */
  private readExistingContent(filePath: string): Content {
    if (!fs.existsSync(filePath)) {
      return {};
    }

    const raw = fs.readFileSync(filePath, 'utf-8');
    // An empty file (for example one created by hand) is not valid JSON;
    // treat it as "nothing stored yet" instead of crashing.
    if (raw.trim() === '') {
      return {};
    }

    const parsed: unknown = JSON.parse(raw);
    // An array or primitive root would silently lose the new entry when
    // serialised again, so reject it explicitly.
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      throw new Error(`Expected ${filePath} to contain a JSON object of crawl results`);
    }
    return parsed as Content;
  }

  /**
   * Merges the latest crawl into the stored history, keyed by crawl time.
   *
   * @param liveInfo - Result of the current crawl.
   * @param filePath - Output file whose existing content is extended.
   */
  private generateJsonContent(liveInfo: LiveResult, filePath: string): Content {
    const fileContent = this.readExistingContent(filePath);
    fileContent[liveInfo.time] = liveInfo.data;
    return fileContent;
  }

  /**
   * Turns page HTML into the full JSON text to write to the output file.
   *
   * @param html - Raw HTML of the crawled page.
   * @param filePath - Output file; read here to keep earlier results.
   * @returns JSON string holding earlier results plus the current crawl.
   */
  public analyze(html: string, filePath: string): string {
    const liveInfo = this.getLiveInfo(html);
    const fileContent = this.generateJsonContent(liveInfo, filePath);
    return JSON.stringify(fileContent);
  }
}
tsc:将 ts 文件转换成 js 文件
nodemon:检测到目录中的文件发生更改时,自动重新启动程序
concurrently:同时执行多条命令
相关配置信息:
(其中 dev:build 负责编译,dev:start 负责执行;package.json 是严格的 JSON,不能包含注释)
{
  "scripts": {
    "dev:build": "tsc -w",
    "dev:start": "nodemon node ./build/crowller.js",
    "dev": "concurrently npm:dev:*"
  },
  "nodemonConfig": {
    "ignore": [
      "data/*"
    ]
  }
}
通过 npm run dev,就能够在修改 ts 文件时自动编译出 js 文件,并重新执行 crowller 代码。