Spider-Flow平台以流程图的方式定义爬虫,无需再编码爬虫,是一个高度灵活可配置的爬虫平台
源代码:https://github.com/ssssssss-team/spider-flow
JDK >= 1.8
Mysql >= 5.7
Maven >= 3.0
1、在Mysql创建数据库和相关数据表
-- Spider-Flow schema bootstrap for MySQL >= 5.7.
-- Creates the `spiderflow` database (if missing) and recreates every sp_* table.
-- WARNING: each table is dropped before it is created, so existing rows are lost.

-- Foreign-key checks are switched off while tables are dropped and recreated;
-- they are switched back on at the end of the script.
SET FOREIGN_KEY_CHECKS = 0;

-- IF NOT EXISTS keeps the script re-runnable, matching the DROP TABLE IF EXISTS below.
CREATE DATABASE IF NOT EXISTS spiderflow;
USE spiderflow;

-- Crawler flow definitions and their scheduling state.
DROP TABLE IF EXISTS `sp_flow`;
CREATE TABLE `sp_flow` (
    `id`                varchar(32)  NOT NULL,
    `name`              varchar(64)  DEFAULT NULL COMMENT '任务名字',
    `xml`               longtext     DEFAULT NULL COMMENT 'xml表达式',
    `cron`              varchar(255) DEFAULT NULL COMMENT 'cron表达式',
    `enabled`           char(1)      DEFAULT '0' COMMENT '任务是否启动,默认未启动',
    `create_date`       datetime     DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    `last_execute_time` datetime     DEFAULT NULL COMMENT '上一次执行时间',
    `next_execute_time` datetime     DEFAULT NULL COMMENT '下一次执行时间',
    `execute_count`     int(8)       DEFAULT NULL COMMENT '定时执行的已执行次数',
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '爬虫任务表';

-- JDBC data sources (driver class, URL, credentials).
DROP TABLE IF EXISTS `sp_datasource`;
CREATE TABLE `sp_datasource` (
    `id`                varchar(32)  NOT NULL,
    `name`              varchar(255) DEFAULT NULL,
    `driver_class_name` varchar(255) DEFAULT NULL,
    `jdbc_url`          varchar(255) DEFAULT NULL,
    `username`          varchar(64)  DEFAULT NULL,
    `password`          varchar(32)  DEFAULT NULL,
    `create_date`       datetime     NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- Named variables (name / value / description).
DROP TABLE IF EXISTS `sp_variable`;
CREATE TABLE `sp_variable` (
    `id`          int(11)      NOT NULL AUTO_INCREMENT,
    `name`        varchar(32)  DEFAULT NULL COMMENT '变量名',
    `value`       varchar(512) DEFAULT NULL COMMENT '变量值',
    `description` varchar(255) DEFAULT NULL COMMENT '变量描述',
    `create_date` datetime     DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8mb4;

-- Added in v0.3.0: one row per task run, with begin/end timestamps.
DROP TABLE IF EXISTS `sp_task`;
CREATE TABLE `sp_task` (
    `id`         int(11)     NOT NULL AUTO_INCREMENT,
    `flow_id`    varchar(32) NOT NULL,
    `begin_time` datetime    DEFAULT NULL,
    `end_time`   datetime    DEFAULT NULL,
    PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=7 DEFAULT CHARSET=utf8mb4;

-- Added in v0.4.0: user-defined functions stored as JS scripts.
DROP TABLE IF EXISTS `sp_function`;
CREATE TABLE `sp_function` (
    `id`          varchar(32)  CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
    `name`        varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '函数名',
    `parameter`   varchar(512) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '参数',
    `script`      text         CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL COMMENT 'js脚本',
    `create_date` datetime(0)  NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_general_ci ROW_FORMAT = Dynamic;

-- Added in v0.5.0: per-flow notification settings ('1' = on, '0' = off).
DROP TABLE IF EXISTS `sp_flow_notice`;
CREATE TABLE `sp_flow_notice` (
    `id`               varchar(32)  NOT NULL,
    `recipients`       varchar(200) DEFAULT NULL COMMENT '收件人',
    `notice_way`       char(10)     DEFAULT NULL COMMENT '通知方式',
    `start_notice`     char(1)      DEFAULT '0' COMMENT '流程开始通知:1:开启通知,0:关闭通知',
    `exception_notice` char(1)      DEFAULT '0' COMMENT '流程异常通知:1:开启通知,0:关闭通知',
    `end_notice`       char(1)      DEFAULT '0' COMMENT '流程结束通知:1:开启通知,0:关闭通知',
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '爬虫任务通知表';

-- Restore foreign-key enforcement for the rest of the session.
SET FOREIGN_KEY_CHECKS = 1;
2、将源码下载到本地,修改pom.xml,将mysql驱动依赖的版本修改为与自己所使用的mysql相匹配的版本,我这里使用的是8.0.27版本的mysql
<!-- Database-related dependencies -->
<dependency>
    <groupId>com.baomidou</groupId>
    <artifactId>mybatis-plus-boot-starter</artifactId>
    <version>${mybatis.plus.version}</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <!-- Connector/J version; pick the one matching your MySQL server -->
    <version>8.0.27</version>
    <scope>runtime</scope>
</dependency>
3、修改spider-flow-web/src/main/resources
下的application.properties;
( 如果mysql驱动(mysql-connector-java)版本在6以下,spring.datasource.driver-class-name
配置为com.mysql.jdbc.Driver
)
# HTTP port the embedded server listens on
server.port=8088
# ...... (other settings left unchanged)
# Driver class for MySQL Connector/J 6+ (use com.mysql.jdbc.Driver for Connector/J 5.x)
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
spring.datasource.username=root
spring.datasource.password=******
# The database name must be `spiderflow`, the database created by the SQL script in step 1
spring.datasource.url=jdbc:mysql://localhost:3306/spiderflow?useSSL=false&useUnicode=true&characterEncoding=UTF8&autoReconnect=true
在最上级目录spider-flow-master/
,打包创建JAR包
mvn clean package
打包完成后,在spider-flow-master/spider-flow-web/target/
路径下会有一个spider-flow.jar,进入该目录,输入命令执行JAR包
java -jar spider-flow.jar
当看到以下日志信息即表示JAR包执行成功
Tomcat started on port(s): 8088 (http) with context path '' Started SpiderApplication in 8.278 seconds (JVM running for 9.067)
最后浏览器访问:http://localhost:8088/ 即可使用平台
文章参考
https://smile.blog.csdn.net/article/details/121987245