jdk 1.8
<dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.10.2</version> </dependency>
/**
 * Thread pool used for the data-crawling tasks.
 * Created with {@code Executors.newFixedThreadPool(10)}, i.e. a fixed pool of 10 threads.
 */
public static ExecutorService exec = Executors.newFixedThreadPool(10);
// Load the list of crawl targets (each DemoPO carries a URL) from the database.
log.info("从数据库获取爬取url列表");
// Build a query example for DemoPO restricted to entries whose "type" equals NUMONE.
Example example = new Example(DemoPO.class);
example.createCriteria().andEqualTo("type",NUMONE);
List<DemoPO> poList = DemoMapper.selectByExample(example);
// Completion service on top of the shared pool `exec`; each task yields a List<DemoPO>,
// and finished tasks can be retrieved in completion order via take().
CompletionService<List<DemoPO>> everyWeekCs = new ExecutorCompletionService<>(exec);
不同来源的 URL,页面的具体解析方式有所不同:
// Submit one crawl task per entry of poList. The parsing variant (NUMONE / NUMTWO /
// NUMTHREE) is selected by the entry's source.
for (DemoPO po : poList) {
    String source = po.getSource();
    if ("test1".equals(source)) {
        everyWeekCs.submit(() -> getEveryWeekPoFromDocument(po, NUMONE, day));
    } else if ("test2".equals(source)) {
        everyWeekCs.submit(() -> getEveryWeekPoFromDocument(po, NUMTWO, day));
    } else if ("test3".equals(source)) {
        everyWeekCs.submit(() -> getEveryWeekPoFromDocument(po, NUMTHREE, day));
    } else {
        // Unknown (or null) source: nothing to crawl, but a task must still be submitted.
        // The consumer calls take() once per poList entry and would otherwise block
        // forever waiting for a result that was never queued.
        everyWeekCs.submit(() -> new ArrayList<DemoPO>());
    }
}
使用 Jsoup 的 API,根据页面标签解析并提取数据:
// Crawl the list page referenced by po.getUrl(), follow every entry's link to its
// detail page, and return one DemoPO (title + content HTML) per successfully parsed entry.
List<DemoPO> list = new ArrayList<>();
String url = po.getUrl();
// Fetch the list page once and parse that same response, instead of issuing a second
// request with get().
Connection.Response response = Jsoup.connect(url).execute();
if (response.statusCode() == 200) {
    Document doc = response.parse();
    List<Element> elements = doc.getElementsByClass("xlayer02 yh ohd clear");
    for (Element element : elements) {
        String title = element.select("a").text();
        String contentUrl = element.select("a").attr("href");
        // The href is prefixed with "http:" as in the original code.
        // NOTE(review): this presumes protocol-relative links ("//host/path") - confirm.
        Connection.Response res = Jsoup.connect("http:" + contentUrl).execute();
        if (res.statusCode() != 200) {
            continue;
        }
        Document contentDoc = res.parse();
        List<Element> containers = contentDoc.getElementsByClass("xcc font14 yh ohd clear");
        if (containers.isEmpty()) {
            // Detail page lacks the expected content container: skip this entry rather
            // than failing the whole task with an IndexOutOfBoundsException.
            continue;
        }
        String content = containers.get(0).getElementsByTag("p").toString();
        // Store the result in a fresh object per entry. Writing into the shared input
        // `po` would add the same instance repeatedly, each overwrite clobbering the last.
        // NOTE(review): other fields of `po` are not copied onto the new object - confirm
        // whether callers need them.
        DemoPO demoPo = new DemoPO();
        demoPo.setTitle(title);
        demoPo.setContent(content);
        list.add(demoPo);
    }
}
return list;
List<DemoPO> list = new ArrayList<>(); //按任务完成顺序获取值,减少阻塞获取值的所需时间 for (int i = 0;i<poList.size();i++){ list.addAll(everyWeekCs.take().get()); }