package com.dapeng.method; import java.io.IOException; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jsoup.Jsoup; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import com.dapeng.bean.Article; public class UtilMethod { /** * Document* @param url * @return Document对象 */ public static Document getDocument(String url)
{ Document doc = null; try { doc = Jsoup.connect(url).timeout(5000).get(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace();
} return doc; } /** * 根据获得的Document对象找到章节标题 * @param doc * @return 标题 */ public static String getTitle(Document doc){ return doc.getElementById("title").text();
} /** * * @param doc * @return 内容 */ public static String getContent(Document doc){ if(doc.getElementById("content") != null){ return doc.getElementById("content").text();
}else{ return null; } } /** * 根据获得的Document对象找到下1章的Url地址 * @param doc * @return 下1章Url */ public static String getNextUrl(Document doc){ Element ul = doc.select("ul").first();
String regex = "<li><a href="(.*?)">下1页</a></li>"; Pattern pattern = Pattern.compile(regex); Matcher matcher = pattern.matcher(ul.toString()); Document nextDoc = null;
if (matcher.find()) { nextDoc = Jsoup.parse(matcher.group()); Element href = nextDoc.select("a").first(); return "http://www.bxwx.org/b/5/5131/" + href.attr("href"); }else{ return null;
} } /** * 根据url获得id * @param url * @return id */ public static String getId(String url){ String urlSpilts[] = url.split("/"); return (urlSpilts[urlSpilts.length - 1]).split(".")[0]; } /** *
@param url * @return */ public static Article getArticle(String url){ Article article = new Article(); article.setUrl(url); Document doc = getDocument(url); article.setId(getId(url));
article.setTitle(getTitle(doc)); article.setNextUrl(getNextUrl(doc)); article.setContent(getContent(doc)); return article; } }package com.dapeng.bean;
/**
 * Plain data holder for one crawled chapter.
 */
public class Article {

    /** Chapter id. */
    private String id;

    /** Chapter title. */
    private String title;

    /** Chapter text content. */
    private String content;

    /** URL of this chapter. */
    private String url;

    /** URL of the following chapter; null when none was found. */
    private String nextUrl;

    /** @return the chapter id */
    public String getId() {
        return id;
    }

    /** @param id the chapter id */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the chapter title */
    public String getTitle() {
        return title;
    }

    /** @param title the chapter title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the chapter text content */
    public String getContent() {
        return content;
    }

    /** @param content the chapter text content */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the URL of this chapter */
    public String getUrl() {
        return url;
    }

    /** @param url the URL of this chapter */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the URL of the following chapter */
    public String getNextUrl() {
        return nextUrl;
    }

    /** @param nextUrl the URL of the following chapter */
    public void setNextUrl(String nextUrl) {
        this.nextUrl = nextUrl;
    }

    /**
     * Renders all five fields in a single bracketed line.
     *
     * @return a string of the form
     *         {@code Article [id=..., title=..., content=..., url=..., nextUrl=...]}
     */
    @Override
    public String toString() {
        return "Article [id=" + id + ", title=" + title + ", content="
                + content + ", url=" + url + ", nextUrl=" + nextUrl + "]";
    }
}
package com.dapeng.test;

import com.dapeng.bean.Article;
import com.dapeng.method.UtilMethod;
public class GetArticles {

    /**
     * Walks the chapter chain starting from a fixed first URL. Each step
     * fetches the chapter the current one links to and prints its id and
     * title; the walk stops when the current chapter has no next URL, has no
     * content, or has the id "996627". The first chapter itself is fetched
     * but not printed.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final String startUrl = "http://www.bxwx.org/b/5/5131/832882.html";
        Article current = UtilMethod.getArticle(startUrl);

        while (true) {
            String following = current.getNextUrl();
            if (following == null) {
                break;
            }
            if (current.getContent() == null) {
                break;
            }
            if (current.getId().equals("996627")) {
                break;
            }
            current = UtilMethod.getArticle(following);
            System.out.println(current.getId() + "----" + current.getTitle());
        }
    }
}