2023年6月29日发(作者:)
有一天突然看了 htmlparser 工具包，发现果然强大。由于不是很熟悉，所以下面的代码或许写得有点烂。
1 首先做准备工作，先写一个实体 bean
package bean;

import java.util.Date;
/**
* @authorJeson
* blog
* @date:Oct 9, 2009 3:09:19 PM
* @version :1.0
*
*/
publicclassArtical {
private String title;
private String body;
private String link;
private String author;
private String [] tags;
private Date dCreate;
public String getTitle() {
return title;
}
publicvoidsetTitle(String title) {
= title;
}
public String getBody() {
return body;
}
publicvoidsetBody(String body) {
= body;
}
public String getLink() {
return link;
}
publicvoidsetLink(String link) {
= link;
}
public String getAuthor() {
return author;
}
publicvoidsetAuthor(String author) {
= author;
}
public String[] getTags() {
return tags;
}
publicvoidsetTags(String[] tags) {
= tags;
}
public Date getDCreate() {
returndCreate;
}
publicvoidsetDCreate(Date create) {
dCreate = create;
}
}
2 写一个我们下面要用到的字符串处理类
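package util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;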
/**
 * Small regular-expression helpers for strings.
 *
 * <p>Created Oct 9, 2009.
 *
 * @author Jeson
 * @version 1.0
 */
public class StringUtil {

    /**
     * Tests whether the whole of {@code txt} matches {@code regex}.
     *
     * @param regex the regular expression
     * @param txt   the string to test
     * @return {@code true} if the entire string matches, {@code false} otherwise
     */
    public static boolean useRegex(String regex, String txt) {
        Pattern p = Pattern.compile(regex);
        Matcher m = p.matcher(txt);
        return m.matches();
    }

    /**
     * Finds the first occurrence of {@code regex} in {@code txt} and returns one
     * of its capture groups.
     *
     * @param regex the regular expression
     * @param index the capture group to return (0 is the whole match)
     * @param txt   the string to search
     * @return the requested group of the first match, or {@code null} when the
     *         pattern does not occur in {@code txt}
     */
    public static String getByRegex(String regex, int index, String txt) {
        Pattern p = Pattern.compile(regex);
        Matcher m = p.matcher(txt);
        if (m.find()) {
            return m.group(index);
        }
        return null;
    }

    /**
     * Finds the first occurrence of {@code regex} in {@code txt} and returns
     * several of its capture groups at once.
     *
     * <p>The result is addressed by group number: group {@code g} is stored at
     * {@code result[g]}. The array is therefore sized to the largest requested
     * group number plus one, so that sparse requests such as {@code {2}} do not
     * overflow; slots for groups that were not requested stay {@code null}.
     *
     * @param regex the regular expression
     * @param index the capture group numbers to extract (0 is the whole match)
     * @param txt   the string to search
     * @return an array indexed by group number; every slot is {@code null} when
     *         the pattern does not occur in {@code txt}
     */
    public static String[] getStringsByRegex(String regex, int[] index,
            String txt) {
        // Size by the highest group number, because slots are addressed by group.
        int size = 0;
        for (int group : index) {
            if (group + 1 > size) {
                size = group + 1;
            }
        }
        String[] res = new String[size];
        Pattern p = Pattern.compile(regex);
        Matcher m = p.matcher(txt);
        if (m.find()) {
            for (int group : index) {
                res[group] = m.group(group);
            }
        }
        return res;
    }
}
3 下面是我们的核心类 他会去抓取cnblogs的页面并保存
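package test;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.util.NodeList;

import bean.Artical;
import util.StringUtil;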
/**
* @authorJeson
* @blog
* @date:Oct 9, 2009 1:08:10 PM
* @version :1.0
*
*/
publicclass Parse {
privatestaticfinalint MAX_PAGE = 20;
privatefinal String ENCODING = "UTF-8";
/**
* @paramargs
*/
publicstaticvoid main(String[] args) {
try {
for(int i=1;i new Parse().testAttribute(i); } } catch (Exception e) { // TODO Auto-generated catch block tackTrace(); } } privatevoidtestAttribute(int pa) throws Exception{ n("————开始解析页面:"+pa); Parser p = new Parser(); ("/cate/java/?page="+pa); oding("UTF-8"); NodeFilter filter = newHasAttributeFilter("class","titlelnk"); NodeList list = tAllNodesThatMatch(filter); n(()); intcou = 0; for(int i=0 ; i<();i++){ String html = tAt(i).toHtml(true); int [] index = {0,1,2}; String [] bs = ingsByRegex(" class="titlelnk" href="(.*)" target="_blank">(.*) index, html); String title = bs[2]; String url = bs[1]; n(url); String content = getContent(bs[1]); if(content == null || "".equals(content)){ continue; } Artical art = newArtical(); le(title); y(content); k(url); createFile(art); n("=========="+(i+1)+"============"); n("title==>"+bs[2]); n("url==>"+bs[1]); n("content==>"+getContent(bs[1])); n("======================"); n(); cou++; } n("over"+cou); } private String getContent(String url) throws Exception{ Parser p = new Parser(); (url); oding(ENCODING); NodeFilter filter = newHasAttributeFilter("class","post"); NodeList list = tAllNodesThatMatch(filter); String a = (); return a; } privatevoidcreateFile(Artical art){ try { File d = new File("d:cnblog"); if(!()){ (); } File f = new File("d:cnblog"+le()+".html"); if(!()){ NewFile(); n("——–>"+le()+"文件已经创建"); } OutputStream file = newFileOutputStream(h()); (y().getBytes()); (); (); n("文件写入完毕,地址"+h()); } catch (FileNotFoundException e) { tackTrace(); } catch (IOException e) { tackTrace(); n(k()+" "+le()+"文件写入失败"); } } }
发布者:admin,转载请注明出处:http://www.yc00.com/xiaochengxu/1687982288a63489.html
评论列表(0条)