蜘蛛程序网络爬虫源代码|江阴雨辰互联

2023年6月29日发(作者：)

有一天偶然看了 htmlparser 工具包，发现果然强大。由于还不是很熟悉，所以下面的代码或许写得有点烂。

1 首先做准备工作，先写一个实体 bean：

package bean;

import java.util.Date;

/**

* @authorJeson

* blog

* @date:Oct 9, 2009 3:09:19 PM

* @version :1.0

publicclassArtical {

private String title;

private String body;

private String link;

private String author;

private String [] tags;

private Date dCreate;

public String getTitle() {

return title;

}

publicvoidsetTitle(String title) {

= title;

}

public String getBody() {

return body;

}

publicvoidsetBody(String body) {

= body;

}

public String getLink() {

return link;

}

publicvoidsetLink(String link) {

= link;

}

public String getAuthor() {

return author;

}

publicvoidsetAuthor(String author) {

= author;

}

public String[] getTags() {

return tags;

}

publicvoidsetTags(String[] tags) {

= tags;

}

public Date getDCreate() {

returndCreate;

}

publicvoidsetDCreate(Date create) {

dCreate = create;

}

2 接着写一个下面要用到的字符串处理类：

package util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small regular-expression helpers used by the crawler.
 *
 * <p>Each call compiles {@code regex} afresh; a {@code null} or syntactically
 * invalid pattern propagates the exception thrown by
 * {@link Pattern#compile(String)}.
 *
 * @author Jeson
 * @version 1.0 (Oct 9, 2009 3:09:19 PM)
 */
public class StringUtil {

    /**
     * Tests whether the whole of {@code txt} matches {@code regex}.
     *
     * @param regex the regular expression
     * @param txt   the string to validate
     * @return {@code true} if the entire string matches, {@code false} otherwise
     */
    public static boolean useRegex(String regex, String txt) {
        Pattern p = Pattern.compile(regex);
        Matcher m = p.matcher(txt);
        return m.matches();
    }

    /**
     * Returns one capture group of the first occurrence of {@code regex} in
     * {@code txt}.
     *
     * @param regex the regular expression
     * @param index the capture-group number to return (0 is the whole match)
     * @param txt   the string to search
     * @return the text of the requested group, or {@code null} if the pattern
     *         does not occur in {@code txt}
     * @throws IndexOutOfBoundsException if the pattern has no group {@code index}
     */
    public static String getByRegex(String regex, int index, String txt) {
        Matcher m = Pattern.compile(regex).matcher(txt);
        if (m.find()) {
            return m.group(index);
        }
        return null;
    }

    /**
     * Returns several capture groups of the first occurrence of {@code regex}
     * in {@code txt}.
     *
     * <p>The result is addressed by group number: {@code result[g]} holds group
     * {@code g} for every {@code g} listed in {@code index}. The array is at
     * least {@code index.length} long and is grown to fit the largest requested
     * group number, so sparse requests such as {@code {1, 3}} no longer overrun
     * it. Slots that were not requested stay {@code null}.
     *
     * @param regex the regular expression
     * @param index the capture-group numbers to extract
     * @param txt   the string to search
     * @return the extracted groups; every slot is {@code null} if the pattern
     *         does not occur in {@code txt}
     * @throws IndexOutOfBoundsException if the pattern lacks a requested group
     */
    public static String[] getStringsByRegex(String regex, int[] index, String txt) {
        // Size for the largest group number so res[groupNo] is always in range.
        int size = index.length;
        for (int groupNo : index) {
            size = Math.max(size, groupNo + 1);
        }
        String[] res = new String[size];

        Matcher m = Pattern.compile(regex).matcher(txt);
        if (m.find()) {
            for (int groupNo : index) {
                res[groupNo] = m.group(groupNo);
            }
        }
        return res;
    }
}

3 下面是我们的核心类，它会去抓取 cnblogs 的页面并保存到本地：

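package test;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.util.NodeList;

import bean.Artical;
import util.StringUtil;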

/**

* @authorJeson

* @blog

* @date:Oct 9, 2009 1:08:10 PM

* @version :1.0

publicclass Parse {

privatestaticfinalint MAX_PAGE = 20;

privatefinal String ENCODING = "UTF-8";

/**

* @paramargs

publicstaticvoid main(String[] args) {

try {

for(int i=1;i

new Parse().testAttribute(i);

}

} catch (Exception e) {

// TODO Auto-generated catch block

tackTrace();

}

privatevoidtestAttribute(int pa) throws Exception{

n("————开始解析页面："+pa);

Parser p = new Parser();

("/cate/java/?page="+pa);

oding("UTF-8");

NodeFilter filter = newHasAttributeFilter("class","titlelnk");

NodeList list = tAllNodesThatMatch(filter);

n(());

intcou = 0;

for(int i=0 ; i<();i++){

String html = tAt(i).toHtml(true);

int [] index = {0,1,2};

String [] bs = ingsByRegex("

class="titlelnk" href="(.*)" target="_blank">(.*)",

index, html);

String title = bs[2];

String url = bs[1];

n(url);

String content = getContent(bs[1]);

if(content == null || "".equals(content)){

continue;

}

Artical art = newArtical();

le(title);

y(content);

k(url);

createFile(art);

n("=========="+(i+1)+"============");

n("title==>"+bs[2]);

n("url==>"+bs[1]);

n("content==>"+getContent(bs[1]));

n("======================");

n();

cou++;

}

n("over"+cou);

}

private String getContent(String url) throws Exception{

Parser p = new Parser();

(url);

oding(ENCODING);

NodeFilter filter = newHasAttributeFilter("class","post");

NodeList list = tAllNodesThatMatch(filter);

String a = ();

return a;

}

privatevoidcreateFile(Artical art){

try {

File d = new File("d:cnblog");

if(!()){

();

}

File f = new

File("d:cnblog"+le()+".html");

if(!()){

NewFile();

n("——–>"+le()+"文件已经创建");

}

OutputStream file = newFileOutputStream(h());

(y().getBytes());

();

n("文件写入完毕，地址"+h());

} catch (FileNotFoundException e) {

tackTrace();

} catch (IOException e) {

tackTrace();

n(k()+" "+le()+"文件写入失败");

}

发布者：admin，转载请注明出处：http://www.yc00.com/xiaochengxu/1687982288a63489.html

蜘蛛程序网络爬虫源代码

发表回复

评论列表（0条）

联系我们

400-800-8888

蜘蛛程序 网络爬虫 源代码

相关推荐

发表回复

评论列表（0条）

联系我们

400-800-8888

蜘蛛程序网络爬虫源代码