Writing a Crawler in JS: How to Complete a Crawler Project with JS

Preface

This is my first attempt at writing a crawler, so there are certainly places where the code could be better; still, the general approach is as shown here, and I will optimize it later. I wrote this crawler for a news site in VS Code, which you can install yourself. A few preparations are needed before we start.

1. Install Node.js
2. Install MySQL
3. Decide which page to crawl
4. Inspect the page source: right-click the news page, choose View Source, and work out how the content we want to extract is marked up.
5. Start writing the crawler

First install the packages we need by running the following in the terminal:

```
npm i request
npm i cheerio
npm i iconv-lite
npm i date-utils
```

These are the packages the crawler will reference (example):

```js
var fs = require('fs');              // file system, used if we also dump JSON to disk
var myRequest = require('request');  // fetches page content
var myCheerio = require('cheerio');  // queries and filters the page's HTML
var myIconv = require('iconv-lite'); // converts character encodings
require('date-utils');               // adds Date.prototype.toFormat
var mysql = require('./mysql.js');   // the database helper created below
```

Calling MySQL from Node:

```
npm install mysql
```

Create a file (here called mysql.js) with the following code (example):

```js
var mysql = require("mysql");
var pool = mysql.createPool({
    host: '127.0.0.1',
    user: 'root',
    password: 'root',
    database: 'crawl'
});
var query = function(sql, sqlparam, callback) {
    pool.getConnection(function(err, conn) {
        if (err) {
            callback(err, null, null);
        } else {
            conn.query(sql, sqlparam, function(qerr, vals, fields) {
                conn.release();               // release the connection
                callback(qerr, vals, fields); // event-driven callback
            });
        }
    });
};
var query_noparam = function(sql, callback) {
    pool.getConnection(function(err, conn) {
        if (err) {
            callback(err, null, null);
        } else {
            conn.query(sql, function(qerr, vals, fields) {
                conn.release();               // release the connection
                callback(qerr, vals, fields); // event-driven callback
            });
        }
    });
};
exports.query = query;
exports.query_noparam = query_noparam;
```
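One thing the write-up never shows is the fetches table itself. As a minimal sketch, assuming a schema inferred from the INSERT statement used later (column names come from that statement; column sizes are my guesses), a one-off setup script could be:

```js
// setup.js: create the fetches table in the crawl database (schema is an assumption)
var mysql = require('./mysql.js');

var createSql = 'CREATE TABLE IF NOT EXISTS fetches (' +
    'id INT UNSIGNED AUTO_INCREMENT PRIMARY KEY,' +
    'url VARCHAR(200) UNIQUE,' +  // unique, so the same page is never stored twice
    'source_name VARCHAR(50),' +
    'source_encoding VARCHAR(45),' +
    'title VARCHAR(200),' +
    'keywords VARCHAR(200),' +
    'author VARCHAR(100),' +
    'publish_date VARCHAR(50),' + // DATE also works, since the crawler normalizes to YYYY-MM-DD
    'crawltime DATETIME,' +
    'content LONGTEXT)';

mysql.query_noparam(createSql, function(err, vals, fields) {
    if (err) console.log(err);
    else console.log('fetches table is ready');
});
```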
With the database helper in place, define the site to visit:

```js
var source_name = "新华网";
var myEncoding = "utf-8";
var seedURL = 'http://www.xinhuanet.com/';
```

Define how specific elements are read out of a news page, and which URLs count as news pages:

```js
var seedURL_format = "$('a')";
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")";
var title_format = "$('title').text()";
var author_format = "$('.editor').text()";
var content_format = "$('.left_zw').text()";
var date_format = "$('.info').text()";
var desc_format = " $('meta[name=\"description\"]').eq(0).attr(\"content\")";
var source_format = "$('#source_baidu').text()";

var url_reg = /\d{4}-\d{2}\/\d{2}/; // article links carry a date segment such as "2023-06/29"
var regExp = /((\d{4}|\d{2})(\-|\/|\.)\d{1,2}\3\d{1,2})|(\d{4}年\d{1,2}月\d{1,2}日)/;
```

The regular expression in url_reg should be written to match the link pattern of whatever news site you crawl; here I key on the year-month-day timestamp that appears in the article links.
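To check that the two patterns behave as intended, you can exercise them against sample strings in the Node REPL (the article URL below is made up for illustration):

```js
// made-up inputs, just to exercise url_reg and regExp from above
console.log(url_reg.test('http://www.xinhuanet.com/politics/2023-06/29/c_1129722103.htm')); // true
console.log(regExp.exec('2023-06-29')[0]);    // '2023-06-29'  (dashed form, first alternative)
console.log(regExp.exec('2023年6月29日')[0]); // '2023年6月29日' (Chinese form, second alternative)
```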
Construct a request that imitates a browser:

```js
var headers = {
    // present a desktop browser User-Agent so the site serves the normal page
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
};

// fetch a url asynchronously with the request module
function request(url, callback) {
    var options = {
        url: url,
        encoding: null,   // keep the raw buffer; iconv-lite does the decoding
        //proxy: 'http://127.0.0.1:8080',
        headers: headers,
        timeout: 10000
    };
    myRequest(options, callback);
}
```

Read the seed page, pull out every link in it, normalize each link, and crawl the ones that match the news-URL regular expression:

```js
request(seedURL, function(err, res, body) { // read the seed page
    var html = myIconv.decode(body, myEncoding);            // convert the encoding with iconv
    var $ = myCheerio.load(html, { decodeEntities: true }); // parse the html with cheerio

    var seedurl_news;
    try {
        seedurl_news = eval(seedURL_format);
    } catch (e) { console.log('failed to locate the html block holding the url list: ' + e) };

    seedurl_news.each(function(i, e) { // iterate over every <a> link on the seed page
        var myURL = "";
        try {
            // derive the concrete news url
            var href = "";
            href = $(e).attr("href");
            if (typeof(href) == "undefined") { // some anchors carry no href
                return true;
            }
            if (href.toLowerCase().indexOf('http://') >= 0 ||
                href.toLowerCase().indexOf('https://') >= 0) myURL = href;       // already absolute
            else if (href.startsWith('//')) myURL = 'http:' + href;              // protocol-relative
            else myURL = seedURL.substr(0, seedURL.lastIndexOf('/') + 1) + href; // everything else: treat as relative
        } catch (e) { console.log('failed to extract a news link from the seed page: ' + e) }

        if (!url_reg.test(myURL)) return; // keep only urls that match the news-url pattern
        newsGet(myURL);                   // read the news page
    });
});
```
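The string splicing above is fragile for links like ../politics/page.htm. As a sturdier alternative (my suggestion, not part of the original code), Node's built-in WHATWG URL class resolves every kind of href against the page it came from:

```js
// resolve an href (absolute, protocol-relative, or relative) against a base url;
// returns null for anchors that are not http(s) links
function normalizeHref(href, base) {
    try {
        var u = new URL(href, base); // handles './x', '../x', '//host/x', and full urls
        return (u.protocol === 'http:' || u.protocol === 'https:') ? u.href : null;
    } catch (e) {
        return null; // href was not a parsable url at all
    }
}

// usage inside the each() loop above:
//   var myURL = normalizeHref($(e).attr("href"), seedURL);
//   if (myURL && url_reg.test(myURL)) newsGet(myURL);
```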
Read the concrete news page and construct an empty fetch object to hold the data:

```js
function newsGet(myURL) { // read the news page
    request(myURL, function(err, res, body) {
        var html_news = myIconv.decode(body, myEncoding);            // convert the encoding with iconv
        var $ = myCheerio.load(html_news, { decodeEntities: true }); // parse html_news with cheerio
        var myhtml = html_news;

        console.log("decoded and read successfully: " + myURL);
        // dynamically evaluate the format strings and build a json object
        // ready to be written to a file or the database
        var fetch = {};
        fetch.title = "";
        fetch.content = "";
        fetch.publish_date = (new Date()).toFormat("YYYY-MM-DD");
        //fetch.html = myhtml;
        fetch.url = myURL;
        fetch.source_name = source_name;
        fetch.source_encoding = myEncoding; // encoding
        fetch.crawltime = new Date();
```

Read the individual elements of the news page and store them in the fetch object:

```js
        if (keywords_format == "") fetch.keywords = source_name; // no keywords? use the source name
        else fetch.keywords = eval(keywords_format);

        if (title_format == "") fetch.title = "";
        else fetch.title = eval(title_format); // title

        if (date_format == "") fetch.publish_date = "";
        else fetch.publish_date = eval(date_format); // publication date

        try {
            // normalize the scraped date text with the regExp defined earlier;
            // pages without a recognizable date are skipped by the catch below
            fetch.publish_date = regExp.exec(fetch.publish_date)[0];
            fetch.publish_date = fetch.publish_date.replace('年', '-').replace('月', '-').replace('日', '');
            fetch.publish_date = new Date(fetch.publish_date).toFormat("YYYY-MM-DD");

            if (author_format == "") fetch.author = source_name; // author
            else fetch.author = eval(author_format);

            if (content_format == "") fetch.content = "";
            else fetch.content = eval(content_format).replace("\r\n" + fetch.author, ""); // body text; whether to strip the author line is up to you

            if (source_format == "") fetch.source = fetch.source_name;
            else fetch.source = eval(source_format).replace("\r\n", ""); // source

            if (desc_format == "") fetch.desc = fetch.title;
            else fetch.desc = eval(desc_format).replace("\r\n", ""); // abstract
        } catch (e) { return; } // if a page cannot be parsed, skip it

        // var filename = source_name + "_" + (new Date()).toFormat("YYYY-MM-DD") +
        //     "_" + myURL.substr(myURL.lastIndexOf('/') + 1) + ".json";
        // fs.writeFileSync(filename, JSON.stringify(fetch)); // alternative: store as json on disk

        var fetchAddSql = 'INSERT INTO fetches(url,source_name,source_encoding,title,' +
            'keywords,author,publish_date,crawltime,content) VALUES(?,?,?,?,?,?,?,?,?)';
        var fetchAddSql_Params = [fetch.url, fetch.source_name, fetch.source_encoding,
            fetch.title, fetch.keywords, fetch.author, fetch.publish_date,
            fetch.crawltime.toFormat("YYYY-MM-DD HH24:MI:SS"), fetch.content
        ];

        // run the sql; the url column of the fetches table is unique,
        // so duplicate urls are never written to the database twice
        mysql.query(fetchAddSql, fetchAddSql_Params, function(qerr, vals, fields) {
            if (qerr) {
                console.log(qerr);
            }
        }); // mysql write
    });
}
```
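A side note on all the eval(...) calls: evaluating selector strings works, but a plain object that maps field names to functions does the same job without eval's risks. A possible refactor, my suggestion rather than part of the original:

```js
// selector functions instead of eval'd format strings
var extractors = {
    title:    function($) { return $('title').text(); },
    author:   function($) { return $('.editor').text(); },
    content:  function($) { return $('.left_zw').text(); },
    keywords: function($) { return $('meta[name="keywords"]').eq(0).attr('content'); }
};

// usage inside newsGet: fetch.title = extractors.title($);
```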
The try {} catch (e) { return; } is there so that any page whose fields cannot be extracted is simply skipped. At this point the crawler is essentially done.

6. Building a website to query the crawled contents of the database

First, create an HTML page to serve as the web front end.
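A minimal front-end sketch, assuming a single form that sends the search keyword to the /process_get route defined in the back end below (the file name index.html is my assumption):

```html
<!DOCTYPE html>
<html>
<body>
    <!-- a form that sends the search keyword to the back end as ?title=... -->
    <form action="http://127.0.0.1:8080/process_get" method="GET">
        Title: <input type="text" name="title">
        <input type="submit" value="Submit">
    </form>
</body>
</html>
```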
Next, create the back end (example):

```js
var express = require('express');
var mysql = require('./mysql.js');
var app = express();
var cors = require('cors');
app.use(cors());
//app.use(express.static('public'));

app.get('/', function(req, res) {
    res.sendFile(__dirname + "/" + "index.html");
})
app.get('/index.html', function(req, res) {
    res.sendFile(__dirname + "/" + "index.html");
})
app.get('/process_get', function(req, res) {
    res.writeHead(200, { 'Content-Type': 'text/html;charset=utf-8' }); // answer in utf-8
    // sql string and parameter
    var fetchSql = "select url,source_name,title,author,publish_date from fetches where title like '%" +
        req.query.title + "%'";
    console.log(fetchSql);
    mysql.query(fetchSql, function(err, result, fields) {
        console.log(result);
        res.end(JSON.stringify(result));
    });
})
var server = app.listen(8080, function() {
    console.log("serving at 127.0.0.1:8080/")
})
```

(Note that concatenating req.query.title straight into the SQL string is open to SQL injection; the parameterized query helper in mysql.js would avoid this.)

Run it with Node, visit 127.0.0.1:8080/, and click Submit to see the query results.

Using the express generator to scaffold a website

From the command line, in the directory used for the files above:

```
express -e search_site
```

This generates a search_site folder. Since we need mysql, copy mysql.js into that folder, then run the following, in order, inside search_site:

```
npm install mysql --save
npm install
```

Open search_site/routes/index.js in VS Code and add:

```js
// assumes something like: var mysql = require('../mysql.js'); near the top of the file
router.get('/process_get', function(request, response) {
    // sql string and parameter
    var fetchSql = "select url,source_name,title,author,publish_date " +
        "from fetches where title like '%" + request.query.title + "%'";
    mysql.query(fetchSql, function(err, result, fields) {
        response.writeHead(200, { "Content-Type": "application/json" });
        response.write(JSON.stringify(result));
        response.end();
    });
});
```

Finally, create a search page under search_site/public/.
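A minimal sketch of that search page, assuming it calls the /process_get route with fetch() and renders the rows as a table (the file name search.html and the markup are my choices):

```html
<!DOCTYPE html>
<html>
<body>
    <input type="text" id="title" placeholder="title keyword">
    <button onclick="doSearch()">Search</button>
    <table id="results" border="1"></table>
    <script>
        // query the express route added above and list one table row per result
        function doSearch() {
            var title = document.getElementById('title').value;
            fetch('/process_get?title=' + encodeURIComponent(title))
                .then(function(res) { return res.json(); })
                .then(function(rows) {
                    var table = document.getElementById('results');
                    table.innerHTML = '<tr><th>url</th><th>source_name</th><th>title</th><th>author</th><th>publish_date</th></tr>';
                    rows.forEach(function(row) {
                        var tr = document.createElement('tr');
                        ['url', 'source_name', 'title', 'author', 'publish_date'].forEach(function(col) {
                            var td = document.createElement('td');
                            td.textContent = row[col];
                            tr.appendChild(td);
                        });
                        table.appendChild(tr);
                    });
                });
        }
    </script>
</body>
</html>
```

Start the scaffolded site with npm start (port 3000 by default) and open 127.0.0.1:3000/search.html to search the crawled news.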