I want to crawl all the links (internal links) of a website in Node.js and get the title of each page. I found the crawler package on npm, and its documentation gives the following example:
var Crawler = require("crawler");

var c = new Crawler({
    maxConnections: 10,
    // This will be called for each crawled page
    callback: function (error, res, done) {
        if (error) {
            console.log(error);
        } else {
            var $ = res.$;
            // $ is Cheerio by default
            // a lean implementation of core jQuery designed specifically for the server
            console.log($("title").text());
        }
        done();
    }
});
// Queue just one URL, with default callback
c.queue('http://balenol.com');
But what I really want is to crawl all of the internal URLs in the site. Is that built into this plugin, or does it need to be written separately? I don't see any option in the plugin for visiting all the links in a site. Is this possible?
The following snippet crawls all the URLs it finds in every page it visits.
const Crawler = require("crawler");

let obsolete = []; // URLs that have already been queued, so we don't crawl them twice
let c = new Crawler();

function crawlAllUrls(url) {
    console.log(`Crawling ${url}`);
    c.queue({
        uri: url,
        callback: function (err, res, done) {
            if (err) {
                console.error(err);
                return done();
            }
            let $ = res.$;
            try {
                // Iterate over every anchor tag on the page
                $("a").each((i, el) => {
                    let href = $(el).attr("href");
                    if (!href) return;
                    href = href.trim(); // Trim before the duplicate check so "foo" and "foo " count as one URL
                    if (!obsolete.includes(href)) {
                        obsolete.push(href);
                        // Slow down the crawl so we don't hammer the server
                        setTimeout(function () {
                            // Absolute links are crawled as-is; relative links are appended
                            // to the current URL. The latter might need extra code to test
                            // that it is the same site and a full domain with no URI.
                            href.startsWith('http') ? crawlAllUrls(href) : crawlAllUrls(`${url}${href}`);
                        }, 5000);
                    }
                });
            } catch (e) {
                console.error(`Encountered an error crawling ${url}. Aborting crawl.`);
            }
            done();
        }
    });
}
crawlAllUrls('https://github.com/evyatarmeged/');
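Since the question asks specifically about internal links, here is a minimal sketch of how the `${url}${href}` concatenation above could be replaced with proper URL resolution plus a same-host check. `resolveInternal` is a hypothetical helper, not part of the crawler package, and it assumes the start URL from the example above:

const { URL } = require("url"); // also available as a global in modern Node

const ROOT = new URL("https://github.com/evyatarmeged/"); // the start URL; adjust as needed

// Hypothetical helper (not part of the crawler package): resolve href
// against the page it was found on, and return the absolute URL only if
// it stays on the same host as the start URL; otherwise return null.
function resolveInternal(href, currentPageUrl) {
    try {
        const resolved = new URL(href, currentPageUrl);
        return resolved.hostname === ROOT.hostname ? resolved.href : null;
    } catch (e) {
        return null; // malformed hrefs such as "javascript:void(0)"
    }
}

// Usage inside the callback above, in place of the startsWith('http') ternary:
// const next = resolveInternal(href, url);
// if (next) crawlAllUrls(next);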
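Also, rather than delaying each recursive call with setTimeout, the crawler package itself exposes a rateLimit option (milliseconds to wait between requests), which keeps the throttling in one place. A minimal sketch, replacing the Crawler constructor call in the snippet above:

const Crawler = require("crawler");

// Throttle at the crawler level: wait 5000 ms between requests,
// instead of a per-link setTimeout
let c = new Crawler({
    rateLimit: 5000
});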