Commit 7e9e9e25 authored by Xiaowu Zhang

crawler.js: crawler http header

parent 3a894868
......@@ -3,6 +3,7 @@
// SPDX-License-Identifier: GPL-3.0-or-later
var fs = require("fs");
const cheerio = require('cheerio');
var args = require("yargs")
.usage("Usage : crawler.js -link website_link -f file_name")
.demandOption(['link'])
......@@ -39,15 +40,14 @@ crawler.maxConcurrency = 5;
crawler.maxDepth = depth;
// Handler for simplecrawler's "fetchcomplete" event: records each fetched
// resource exactly once and refreshes the progress counter on the terminal.
//
// queueItem      - queue entry for the fetched resource; its url, stateData
//                  and referrer fields are copied into the recorded entry.
// responseBuffer - body of the response (not used here).
// response       - the HTTP response object (not used here).
//
// The Content-Type header is deliberately not inspected: every fetched
// resource is recorded regardless of its MIME type, and because the header is
// never dereferenced a response that lacks it cannot raise a TypeError.
// Entries have the same shape as the ones pushed by the "fetcherror" handler.
crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
    // Move the cursor to column 0 so the counter overwrites itself in place.
    readline.cursorTo(process.stdout, 0);
    process.stdout.write(count + "");
    count += 1;
    url_list.push({
        "loc": queueItem.url,
        "stateData": queueItem.stateData,
        "referrer": queueItem.referrer
    });
});
// Fire callback
......@@ -56,7 +56,6 @@ crawler.on("complete", function() {
console.log("crawled " + url_list.length + " urls");
var xml_dict = {
"urlset": {
"@xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9",
"url": url_list
}
};
......@@ -66,10 +65,32 @@ crawler.on("complete", function() {
});
// Handler for simplecrawler's "fetcherror" event: reports the failing status
// code and URL on the console, then records the URL in url_list together with
// its stateData and referrer.
//
// queueItem - queue entry of the resource that failed to fetch.
// response  - the HTTP response; only its statusCode is read.
crawler.on("fetcherror", function(queueItem, response) {
    // An empty line first, so the message does not share a line with the
    // in-place progress counter.
    console.log("");
    console.log(`Error ${response.statusCode} while fetching ${queueItem.url}`);

    const failedEntry = {
        "loc": queueItem.url,
        "stateData": queueItem.stateData,
        "referrer": queueItem.referrer
    };
    url_list.push(failedEntry);
});
// Custom resource discovery for the crawler: parses the fetched body as
// markup with cheerio and returns the raw attribute values of
//   1. every <a href>,
//   2. every <link href>,
//   3. every <script src>,
// concatenated in that order.
//
// buffer    - body of the fetched resource; decoded as UTF-8 before parsing.
// queueItem - queue entry of the resource (accepted but not used here).
//
// Returns an array of the attribute strings exactly as they appear in the
// document (no resolution or de-duplication is done in this function).
crawler.discoverResources = function(buffer, queueItem) {
    const $ = cheerio.load(buffer.toString("utf8"));

    // [selector, attribute] pairs; the list order fixes the output order.
    const targets = [
        ["a[href]", "href"],
        ["link[href]", "href"],
        ["script[src]", "src"]
    ];

    let discovered = [];
    for (const [selector, attribute] of targets) {
        const values = $(selector).map(function () {
            return $(this).attr(attribute);
        }).get();
        discovered = discovered.concat(values);
    }
    return discovered;
};
// Start Crawl
crawler.start();
......
......@@ -12,6 +12,7 @@
"dependencies": {
"simplecrawler": "^1.1.9",
"xmlbuilder": "^15.1.1",
"yargs": "^16.2.0"
"yargs": "^16.2.0",
"cheerio": "^0.22.0"
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment