クローラーの写経してみた - フロントエンドデベロッパーのメモ

前からクローラー作ってみたいなって思ってたけど、なかなか余裕がないし設計アイデアが浮かばなかったから、とりあえずググってみたらわかりやすいのが見つかった:「How to make a web crawler in JavaScript / Node.js」。とりあえず完璧に写経しただけやけど、非常にシンプルで考え方がわかりやすかった。ここからさらに外部リンクまで検索してくれるようにアップデートしてみたいな。

var request = require("request");
var cheerio = require("cheerio");
var URL = require("url-parse");

// ---- Crawl configuration ----
// Page the crawl starts from.
var START_URL = "http://www.arstechnica.com";
// Word searched for in each visited page.
var SEARCH_WORD = "stemming";
// Upper bound on the number of pages that will be requested.
var MAX_PAGES_TO_VISIT = 10;

// Protocol + hostname of the start page (e.g. "http://host").
var url = new URL(START_URL);
var baseUrl = url.protocol + "//" + url.hostname;

// ---- Crawl state ----
// URLs that have already been requested, used as a set (url -> true).
var pagesVisited = {};
// Number of pages requested so far.
var numPagesVisited = 0;
// Pending URLs, seeded with the start page.
var pagesToVisit = [START_URL];

// Kick off the crawl.
crawl();

/**
 * Drives the crawl: pops the next URL off `pagesToVisit` and hands it to
 * `visitPage`, passing `crawl` itself as the continuation.
 *
 * Stops (without visiting anything) when `MAX_PAGES_TO_VISIT` pages have
 * been visited, or when no unvisited URL is left in `pagesToVisit`.
 */
function crawl() {
  if (numPagesVisited >= MAX_PAGES_TO_VISIT) {
    console.log("Reached max limit of number of pages to visit.");
    return;
  }

  var nextPage = pagesToVisit.pop();
  // Skip already-visited URLs with a loop rather than recursion, so a long
  // run of duplicates in the queue cannot grow the call stack. The
  // own-property check avoids matching inherited keys of the plain object.
  while (
    nextPage !== undefined &&
    Object.prototype.hasOwnProperty.call(pagesVisited, nextPage)
  ) {
    console.log("Run the crawler again");
    nextPage = pagesToVisit.pop();
  }

  // pop() yields undefined once the queue is exhausted; there is nothing
  // left to request, so end the crawl instead of visiting `undefined`.
  if (nextPage === undefined) {
    console.log("No more pages to visit.");
    return;
  }

  // New page we haven't visited yet.
  console.log("pagesVisited: ", pagesVisited, nextPage);
  visitPage(nextPage, crawl);
}

/**
 * Requests one page, searches it for `SEARCH_WORD`, and queues its internal
 * links when the word is not found.
 *
 * The URL is recorded in `pagesVisited` and counted in `numPagesVisited`
 * before the request is sent.
 *
 * @param {string} url - Absolute URL of the page to request.
 * @param {Function} callback - Continuation invoked with no arguments when
 *   the crawl should move on: after a failed request, a non-200 response,
 *   or a page that does not contain the word. It is NOT invoked when the
 *   word is found, which ends the crawl.
 */
function visitPage(url, callback) {
  // Add the page to our visited set.
  pagesVisited[url] = true;
  numPagesVisited++;
  console.log("candidates... ", pagesVisited);

  console.log("Visiting page " + url);
  request(url, function(error, response, body) {
    // A failed request (DNS failure, refused connection, ...) reports an
    // error and leaves `response` unset; skip this page and keep crawling
    // instead of crashing on `response.statusCode`.
    if (error || !response) {
      console.log("Error while visiting " + url + ": " + error);
      callback();
      return;
    }

    // Check status code (200 is HTTP OK).
    console.log("Status code : " + response.statusCode);
    if (response.statusCode !== 200) {
      callback();
      return;
    }

    // Parse the document body.
    var $ = cheerio.load(body);
    if (searchForWord($, SEARCH_WORD)) {
      // Word found: report it and stop (callback intentionally not called).
      console.log("Word " + SEARCH_WORD + " found at page " + url);
      return;
    }

    collectInternalLinks($);
    // In this short program, our callback is just calling crawl().
    callback();
  });
}

/**
 * Reports whether the page body contains `word`, ignoring letter case.
 *
 * @param {Function} $ - Document query function; `$("html > body").text()`
 *   must return the body text.
 * @param {string} word - Word to look for.
 * @returns {boolean} true when the body text contains the word.
 */
function searchForWord($, word) {
  // Lower-case BOTH sides: lowering only the word would miss any
  // capitalised occurrence on the page.
  var bodyText = $("html > body").text().toLowerCase();
  return bodyText.indexOf(word.toLowerCase()) !== -1;
}

/**
 * Queues the page's internal links for crawling.
 *
 * Every anchor whose href starts with "/" is turned into an absolute URL by
 * prefixing `baseUrl` and pushed onto `pagesToVisit`. Protocol-relative
 * hrefs ("//host/path") are skipped: they also start with "/", but they
 * point at another host and would produce a bogus URL when appended to
 * `baseUrl`.
 *
 * @param {Function} $ - Query function for the loaded page.
 */
function collectInternalLinks($) {
  var relativeLinks = $("a[href^='/']");
  console.log("Found " + relativeLinks.length + " relative links");
  relativeLinks.each(function() {
    var href = $(this).attr("href");
    if (href.indexOf("//") === 0) {
      // Protocol-relative link to a different host; not an internal page.
      return;
    }
    pagesToVisit.push(baseUrl + href);
  });
}