Casperjs使用casper.each迭代链接列表

Lau*_*nce 9 javascript phantomjs casperjs

我正在尝试使用Casperjs从页面获取链接列表,然后打开每个链接,并从这些页面向数组对象添加特定类型的数据.

我遇到的问题是在每个列表项上执行的循环.

首先,我从原始页面获得了一个 listOfLinks.这部分工作正常,通过 length 我可以确认这个列表已被填充.

但是,使用如下所示的 this.each 循环时,没有任何控制台输出出现,casperjs 似乎跳过了这个代码块.

将 this.each 替换为标准 for 循环后,只执行了第一个链接的一部分:语句 "Creating new array in object for x.html" 只出现一次,然后代码就停止执行.使用 IIFE 也不会改变这种情况.

编辑:在详细调试模式下,会发生以下情况:

Creating new array object for https://example.com 
[debug] [phantom] Navigation requested: url=about:blank, type=Other, willNavigate=true, isMainFrame=true
Run Code Online (Sandbox Code Playgroud)

因此,由于某种原因,传递给thenOpen函数的URL变为空白...

我觉得 Casperjs 的异步特性中有些东西我还没有掌握,如果有人能给我指出一个可用的示例,我将非常感激.

// Step callback: timestamps the run, reads the elements with class
// 'importantLink' from the current page, and for each one queues a thenOpen()
// on its href, storing results in `object` under the timestamp key.
casper.then(function () {

  var date = Date.now();
  console.log(date);

  var object = {};
  object[date] = {}; // per-run container, keyed by the timestamp taken above

  // NOTE(review): the evaluate() callback returns the collection produced by
  // getElementsByClassName directly. CasperJS documents that values crossing
  // evaluate() must be JSON-serializable, so DOM elements presumably do not
  // arrive intact here — confirm against the CasperJS evaluate() docs.
  var listOfLinks = this.evaluate(function(){
    console.log("getting links");
    return document.getElementsByClassName('importantLink');
  });

  console.log(listOfLinks.length);

  // each() invokes the callback with the casper instance (`self`) and one item.
  this.each(listOfLinks, function(self, link) {

    var eachPageHref = link.href;

    console.log("Creating new array in object for " + eachPageHref);

    object[date][eachPageHref] = []; // placeholder list for this page's items

    self.thenOpen(eachPageHref, function () {

      // The collected list is held only in `listOfItems`, which is local to
      // this thenOpen callback and is never written into `object`.
      var listOfItems = this.evaluate(function() {
        var items = [];
        // Perform DOM manipulation to get items
        return items;
      });
    });

    // NOTE(review): `items` is declared only inside the evaluate() callback
    // above, so it is not in scope on this line.
    object[date][eachPageHref] = items;

  });
  // NOTE(review): this runs while the current step function is still executing;
  // the thenOpen callbacks queued above presumably run later, so the per-page
  // results would not be filled in yet — confirm against CasperJS step ordering.
  console.log(JSON.stringify(object));

});
Run Code Online (Sandbox Code Playgroud)

Vav*_*off 4

我决定使用我们自己的 Stackoverflow.com 作为演示站点来运行您的脚本。我在您的代码中纠正了一些小问题,结果就是从 PhantomJS 赏金问题中获取评论的练习。

\n\n
// CasperJS script (ES5, runs under PhantomJS): opens a Stack Overflow tag
// listing, collects the question links, visits each one, and prints a JSON
// object of the form { <timestamp>: { <questionUrl>: [commentText, ...] } }.
var casper = require('casper').create();

casper
.start()
.open('http://stackoverflow.com/questions/tagged/phantomjs?sort=featured&pageSize=30')
.then(function () {

    var date = Date.now();
    var object = {};
    object[date] = {}; // results of this run, keyed by the start timestamp

    var listOfLinks = this.evaluate(function(){

        // Getting links to other pages to scrape. The hrefs are extracted
        // here, inside the page, so that a plain array of strings is what
        // gets returned from page.evaluate.
        var links = [].map.call(document.querySelectorAll("#questions .question-hyperlink"), function(link) {
            return link.href;
        });
        return links;
    });

    // Now iterate over that array of links
    this.each(listOfLinks, function(self, eachPageHref) {

        object[date][eachPageHref] = []; // array for page to store names

        self.thenOpen(eachPageHref, function () {

            // Getting comments from each page, also as a plain array
            var listOfItems = this.evaluate(function() {
                var items = [].map.call(document.getElementsByClassName("comment-text"), function(comment) {
                    return comment.innerText;
                });
                return items;
            });
            // Assigned inside the thenOpen callback, where listOfItems is in scope
            object[date][eachPageHref] = listOfItems;
        });
    });

    // Queued as a further step so it runs after the thenOpen steps added
    // above: output the resulting object once every link has been scraped.
    this.then(function(){
        console.log(JSON.stringify(object));
    });
});

casper.run();
Run Code Online (Sandbox Code Playgroud)

更改内容:page.evaluate现在返回简单数组,这是 casper.each() 正确迭代所需的。href属性立即在 page.evaluate 中提取。还有这个修正:

\n\n
 object[date][eachPageHref] = listOfItems; // previously assigned `items`, which was undefined in this scope
Run Code Online (Sandbox Code Playgroud)

脚本运行的结果是

\n\n
{"1478596579898":{"http://stackoverflow.com/questions/40410927/phantomjs-from-node-on-windows":["en.wikipedia.org/wiki/File_URI_scheme – Igor 2 days ago\n","@Igor is there something in particular you see wrong, or are you suggesting the phantom module has an incorrect URI? – Danny Buonocore 2 days ago\n","Probably windows security issue not allowing to run an unsigned program. – Vaviloff yesterday\n"],"http://stackoverflow.com/questions/40412726/casperjs-iterating-over-a-list-of-links-using-casper-each":["Thanks, this looked really promising. I made the changes but it didn't solve the problem. And I just realised that in debug mode the following happens: Creating new array object for https://example.com [debug] [phantom] Navigation requested: url=about:blank, type=Other, willNavigate=true, isMainFrame=true and then Casperjs silently fails. It seems that the correct link that gets passed into thenOpen gets changed to about:blank... – cyc665 yesterday\n"]}}
Run Code Online (Sandbox Code Playgroud)