joh*_*ual 14 javascript beautifulsoup phantomjs casperjs
可以使用PhantomJS来替代BeautifulSoup吗?
我正在尝试搜索Etsy并访问所有链接.在Python中,我知道如何做到这一点(使用BeautifulSoup)但今天我想知道我是否可以使用PhantomJS做同样的事情.我没有走得太远.
该脚本应在Etsy上搜索"hello kitty"并返回所有产品
<a class="listing-thumb" href=...></a>并在控制台中打印.理想情况下,我稍后会访问它们并获取我需要的信息.现在它只是冻结了.有任何想法吗?
// Opens the Etsy search results for "hello kitty" and attempts to print the
// href of every a.listing-thumb link found on the page, then exits PhantomJS.
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';
// NOTE(review): `status` is received by the callback but never inspected.
page.open(url, function(status){
// list all the a.href links in the hello kitty etsy page
// NOTE(review): the callback hands back the NodeList itself rather than plain
// strings; PhantomJS may not be able to pass DOM objects out of evaluate() — confirm.
var link = page.evaluate(function() {
return document.querySelectorAll('a.listing-thumb');
});
// NOTE(review): if `link` comes back null, reading .length throws here and
// phantom.exit() below is never reached.
for(var i = 0; i < link.length; i++){ console.log(link[i].href); }
phantom.exit();
});
Run Code Online (Sandbox Code Playgroud)
我也试用过CasperJS,它可能更适合这类任务.
NiK*_*iKo 35
PhantomJS evaluate()无法序列化并返回复杂对象,如HTMLElements或NodeLists,因此您必须将它们映射到可序列化的东西:
// Loads the Etsy search results for "hello kitty" and prints the href attribute
// of every a.listing-thumb link, one per line, then exits PhantomJS.
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';
page.open(url, function(status) {
  // Bail out on a failed load: otherwise the code below would throw and
  // phantom.exit() would never run, leaving the process hanging.
  if (status !== 'success') {
    console.log('Failed to load ' + url + ' (status: ' + status + ')');
    phantom.exit(1);
    return;
  }
  // list all the a.href links in the hello kitty etsy page
  // Only serializable values can cross the evaluate() boundary, so the
  // NodeList is mapped to an array of plain href strings inside the page.
  var links = page.evaluate(function() {
    return [].map.call(document.querySelectorAll('a.listing-thumb'), function(link) {
      return link.getAttribute('href');
    });
  });
  // Guard against a null/undefined result (e.g. the page-side function failed)
  // so that join() cannot throw before we exit.
  console.log((links || []).join('\n'));
  phantom.exit();
});
Run Code Online (Sandbox Code Playgroud)
注意:这里我们使用[].map.call(),以便将NodeList当作标准Array来处理.
您的代码唯一的问题在于没有理解 PhantomJS 的作用域:这里存在 phantom 作用域和页面作用域。您试图把 JavaScript DOM 对象引用(无法序列化)从页面作用域(page.evaluate 在页面作用域中运行)返回到 phantom 主作用域,我认为这是不可能的。下面是可以正常工作的代码:
// Loads the Etsy search results for "hello kitty", logs the response status
// and headers of the main request for debugging, then prints every
// a.listing-thumb link URL with its index and exits PhantomJS.
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';

// for debug (to see if page returns status code 200)
page.onResourceReceived = function(response) {
  // Only report on the main document, not on every sub-resource.
  if (response.url === url) {
    console.log('Resorce: "' + response.url + '" status: ' + response.status);
    if (response.status === 200) {
      console.log(response.url);
      for (var i = 0; i < response.headers.length; i++) {
        console.log(response.headers[i].name + ': ' + response.headers[i].value);
      }
    }
  }
};

page.onLoadFinished = function(status) {
  console.log('Status: ' + status);
  // Nothing useful can be scraped from a page that failed to load; exit with
  // a non-zero code instead of evaluating against an empty document.
  if (status !== 'success') {
    console.log('Failed to load ' + url);
    phantom.exit(1);
    return;
  }
  console.log('Starting evaluate...');
  // Runs inside the page context: DOM nodes cannot be passed back, so the
  // href strings are collected into a plain array.
  var links = page.evaluate(function() {
    var nodes = [],
      matches = document.querySelectorAll("a.listing-thumb");
    for (var i = 0; i < matches.length; ++i) {
      nodes.push(matches[i].href);
    }
    return nodes;
  });
  // Compute the count null-safely: reading links.length directly would throw
  // on a null result before phantom.exit() is reached.
  var count = links ? links.length : 0;
  console.log('Done evaluate... count: ' + count);
  if (count > 0) {
    for (var i = 0; i < count; ++i) {
      console.log('(' + i + ') ' + links[i]);
    }
  } else {
    console.log("No match found!");
  }
  phantom.exit(0);
};

page.open(url);
Run Code Online (Sandbox Code Playgroud)