kar*_*olo 14 javascript foreach loops web-scraping casperjs
我有一个CasperJS脚本,可以从一个网页上抓取评级和日期.现在我想从同一网站下的多个页面中抓取相同的数据.在给定以下代码的情况下,如何遍历不同的子页面:
// Shared result buffers: filled by the scraping step, read again when the run finishes.
var ratings = [],
    dates = [];

// File-system module, used to save the scraped lines to disk.
var fs = require('fs');

// Browser instance with verbose debug logging; images and plugins are not loaded.
var casper = require('casper').create({
    verbose: true,
    logLevel: 'debug',
    pageSettings: {
        loadPlugins: false,
        loadImages: false
    }
});
/**
 * Collect the `title` attribute of every rating image on the current page.
 * Reads the global `document`, so it is meant to be passed to casper.evaluate().
 *
 * @returns {Array} one `title` value per matched <img>, in document order.
 */
function getRatings() {
    var selector = '#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img';
    var images = document.querySelectorAll(selector);
    var titles = [];
    for (var idx = 0; idx < images.length; idx += 1) {
        titles.push(images[idx].getAttribute('title'));
    }
    return titles;
}
/**
 * Collect the inner HTML of every review-date element on the current page.
 * Reads the global `document`, so it is meant to be passed to casper.evaluate().
 *
 * @returns {Array} one innerHTML string per matched date <span>, in document order.
 */
function getDate() {
    var selector = '#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate';
    var found = [];
    Array.prototype.forEach.call(document.querySelectorAll(selector), function (node) {
        found.push(node.innerHTML);
    });
    return found;
}
// Step 1: open the first review page and print a greeting.
casper.start('http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm', function () {
    this.echo('hi');
});

// Step 2: run both scrapers inside the page and keep the results in the shared variables.
casper.then(function () {
    ratings = this.evaluate(getRatings);
    dates = this.evaluate(getDate);
    this.echo(ratings);
});

// Final step: merge each rating with its date, save the lines to disk and quit.
casper.run(function () {
    this.echo(ratings.length + ' ratings found:');
    ratings.forEach(function (rating, pos) {
        // Turn each entry into "rating: date" and blank the date slot afterwards.
        ratings[pos] = rating + ': ' + dates[pos];
        dates[pos] = '';
    });
    this.echo(ratings);
    var content = ratings.join("\n");
    fs.write("C:/Users/Karan/Copy/tweesis/implementation/scraping/samsungratings.txt", content, 'w');
    this.echo(dates.length + ' dates found:').exit();
});
Run Code Online (Sandbox Code Playgroud)
任何帮助表示赞赏:)
Art*_* B. 28
由于存在下一页按钮,您可以使用它来递归遍历所有页面:
/**
 * Scrape the ratings and dates from the currently loaded review page, append
 * them to the output file as "rating: date" lines, then click the "next page"
 * link (when it is visible) and schedule itself again for the following page.
 *
 * Depends on names defined elsewhere in the script: `casper`, `fs`,
 * `getRatings`, `getDate` and the shared `ratings` / `dates` variables.
 */
function getRatingsAndWrite(){
    // NOTE(review): evaluate() may hand back null/undefined when the in-page
    // function yields nothing; treat that as "no items on this page".
    ratings = casper.evaluate(getRatings) || [];
    dates = casper.evaluate(getDate) || [];
    casper.echo(ratings);
    casper.echo(ratings.length + ' ratings found:');
    for (var i = 0; i < ratings.length; i++) {
        ratings[i] = ratings[i] + ': ' + dates[i];
        dates[i] = '';
    }
    casper.echo(ratings);
    // The file is opened in append mode once per page, so every line must carry
    // its own trailing newline; joining with "\n" alone would glue the last line
    // of one page onto the first line of the next page.
    var content = ratings.map(function (line) {
        return line + "\n";
    }).join("");
    fs.write("C:/Users/Karan/Copy/tweesis/implementation/scraping/samsungratings.txt", content, 'a');
    casper.echo(dates.length + ' dates found:');
    var nextLink = ".BVRRPageLink.BVRRNextPage > a";
    if (casper.visible(nextLink)) {
        // Queue the click first, then this function again as the next step.
        casper.thenClick(nextLink);
        casper.then(getRatingsAndWrite);
    } else {
        casper.echo("END");
    }
}
// Open the first review page, scrape it (which keeps following "next page"
// links), then start executing the queued steps.
casper
    .start('http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm')
    .then(getRatingsAndWrite)
    .run();
Run Code Online (Sandbox Code Playgroud)
相关的回答请参见:《CasperJS在点击按钮后解析下一页》.
这段代码可以帮助您:在一个对象数组中定义所需的URL,每个页面的选择器,并在循环中执行您想要对这些属性执行的操作.
您可以在循环中使用click方法而不是url.
// Pages to visit. Every entry carries its URL plus the selectors for the rating
// images and the review dates; both pages use the same selectors here.
var navigation = [
    'http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm',
    'yourSecondUrl, etc...'
].map(function (pageUrl) {
    return {
        url: pageUrl,
        selectorRatings: '#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img',
        selectorDate: '#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate'
    };
});

// Accumulates the text scraped from all pages.
var content = "";
// Visit every page listed in `navigation`, read the rating title and the date
// HTML from each one, and accumulate the text in `content`.
// Depends on `casper`, `navigation` and `content` being defined earlier in the script.
casper.start()
    .then(function () {
        // Queue one "open + scrape" pair of steps per navigation entry.
        navigation.forEach(function (navIndex) {
            // Open the entry's URL.
            casper.thenOpen(navIndex.url)
                // Wait until the current URL matches the entry's URL.
                // NOTE(review): probably redundant, since thenOpen() should
                // already wait for the page to load -- confirm before removing.
                .waitForUrl(navIndex.url, function () {
                    // Value of the `title` attribute for the ratings selector.
                    // NOTE(review): getElementAttribute() appears to read a
                    // single element only -- verify if all ratings are needed.
                    var ratings = this.getElementAttribute(navIndex.selectorRatings, 'title');
                    // HTML of the date selector. The property is `selectorDate`,
                    // the key the navigation entries actually define.
                    var dates = this.getHTML(navIndex.selectorDate);
                    this.echo(ratings);
                    this.echo(dates);
                    content = content + ' ' + ratings + ' ' + dates;
                });
        });
    })
    .run(function () {
        this.echo('----------- All steps done ------------\n');
        this.exit();
    });
Run Code Online (Sandbox Code Playgroud)
归档时间: |
|
查看次数: |
12916 次 |
最近记录: |