stk*_*flw 7 node.js web-scraping x-ray
以下脚本在我的 Node.js 服务器上运行良好,但是当我尝试抓取一些西里尔文(Cyrillic)网站时,它会返回如下所示的编码错误(乱码)的响应.
脚本
// Scrape the page at `url` for title, description and image candidates, then
// group the candidates by kind in `firstData`.
x(url, {
  name: 'title',
  ogDescription: 'meta[property="og:description"]@content',
  metaDescription: 'meta[name="description"]@content',
  ogImage: 'meta[property="og:image"]@content',
  // Fixed: the selector used to be `meta[name="name="twitter:image:src""]`,
  // which is not a valid attribute selector (duplicated `name="` and
  // unbalanced quotes).
  twitterImage: 'meta[name="twitter:image:src"]@content',
  metaImage: 'meta[name="image"]@content',
  headImage: 'head img@src',
  contentImage_1: '.content img@src',
  contentImage_2: '.image img@src'
})(function (err, obj) {
  // Without this guard a failed request would crash on `obj.name` below.
  if (err) {
    console.error(err);
    return;
  }

  // Each list is ordered as in the original snippet.
  var firstData = {
    name: [
      obj.name
    ],
    description: [
      obj.metaDescription,
      obj.ogDescription
    ],
    image: [
      obj.ogImage,
      obj.twitterImage,
      obj.metaImage,
      obj.headImage,
      obj.contentImage_1,
      obj.contentImage_2
    ]
  };
  // NOTE(review): the snippet as posted stopped here, before the callback was
  // closed; it is closed below so that the script parses.
});
Run Code Online (Sandbox Code Playgroud)
编码错误的响应示例
firstData { name: [ '(Rock, Pop) [15LP] [24/96] Queen - Studio Collection - 2015,
FLAC (tracks) :: RuTracker.org' ],
description:
[ 'RuTracker.org » ?????????? ??? (??????????? ?????????) »
??????? ??????? (Rock, Pop) [15LP] [24/96] Queen -
Studio Collection - 2015, FLAC (tracks)',
undefined ],
image: [ undefined, undefined, undefined, undefined, undefined, undefined ] }
Run Code Online (Sandbox Code Playgroud)
我该如何解决?
您可以使用 request 作为 x-ray 的驱动程序,并用 iconv 对响应正文进行转码(从 Windows-1251 转为 UTF-8),如下所示:
// Custom x-ray driver: downloads the page with `request` and transcodes the
// body from Windows-1251 to UTF-8 before handing it to x-ray.

// 'binary' makes `request` return the body as a one-character-per-byte string
// instead of decoding it as UTF-8, so the original bytes can be recovered.
const options = { encoding: 'binary' };

// Construct the converter with an explicit `new Iconv(...)`. The original
// `new require('iconv').Iconv(...)` applied `new` to `require` rather than to
// `Iconv`, and stored the result in an undeclared (implicitly global) variable.
const Iconv = require('iconv').Iconv;
const iconv = new Iconv('Windows-1251', 'utf8');

/**
 * Convert a Windows-1251 response body into a UTF-8 string.
 *
 * @param {string|Buffer} body - Body as a 'binary'-encoded string or a Buffer.
 * @returns {string|*} The converted string; a falsy `body` is returned as is.
 */
function conv(body) {
  if (!body) return body;
  // `Buffer.from` is a factory function and must not be called with `new`.
  const raw = Buffer.isBuffer(body) ? body : Buffer.from(body, 'binary');
  return iconv.convert(raw).toString();
}

const request = require('request').defaults(options);

/**
 * x-ray driver that fetches `context.url` and passes the transcoded body on.
 *
 * @param {{url: string}} context - Request context supplied by x-ray.
 * @param {function(Error, string=)} callback - Receives the error or the body.
 */
function driver(context, callback) {
  request(context.url, function (err, response, body) {
    if (err) return callback(err, body);

    let converted;
    try {
      converted = conv(body);
    } catch (convErr) {
      // Report conversion failures through the callback instead of letting
      // them escape as an uncaught exception inside the request callback.
      return callback(convErr);
    }
    return callback(null, converted);
  });
}

x.driver(driver);
// Scrape the page at `url` for title, description and image candidates, group
// the candidates by kind and log the result.
x(url, {
  name: 'title',
  ogDescription: 'meta[property="og:description"]@content',
  metaDescription: 'meta[name="description"]@content',
  ogImage: 'meta[property="og:image"]@content',
  // Fixed: the selector used to be `meta[name="name="twitter:image:src""]`,
  // which is not a valid attribute selector (duplicated `name="` and
  // unbalanced quotes).
  twitterImage: 'meta[name="twitter:image:src"]@content',
  metaImage: 'meta[name="image"]@content',
  headImage: 'head img@src',
  contentImage_1: '.content img@src',
  contentImage_2: '.image img@src'
})(function (err, obj) {
  // Without this guard a failed request would crash on `obj.name` below.
  if (err) {
    console.error(err);
    return;
  }

  // Each list is ordered as in the original snippet.
  const firstData = {
    name: [
      obj.name
    ],
    description: [
      obj.metaDescription,
      obj.ogDescription
    ],
    image: [
      obj.ogImage,
      obj.twitterImage,
      obj.metaImage,
      obj.headImage,
      obj.contentImage_1,
      obj.contentImage_2
    ]
  };
  console.log(firstData);
});
Run Code Online (Sandbox Code Playgroud)