Tan*_*Tat 15 javascript node.js node-request
我使用 Node.js 的 async 和 request 模块抓取一亿多个网站，但脚本运行几分钟后就会不断遇到 ESOCKETTIMEDOUT 和 ETIMEDOUT 错误。
重新启动脚本后它又能正常工作。这似乎不是连接数限制的问题，因为出错时我仍然可以正常执行 resolve4、resolveNs、resolveMx，curl 也没有任何延迟。
你看出代码有什么问题吗？或者有什么建议？我想把 async.queue() 的并发数提高到至少 1000。谢谢。
var request = require('request'),
async = require('async'),
mysql = require('mysql'),
dns = require('dns'),
url = require('url'),
cheerio = require('cheerio'),
iconv = require('iconv-lite'),
charset = require('charset'),
config = require('./spy.config'),
pool = mysql.createPool(config.db);
iconv.skipDecodeWarning = true;
/**
 * Gathers the NS, A and MX records of a domain.
 *
 * The three lookups run in parallel. A failed lookup is not treated as an
 * error: its entry in the result simply stays an empty array.
 *
 * @param {string} domain - Bare domain name (without the "www." prefix).
 * @param {function(Error|null, {ns: Array, ip: Array, mx: Array})} done -
 *   Called once all three lookups have completed.
 */
function lookupDnsRecords(domain, done) {
    var records = { ns: [], ip: [], mx: [] };

    async.parallel([
        function (callback) {
            // Resolves the domain's name server records
            dns.resolveNs(domain, function (err, addresses) {
                if (!err) {
                    records.ns = addresses;
                }
                callback();
            });
        },
        function (callback) {
            // Resolves the domain's IPV4 addresses
            dns.resolve4(domain, function (err, addresses) {
                if (!err) {
                    records.ip = addresses;
                }
                callback();
            });
        },
        function (callback) {
            // Resolves the domain's MX records
            dns.resolveMx(domain, function (err, addresses) {
                if (!err) {
                    addresses.forEach(function (a) {
                        records.mx.push(a.exchange);
                    });
                }
                callback();
            });
        }
    ], function (err) {
        done(err || null, records);
    });
}

/**
 * Crawl queue: processes up to 200 tasks concurrently.
 *
 * Each task is an object with a `domain` property. The worker first resolves
 * "www.<domain>"; if that succeeds it fetches "http://www.<domain>", and if the
 * fetch fails it collects the domain's NS/A/MX records instead.
 *
 * The task callback `cb` is invoked exactly once, and only after all work for
 * the task (including the follow-up DNS lookups) has finished, so the queue's
 * concurrency limit covers every outstanding lookup.
 */
var queue = async.queue(function (task, cb) {
    dns.resolve4('www.' + task.domain, function (err) {
        if (err) {
            //
            // Do something
            //
            setImmediate(cb);
            return;
        }

        request({
            url: 'http://www.' + task.domain,
            method: 'GET',
            encoding: 'binary',
            followRedirect: true,
            // The original literal also had `pool: false` before this key; the
            // later duplicate key always won, so only the effective value is kept.
            pool: { maxSockets: 1000 },
            timeout: 15000 // 15 sec
        }, function (error, response, body) {
            //console.info(task);
            if (!error) {
                // If ok, do something
                setImmediate(cb);
                return;
            }

            // If not ok, do these
            console.log(error);
            // It keeps erroring here after few minutes, resolve4, resolveNs, resolveMx still work here.
            // { [Error: ETIMEDOUT] code: 'ETIMEDOUT' }
            // { [Error: ESOCKETTIMEDOUT] code: 'ESOCKETTIMEDOUT' }
            lookupDnsRecords(task.domain, function (dnsErr, records) {
                if (dnsErr) {
                    // Previously this path called an undefined `next`, which would
                    // have thrown a ReferenceError; report the error instead.
                    console.log(dnsErr);
                } else {
                    // do something with records.ns / records.ip / records.mx
                }
                setImmediate(cb);
            });
        });
    });
}, 200);
// Once the queue has run dry, defer to the next event-loop turn and let
// checkDone() decide whether another batch should be loaded.
queue.drain = function onQueueDrained() {
    setImmediate(function runCheckDone() {
        checkDone();
    });
};
/**
 * Logging hook that currently does nothing: its only statement,
 * a console.info(msg) call, is commented out.
 *
 * @param {*} msg - Message that would be logged.
 */
function consoleLog(msg) {
    // console.info(msg);
}
/**
 * Schedules the next crawl batch when no tasks are waiting in the queue;
 * otherwise only logs that the queue is not empty.
 */
function checkDone() {
    var waiting = queue.length();

    if (waiting != 0) {
        console.log("checkDone() not zero");
        return;
    }

    setImmediate(function loadNextBatch() {
        crawlQueue();
    });
}
/**
 * Runs a fire-and-forget SQL statement on a connection taken from `pool`.
 *
 * No result is handed back to the caller. Failures to obtain a connection or
 * to execute the statement are reported on stderr instead of being dropped
 * silently, and the connection is always returned to the pool.
 *
 * @param {string} sql - SQL statement to execute.
 */
function query(sql) {
    pool.getConnection(function (connErr, connection) {
        if (connErr) {
            console.error('query(): could not get a connection:', connErr);
            return;
        }
        //console.log(sql);
        connection.query(sql, function (queryErr) {
            // Release first so the connection goes back to the pool whether or
            // not the statement succeeded.
            connection.release();
            if (queryErr) {
                console.error('query(): statement failed:', queryErr);
            }
        });
    });
}
/**
 * Loads the next batch of up to 500 rows from the `domain` table and pushes
 * one task ({ id, domain }) per row onto `queue`.
 *
 * Rows are selected when `last_update` is older than 2592000 seconds
 * (30 * 24 * 3600, i.e. 30 days) before the current UNIX timestamp.
 *
 * - If no connection can be obtained or the query fails, the error is logged
 *   and the function is retried on the next event-loop turn.
 * - If the query returns no rows, the process exits.
 */
function crawlQueue() {
    pool.getConnection(function (connErr, connection) {
        if (connErr) {
            console.error('crawlQueue(): could not get a connection:', connErr);
            setImmediate(crawlQueue);
            return;
        }

        // The original statement lacked the WHERE keyword, which made it invalid
        // SQL: every call failed and was silently retried forever.
        var sql = "SELECT * FROM domain WHERE last_update < (UNIX_TIMESTAMP() - 2592000) LIMIT 500";

        connection.query(sql, function (queryErr, results) {
            // Return the connection to the pool on every path before going on.
            connection.release();

            if (queryErr) {
                console.error('crawlQueue(): query failed:', queryErr);
                setImmediate(crawlQueue);
                return;
            }

            if (!results.length) {
                // Nothing left to crawl.
                process.exit();
                return;
            }

            results.forEach(function (row) {
                queue.push({ "id": row['id'], "domain": row['domain'] });
            });
        });
    });
}
// Kick off the first crawl batch on the next turn of the event loop.
setImmediate(function startCrawling() {
    crawlQueue();
});
Run Code Online (Sandbox Code Playgroud)
系统限制非常高.
Limit Soft Limit Hard Limit Units
Max cpu time unlimited unlimited seconds
Max file size unlimited unlimited bytes
Max data size unlimited unlimited bytes
Max stack size 8388608 unlimited bytes
Max core file size 0 unlimited bytes
Max resident set unlimited unlimited bytes
Max processes 257645 257645 processes
Max open files 500000 500000 files
Max locked memory 65536 65536 bytes
Max address space unlimited unlimited bytes
Max file locks unlimited unlimited locks
Max pending signals 257645 257645 signals
Max msgqueue size 819200 819200 bytes
Max nice priority 0 0
Max realtime priority 0 0
Max realtime timeout unlimited unlimited us
Run Code Online (Sandbox Code Playgroud)
sysctl 设置：
net.ipv4.ip_local_port_range = 10000 61000
Run Code Online (Sandbox Code Playgroud)
Mot*_*tys 17
默认情况下，Node 只有 4 个工作线程用于解析 DNS 查询。如果 DNS 查询耗时很长，请求就会阻塞在 DNS 阶段，表现出来的症状正是 ESOCKETTIMEDOUT 或 ETIMEDOUT。
尝试增加你的uv线程池大小:
export UV_THREADPOOL_SIZE=128
node ...
Run Code Online (Sandbox Code Playgroud)
或者在 index.js（或你的程序入口文件）中设置：
#!/usr/bin/env node
// Set the UV_THREADPOOL_SIZE environment variable to 128 from inside the
// script, as an alternative to exporting it in the shell before launching node.
// NOTE(review): presumably this has to run before anything uses the libuv
// thread pool for it to take effect — confirm against the Node.js docs.
process.env.UV_THREADPOOL_SIZE = 128;
// Placeholder for the application's entry point; the body is elided here.
function main() {
...
}
Run Code Online (Sandbox Code Playgroud)
我遇到了同样的问题。阅读相关讨论后，我通过在 request 选项中使用 “agent: false” 解决了这个问题。
2017-10-31 更新：上面的原始回答似乎并未完全解决问题。我们最终找到的解决方案是在 agent 中使用 keepAlive 选项。例如：
// Shared HTTPS agent created with keepAlive enabled.
var pool = new https.Agent({ keepAlive: true });

/**
 * Builds the request options for a JSON GET of the given URL, using the
 * shared `pool` agent.
 *
 * @param {string} _url - Target URL.
 * @returns {{url: string, method: string, agent: object, json: boolean}}
 */
function getJsonOptions(_url) {
    var options = {};
    options.url = _url;
    options.method = 'GET';
    options.agent = pool;
    options.json = true;
    return options;
}
Run Code Online (Sandbox Code Playgroud)
Node 的默认连接池似乎默认 keepAlive = false，这会导致每个请求都创建一个新连接。如果在短时间内创建了过多连接，就会出现上述错误。我的猜测是，通往目标服务的路径上的一个或多个路由器可能会拦截连接请求，大概是因为怀疑遭到“拒绝服务”攻击。无论如何，上面的代码示例完全解决了我们的问题。
| 归档时间: |
|
| 查看次数: |
22601 次 |
| 最近记录: |