ama*_*ary 6 javascript node.js puppeteer
我正在使用puppeteer-extra和 node.js 遍历多个 url。
我试图在每次迭代时拦截某些资源类型、阻止它们加载,但收到了以下错误。
PS C:\Users\someuser\Desktop\Project> node temp.js
-- running
C:\Users\someuser\node_modules\puppeteer\lib\cjs\puppeteer\common\assert.js:26
throw new Error(message);
^
Error: Request is already handled!
at Object.exports.assert (C:\Users\someuser\node_modules\puppeteer\lib\cjs\puppeteer\common\assert.js:26:15)
at HTTPRequest.continue (C:\Users\someuser\node_modules\puppeteer\lib\cjs\puppeteer\common\HTTPRequest.js:217:21)
at PuppeteerBlocker.onRequest (C:\Users\someuser\node_modules\@cliqz\adblocker-puppeteer\dist\cjs\adblocker.js:225:33)
at BlockingContext.onRequest (C:\Users\someuser\node_modules\@cliqz\adblocker-puppeteer\dist\cjs\adblocker.js:64:47)
at C:\Users\someuser\node_modules\puppeteer\lib\cjs\vendor\mitt\src\index.js:51:62
at Array.map (<anonymous>)
at Object.emit (C:\Users\someuser\node_modules\puppeteer\lib\cjs\vendor\mitt\src\index.js:51:43)
at Page.emit (C:\Users\someuser\node_modules\puppeteer\lib\cjs\puppeteer\common\EventEmitter.js:72:22)
at C:\Users\someuser\node_modules\puppeteer\lib\cjs\puppeteer\common\Page.js:143:100
at C:\Users\someuser\node_modules\puppeteer\lib\cjs\vendor\mitt\src\index.js:51:62
Run Code Online (Sandbox Code Playgroud)
我不明白:实际的请求是在 for 循环中由 page.goto 发起的,为什么此时该请求会"已经被处理"?有人能给点提示吗?
这是完整的项目
const puppeteer = require( 'puppeteer-extra' );
const StealthPlugin = require( 'puppeteer-extra-plugin-stealth' );
puppeteer.use( StealthPlugin() );
const AdblockerPlugin = require( 'puppeteer-extra-plugin-adblocker' );
puppeteer.use( AdblockerPlugin( { blockTrackers: true } ) );
// Reproduction script from the question: launches a headless browser, enables request
// interception on a single page, then visits 20 URLs one after another, logging each
// page title (or false when the title contains '404') together with per-iteration timings.
puppeteer.launch( { headless: true } ).then( async browser => {
console.log( '--\xa0running' );
console.time( '--\xa0process' );
const page = await browser.newPage();
await page.setRequestInterception( true );
// NOTE(review): the stack trace quoted with this script shows the adblocker plugin's own
// request handler calling HTTPRequest.continue() — presumably that handler and the one
// registered below both try to resolve the same request; confirm against the plugin.
page.on( 'request', ( request ) => {
// NOTE(review): the indexOf() result is used directly as a boolean. It is 0 (falsy) only
// for 'image' and non-zero (truthy, including -1 for "not found") for everything else, so
// this aborts every request whose type is not 'image'. Comparing against -1 gives the
// filter the list suggests.
if ( [ 'image', 'stylesheet', 'font', 'script' ].indexOf( request.resourceType() ) ) {
request.abort();
} else {
request.continue();
};
} );
// Pages are visited sequentially; each goto() is awaited before the next iteration starts.
for ( var i = 1; i <= 20; i++ ) {
console.time( '--\xa0iteration\xa0' + i ); // start the timer for this iteration
await page.goto( 'https://www.someurl.it/shop/s%2D' + i, { waitUntil: 'load' } );
const title = await page.title();
// Prints false when the title contains '404', otherwise prints the title itself.
console.log( title.includes( '404' ) ? false : title );
console.timeEnd( '--\xa0iteration\xa0' + i ); // stop and print the timer for this iteration
};
await browser.close();
console.timeEnd( '--\xa0process' );
console.log( '--\xa0ending' );
} );
Run Code Online (Sandbox Code Playgroud)
小智 5
添加 return 语句解决了我的问题。
// Abort requests for the listed resource types; let every other request continue.
page.on( 'request', ( request ) => {
    const blockedTypes = [ 'image', 'stylesheet', 'font', 'script' ];
    if ( ! blockedTypes.includes( request.resourceType() ) ) {
        // Not a blocked type: hand the request back to the browser untouched.
        request.continue();
        return;
    }
    // Returning here guarantees continue() is never reached for an aborted request.
    return request.abort();
} );
Run Code Online (Sandbox Code Playgroud)
资源拦截必须针对每个新页面单独设置。
以下是您可以拦截的资源的完整列表:stylesheet, image, media, font, script, texttrack, xhr, fetch, eventsource, websocket, manifest, other。
注意:
大多数时候,拦截所有资源可能会对您的抓取工具产生负面影响。
我建议仅拦截 image、media 和 font。(在某些情况下,拦截 stylesheet 可能会影响 puppeteer 的点击操作。)
/**
* Puppeteer, Headless Chrome Node.js API
*
* @link https://github.com/puppeteer/puppeteer
*
* @package npm install puppeteer
*/
const puppeteer = require( 'puppeteer' );
/**
 * Turn on request interception for `page` and register a 'request' handler that
 * aborts image, media and font requests and lets every other request continue.
 *
 * @see https://stackoverflow.com/a/47166637/3645650
 *
 * @param {object} page - Page object exposing `setRequestInterception()` and `on()`.
 * @returns {Promise<void>} Resolves once interception is enabled and the handler is attached.
 */
const brewery = async ( page ) => {
    // 'stylesheet' is intentionally absent: it was commented out in the original list.
    const blockedTypes = [ 'image', 'media', 'font' ];

    await page.setRequestInterception( true );

    page.on( 'request', ( r ) => {
        if ( blockedTypes.includes( r.resourceType() ) ) {
            r.abort();
            return;
        }
        r.continue();
    } );
};
/**
 * Entry point: launches a headless browser, applies the `brewery` request filter to a
 * new page, loads the GitHub login page, saves a timestamp-named PNG screenshot, and
 * logs the total runtime in seconds after the browser has been closed.
 */
( async () => {
    // ... start
    const start = Date.now();
    console.log( '--\xa0process:\xa0start' );

    const browser = await puppeteer.launch( {
        headless: true
    } );

    try {
        const page = await browser.newPage();
        await brewery( page );
        await page.goto( 'https://github.com/login' );
        await page.screenshot( { path: Date.now() + '.png' } );
        console.log( '--\xa0process:\xa0screenshot' );
    } finally {
        // Close the browser even when navigation or the screenshot fails, so a
        // failed run does not leave the browser process behind.
        await browser.close();
    }

    // ... end
    const end = ( Date.now() - start ) / 1000;
    console.log( '--\xa0process:\xa0end,\xa0runtime\xa0' + end + '\xa0seconds' );
} )();
Run Code Online (Sandbox Code Playgroud)