Al *_* R. 7 node.js google-chrome-headless puppeteer
我有一个简单的 Node.js 脚本,用来捕获一些网页的屏幕截图。看来我在使用 async/await 的过程中出了错,但我不知道问题出在哪里。我目前使用的是 puppeteer v1.11.0。
const puppeteer = require('puppeteer');
// Question script: launches a browser, then for each entry in `papers` opens a
// page and writes a PDF of it when the page emits its first console message.
// The script as posted fails with the error quoted at the bottom.
//a list of sites to screenshot (key = base name of the PDF, value = URL)
const papers =
{
nytimes: "https://www.nytimes.com/",
wapo: "https://www.washingtonpost.com/"
};
//launch puppeteer, do everything in .then() handler
// NOTE(review): the handler is a plain (non-async) function, so nothing in it can be awaited.
puppeteer.launch({devtools:false}).then(function(browser){
//create a load_page function that returns a promise which resolves when screenshot is taken
// `paper` is a key of `papers`; the PDF is written to "<paper>.pdf".
async function load_page(paper){
const url = papers[paper];
// NOTE(review): `reject` is never called, so an error thrown inside this async
// executor is not forwarded to the promise returned by load_page().
return new Promise(async function(resolve, reject){
const page = await browser.newPage();
await page.setViewport({width:1024, height: 768});
//screenshot on first console message (the output is actually a PDF via page.pdf)
page.once("console", async console_msg => {
await page.pdf({path: paper + '.pdf',
printBackground:true,
width:'1024px',
height:'768px',
margin: {top:"0px", right:"0px", bottom:"0px", left:"0px"}
});
//close page
// NOTE(review): this can run while the page.goto() call below is still pending.
await page.close();
//resolve promise (the only place the outer promise is settled)
resolve();
});
//go to page; the listener above is registered first so no console message is missed
await page.goto(url, {"waitUntil":["load", "networkidle0"]});
})
}
//step through the list of papers, calling the above load_page()
async function stepThru(){
for(var p in papers){
if(papers.hasOwnProperty(p)){
//wait to load page and screenshot before loading next page
await load_page(p);
}
}
//close browser after loop has finished (and all promises resolved)
await browser.close();
}
//kick it off
// NOTE(review): the promise returned by stepThru() is neither awaited nor given a rejection handler.
stepThru();
//getting this error message:
//UnhandledPromiseRejectionWarning: Error: Navigation failed because browser has disconnected!
});
Run Code Online (Sandbox Code Playgroud)
`Navigation failed because browser has disconnected` 错误通常意味着启动 Puppeteer 的 Node 脚本在没有等待 Puppeteer 操作完成的情况下就结束了。因此,正如您所说的,这是一个与等待(await)有关的问题。
关于您的脚本,我进行了一些更改以使其起作用:
1-首先,您没有等待(异步)函数 stepThru 执行结束,因此请将
stepThru();
Run Code Online (Sandbox Code Playgroud)
至
await stepThru();
Run Code Online (Sandbox Code Playgroud)
和
puppeteer.launch({devtools:false}).then(function(browser){
Run Code Online (Sandbox Code Playgroud)
至
puppeteer.launch({devtools:false}).then(async function(browser){
Run Code Online (Sandbox Code Playgroud)
(我加了async)
2-我更改了您管理 page.goto 和 page.once 这两个 Promise 的方式
生成 PDF 的 Promise 现在是
new Promise(async function(resolve, reject){
//screenshot on first console message
page.once("console", async () => {
await page.pdf({path: paper + '.pdf', printBackground:true, width:'1024px', height:'768px', margin: {top:"0px", right:"0px", bottom:"0px", left:"0px"} });
resolve();
});
})
Run Code Online (Sandbox Code Playgroud)
它只负责PDF的创建。
3-然后我通过 Promise.all 同时等待 page.goto 和上面生成 PDF 的 Promise
await Promise.all([
page.goto(url, {"waitUntil":["load", "networkidle2"]}),
new Promise(async function(resolve, reject){
// ... pdf creation as above
})
]);
Run Code Online (Sandbox Code Playgroud)
4-我把 page.close 移到了 Promise.all 之后
await Promise.all([
// page.goto
// PDF creation
]);
await page.close();
resolve();
Run Code Online (Sandbox Code Playgroud)
现在它可以正常工作了,下面是完整的可运行脚本:
const puppeteer = require('puppeteer');
// Sites to capture: key = base name of the PDF that is written, value = URL.
// NOTE(review): the accompanying text reports TOO_MANY_REDIRECTS for the
// "www." Washington Post host; the URL is kept exactly as posted.
const papers = {
  nytimes: "https://www.nytimes.com/",
  wapo: "https://www.washingtonpost.com/"
};

/**
 * Writes a PDF of `page` as soon as the page emits its first console message.
 *
 * An explicit Promise is used only to adapt the event-emitter callback; a
 * failure of page.pdf() rejects the promise instead of leaving it pending.
 *
 * @param {object} page - Puppeteer page to print.
 * @param {string} pdfPath - Destination path of the PDF file.
 * @returns {Promise<void>} Settles once the PDF has been written or failed.
 */
function pdfOnFirstConsoleMessage(page, pdfPath) {
  return new Promise(function (resolve, reject) {
    page.once("console", function () {
      page.pdf({
        path: pdfPath,
        printBackground: true,
        width: '1024px',
        height: '768px',
        margin: {top: "0px", right: "0px", bottom: "0px", left: "0px"}
      }).then(function () {
        resolve();
      }, reject);
    });
  });
}

// Launch puppeteer and do all the work inside the async .then() handler.
puppeteer.launch({devtools: false}).then(async function (browser) {
  /**
   * Opens `papers[paper]` in a new page and saves it as "<paper>.pdf".
   *
   * @param {string} paper - Key of the `papers` table.
   * @returns {Promise<void>} Resolves when the PDF is written and the page is
   *   closed; rejects if navigation or PDF creation fails.
   */
  async function load_page(paper) {
    const url = papers[paper];
    const page = await browser.newPage();
    try {
      await page.setViewport({width: 1024, height: 768});
      // Navigation and PDF creation are awaited together; the console
      // listener is registered synchronously right after goto() is started,
      // so the first console message cannot be missed.
      await Promise.all([
        page.goto(url, {"waitUntil": ["load", "networkidle2"]}),
        pdfOnFirstConsoleMessage(page, paper + '.pdf')
      ]);
    } finally {
      // Always release the page, even when navigation or printing failed.
      await page.close();
    }
  }

  /**
   * Processes every paper sequentially, then closes the browser.
   *
   * @returns {Promise<void>}
   */
  async function stepThru() {
    try {
      for (const paper of Object.keys(papers)) {
        // Finish one page (load + PDF) before starting the next one.
        await load_page(paper);
      }
    } finally {
      // Close the browser even if one of the pages failed.
      await browser.close();
    }
  }

  await stepThru();
}).catch(function (err) {
  // Report failures explicitly instead of an UnhandledPromiseRejectionWarning.
  console.error(err);
  process.exitCode = 1;
});
Run Code Online (Sandbox Code Playgroud)
请注意:- 我把 networkidle0 改成了 networkidle2,因为 nytimes.com 网站需要很长时间才能达到 0 个网络请求的状态(由于广告等原因)。您当然可以继续等待 networkidle0,但这取决于您,并且超出了您问题的范围(在这种情况下,需要增加 page.goto 的超时时间)。- www.washingtonpost.com 网站出现 TOO_MANY_REDIRECTS 错误,因此我改用了 washingtonpost.com,但我认为您应该对此做进一步调查。为了测试脚本,我用 nytimes 网站和其他网站多次运行了它。再说一次:这超出了您的问题范围。
让我知道您是否需要更多帮助
| 归档时间: |
|
| 查看次数: |
6123 次 |
| 最近记录: |