Puppeteer 因生成大型 pdf 而挂起

bmi*_*rek 5 pdf chromium node.js google-chrome-headless puppeteer

我需要使用 puppeteer/chromium 把 HTML 转换为 PDF,生成的 PDF 文件可能多达 2000 多页。

目前,我有以下配置:

browser.js:

const p = require("puppeteer");

// Platform switch used when building the launch options: Linux versus everything else.
const isLinux = "linux" === process.platform;

// Absolute paths of the browser executables for each platform.
const LINUX_CHROMIUM = "/usr/bin/chromium-browser";
const WINDOWS_CHROME = "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe";

module.exports = async ({ port, host }) => {
    const options = isLinux
        ? {
            headless: true,
            executablePath: LINUX_CHROMIUM,
            args: [
                "--no-sandbox",
                "--disable-gpu",
                "--window-size=1200,1200",
                "--disable-dev-shm-usage",
                "--unlimited-storage",
                "--full-memory-crash-report"
            ],
            userDataDir: "/usr/cache",
        }
        : {
            headless: true,
            executablePath: WINDOWS_CHROME,
            args: [
                "--window-size=1200,1200",
                "--disable-dev-shm-usage",
                "--unlimited-storage",
                "--full-memory-crash-report"
            ],
        };

    return await p.launch(options);
}
Run Code Online (Sandbox Code Playgroud)

server.js:

const express = require("express");
const uuid = require("uuid");
const fs = require("fs");
const path = require("path");

module.exports = async function start({ browser = null, port = 80 } = {}) {
    if (!browser) {
        throw new Error(`no browser`);
    }

    try {
        const page = await browser.newPage();
        page.close();
    } catch {
        throw new Error(`browser not working`);
    }

    const pdfFolder = path.resolve(__dirname, "./pdf");
    fs.existsSync(pdfFolder) || fs.mkdirSync(pdfFolder);

    const app = express();
    app.use(require("body-parser").text({limit: '60mb', extended: true}));
    app.use('/static', express.static(path.join(__dirname, 'static')))

    app.post("/print", async (req, res) => {
        try {
            const id = uuid.v1();
            console.time(`${id} print`);
            const html = req.body;

            const page = await browser.newPage();
            await page.setDefaultNavigationTimeout(0);
            console.time(`${id} goto page`);
            await  page.setContent(html, { waitUntil: "networkidle2" });
            console.timeEnd(`${id} goto page`);
            console.time(`${id} pdf`);
            await page.pdf({ path: path.resolve(pdfFolder, `${id}.pdf`), format: "A4" });
            console.timeEnd(`${id} pdf`);
            await page.close();

            console.timeEnd(`${id} print`);
            res.redirect(`/pdf/${id}.pdf`);
        } catch (error) {
            res.status(500);
            res.json({ message: error.message, stack: error.stack });
        }
    });

    app.get("/pdf/:file", (req, res) => {
        const id = req.params.file;
        const file = path.resolve(pdfFolder, id);
        if (fs.existsSync(file)) {
            console.time(`${id} download`);
            res.status(200);
            res.download(file);
            console.timeEnd(`${id} download`);
            res.once("finish", () => fs.unlinkSync(file));
        } else {
            res.status(404);
            res.json({ message: `no file found` });
        }
    });

    console.log(`starting server on port ${port}`);

    return app.listen(port);
};
Run Code Online (Sandbox Code Playgroud)

init.js:

const yargs = require("yargs");

const createBrowser = require("./browser");
const createServer = require("./server");

/**
 * Entry point: parses the CLI options, launches the browser, starts the HTTP
 * server and registers shutdown handlers that close both again.
 *
 * CLI options (with defaults): `--chromeHost` ("127.0.0.1"), `--chromePort` (9222),
 * `--serverPort` (80).
 */
(async function init() {
    const { chromeHost, chromePort, serverPort } = yargs
        .string("chromeHost")
        .number("chromePort")
        .number("serverPort")
        .default("chromeHost", "127.0.0.1")
        .default("chromePort", 9222)
        .default("serverPort", 80)
        .argv;

    const browser = await createBrowser({ host: chromeHost, port: chromePort });

    let server;
    try {
        server = await createServer({ browser, port: serverPort });
    } catch (error) {
        // Do not leave the browser process behind when the server cannot start.
        try {
            await browser.close();
        } catch (closeError) {
            console.error(`failed to close browser: ${closeError.message}`);
        }
        throw error;
    }

    // "beforeExit" is only emitted once the event loop is empty, which never
    // happens while the HTTP server is listening, and it is not emitted on
    // signal termination either. Handle the termination signals explicitly so
    // the cleanup actually runs (e.g. on Ctrl+C or `docker stop`).
    let isShuttingDown = false;
    const shutdown = async (signal) => {
        if (isShuttingDown) {
            return;
        }
        isShuttingDown = true;
        console.log(`received ${signal}, shutting down`);

        server.close();
        try {
            await browser.close();
        } catch (error) {
            console.error(`failed to close browser: ${error.message}`);
            process.exitCode = 1;
        }
        // Exit explicitly: open keep-alive connections could otherwise keep
        // the process running after server.close().
        process.exit();
    };

    for (const signal of ["SIGINT", "SIGTERM"]) {
        process.on(signal, () => shutdown(signal));
    }
})().catch((error) => {
    // Report start-up failures instead of leaving an unhandled rejection.
    console.error(error);
    process.exitCode = 1;
});
Run Code Online (Sandbox Code Playgroud)

对于大约 650 页的 PDF,生成需要 3 分钟(HTML 输入:14.5 MB,setContent 耗时 30 秒,pdf 耗时 2 分 30 秒)。我需要生成最多 2000 页的 PDF 文件,但此时 puppeteer/chromium 会挂起。此外,我还需要提升 650 页 PDF 的生成性能——3 分钟太长了。该服务运行在 Docker 中。

应该调整哪些配置(内存/CPU)才能获得更好的性能?它可以部署在云端运行(目前运行在本地机器或 VPS 上)。

我看过《使用 Google Puppeteer 将大文件的 HTML 转换为 PDF》这个问题,但其中没有给出加速 PDF 生成的解决方案。