可以使用chrome无头浏览器来保存包含资源的完整网页吗?

uda*_*ran 11 google-chrome headless headless-browser google-chrome-headless

我希望使用 Chrome 无头浏览器实现"另存为"功能,以保存包含所有资源的完整网页。有没有办法做到这一点?我已经尝试过 --print-to-pdf 和 --screenshot 选项,现在想找到与"另存为"等效的选项。

小智 1

这是完全可能的,但并不容易。你必须自己承担繁重的工作。这意味着:

  1. 将所有链接的资源保存到本地目录。
  2. 将指向这些资源的所有链接重写为相对链接。
  3. 将重写的 HTML 文件保存到同一本地目录。

这是使用Playwright 的示例。(请注意,此代码是从现有项目中剪切并针对此答案进行清理的。它可能无法完美运行。)

const { webkit } = require('playwright');
const { parse } = require('node-html-parser');
const fs = require('fs-extra');
const path = require('path');
const url = require('url');

// Save the webpage and its assets to a local directory
/**
 * Save a web page and the assets it references to a local directory.
 *
 * The page is loaded in a headless WebKit browser, every `src` / `href`
 * attribute pointing at an http(s) resource is rewritten to a relative path
 * of the form `<host>/<pathname>`, the resources are downloaded under
 * `outputDir` using that same layout, and the rewritten document is written
 * to `<outputDir>/index.html`.
 *
 * Limitations: only `src` and `href` attributes are processed (no `srcset`,
 * no `url(...)` references inside CSS), and query strings are ignored when
 * deriving local file names. Asset downloads are best-effort: a failure is
 * logged and does not abort the save.
 *
 * @param {string} urlToSave - Absolute URL of the page to save.
 * @param {string} outputDir - Directory that receives `index.html` and assets.
 * @returns {Promise<void>} Resolves once the HTML and all assets are settled.
 * @throws If the browser cannot be launched, the page cannot be loaded, or
 *   `index.html` cannot be written.
 */
async function saveWebpage(urlToSave, outputDir) {
  const browser = await webkit.launch();

  // try/finally guarantees the browser process is shut down even when
  // navigation or file writing throws.
  try {
    const context = await browser.newContext();
    const page = await context.newPage();

    // Navigate to the specified URL and grab the rendered markup
    await page.goto(urlToSave);
    const html = await page.content();

    // Parse the HTML content
    const parsedHtml = parse(html);

    // A <base href> may itself be relative, so resolve it against the page
    // URL. Its href is then dropped: the links are rewritten relative to the
    // saved index.html, and a leftover <base> would redirect them elsewhere
    // (and would otherwise be mistaken for an asset by the 'href' pass).
    let baseUrl = urlToSave;
    const baseTag = parsedHtml.querySelector('base');
    const baseHref = baseTag ? baseTag.getAttribute('href') : undefined;
    if (baseHref) {
      try {
        baseUrl = new URL(baseHref, urlToSave).href;
      } catch {
        baseUrl = urlToSave; // malformed <base href>: fall back to the page URL
      }
      baseTag.removeAttribute('href');
    }

    const assetUrls = new Set();
    const assetDownloadPromises = [];

    // Fetch one asset and return its content as a Buffer, or undefined when
    // the fetch fails or yields no response.
    async function fetchAsset(originalUrl) {
      let assetPage;
      try {
        assetPage = await context.newPage();
        const response = await assetPage.goto(originalUrl, { waitUntil: 'networkidle' });
        if (!response) return undefined;
        // Playwright exposes the payload via body(); buffer() is Puppeteer API.
        return await response.body();
      } catch (error) {
        console.error(`Error fetching asset: ${originalUrl} - ${error.message}`);
        return undefined;
      } finally {
        // Each asset gets its own page; close it so pages do not pile up.
        if (assetPage) await assetPage.close();
      }
    }

    // Download one asset and write it to disk; failures are logged, not thrown.
    async function downloadAsset(assetUrl, localPath) {
      try {
        const buffer = await fetchAsset(assetUrl);
        if (buffer) await fs.outputFile(localPath, buffer);
      } catch (error) {
        console.error(`Error saving asset: ${assetUrl} - ${error.message}`);
      }
    }

    // Rewrite every occurrence of the given attribute to a local relative
    // path and schedule the download of the resource it points to.
    function processAttribute(attributeName) {
      for (const element of parsedHtml.querySelectorAll(`[${attributeName}]`)) {
        const originalUrl = element.getAttribute(attributeName);

        // Empty values and same-document anchors are not assets.
        if (!originalUrl || originalUrl.startsWith('#')) continue;

        let assetUrl;
        try {
          assetUrl = new URL(originalUrl, baseUrl);
        } catch {
          continue; // unparsable URL: leave the attribute untouched
        }

        // Skips data:, mailto:, javascript:, tel:, blob:, ... which cannot be
        // downloaded to a file.
        if (assetUrl.protocol !== 'http:' && assetUrl.protocol !== 'https:') continue;

        // The fragment is irrelevant for fetching and for de-duplication.
        assetUrl.hash = '';

        // A path ending in '/' names a directory; store it as index.html so
        // that it can be written as a file.
        const pathname = assetUrl.pathname.endsWith('/')
          ? `${assetUrl.pathname}index.html`
          : assetUrl.pathname;

        // URLs inside HTML always use forward slashes, whatever the platform.
        const relativePath = path.posix.join(assetUrl.host, pathname);
        const localPath = path.join(outputDir, assetUrl.host, pathname);

        element.setAttribute(attributeName, relativePath);

        if (!assetUrls.has(assetUrl.href)) {
          assetUrls.add(assetUrl.href);
          assetDownloadPromises.push(downloadAsset(assetUrl.href, localPath));
        }
      }
    }

    // Process 'src' and 'href' attributes to update links and download assets
    processAttribute('src');
    processAttribute('href');

    // Save the updated HTML content
    await fs.outputFile(path.join(outputDir, 'index.html'), parsedHtml.toString());

    // Wait for all assets to be downloaded
    await Promise.allSettled(assetDownloadPromises);
  } finally {
    // Close the browser instance
    await browser.close();
  }
}

// Example invocation: save the page at `urlToSave` into `outputDir`.
const urlToSave = 'https://example.com/';
const outputDir = 'saved-website';

// Any rejection from the save is reported on stderr rather than left unhandled.
saveWebpage(urlToSave, outputDir).catch((error) => {
  console.error('Error:', error);
});
Run Code Online (Sandbox Code Playgroud)