uda*_*ran 11 google-chrome headless headless-browser google-chrome-headless
我希望使用 Chrome 无头浏览器实现"另存为"功能,以保存包含所有资源的完整网页。有没有办法做到这一点?我已经尝试过 --print-to-pdf 和 --screenshot 选项,现在想进一步了解是否有类似"另存为"的选项。
小智 1
这是完全可能的,但并不容易,繁重的工作必须由你自己完成。这意味着:加载页面并获取其 HTML;解析 HTML,找出其中引用的所有资源(例如 src 和 href 属性);将每个资源下载到本地目录;把 HTML 中的引用改写为本地路径;最后保存修改后的 HTML。
下面是一个使用 Playwright 的示例。(请注意,此代码是从现有项目中截取并为本回答整理过的,可能无法完美运行。)
const { webkit } = require('playwright');
const { parse } = require('node-html-parser');
const fs = require('fs-extra');
const path = require('path');
const url = require('url');
/**
 * Save a web page and the resources it references to a local directory.
 *
 * The page is rendered in headless WebKit and its HTML is parsed. Every `src` /
 * `href` attribute that points at an http(s) resource is downloaded to
 * `<outputDir>/<host>/<path>`. References whose download succeeded are rewritten
 * to that local path; references whose download failed are rewritten to the
 * absolute URL so the saved page still works while online. The resulting
 * document is written to `<outputDir>/index.html`.
 *
 * Known limitations: `srcset`, inline styles and `url(...)` references inside
 * CSS files are not followed, and query strings are ignored when choosing the
 * local file name (two URLs differing only by query share one file).
 *
 * @param {string} urlToSave - Absolute URL of the page to save.
 * @param {string} outputDir - Directory receiving `index.html` and the assets.
 * @returns {Promise<void>} Resolves when the HTML and all assets are written.
 *   Individual asset failures are logged and do not reject the promise.
 */
async function saveWebpage(urlToSave, outputDir) {
  const DOWNLOADABLE_PROTOCOLS = new Set(['http:', 'https:']);
  // Hyperlinks are navigation targets, not resources of this page.
  const NAVIGATION_TAGS = new Set(['a', 'area']);

  const browser = await webkit.launch();
  try {
    const context = await browser.newContext();
    const page = await context.newPage();

    await page.goto(urlToSave);
    const html = await page.content();
    // Use the final URL so relative references resolve correctly after redirects.
    const pageUrl = page.url();

    const parsedHtml = parse(html);

    // Resolve `reference` against `base`; null when it is not a valid URL.
    const resolveUrl = (reference, base) => {
      try {
        return new URL(reference, base);
      } catch {
        return null;
      }
    };

    // A <base href> may itself be relative, so resolve it against the page URL.
    const baseTag = parsedHtml.querySelector('base');
    const baseHref = baseTag ? baseTag.getAttribute('href') : undefined;
    const resolvedBase = baseHref ? resolveUrl(baseHref, pageUrl) : null;
    const baseUrl = resolvedBase ? resolvedBase.href : pageUrl;
    // All references become local or absolute below; a leftover <base href>
    // would make the saved copy resolve its local paths against the remote site.
    if (baseTag) {
      baseTag.removeAttribute('href');
    }

    // Map a remote URL to its on-disk location and to the path used in the HTML.
    const toLocalPaths = (assetUrl) => {
      const segments = [
        assetUrl.host.replace(/:/g, '_'), // ':' (port) is not a valid file-name character on Windows
        ...assetUrl.pathname.split('/').filter(Boolean),
      ];
      // A URL ending in '/' names a directory; give the document a file name.
      if (assetUrl.pathname.endsWith('/')) {
        segments.push('index.html');
      }
      return {
        localPath: path.join(outputDir, ...segments),
        // Always '/'-separated, and re-encoded so a literal '%20' in the file
        // name is not decoded by the browser when it loads the saved page.
        relativePath: segments.map(encodeURIComponent).join('/'),
      };
    };

    // absolute URL (without fragment) -> { localPath, relativePath, references }
    const assets = new Map();

    // Record every element whose `attributeName` points at a downloadable asset.
    const collectReferences = (attributeName) => {
      for (const element of parsedHtml.querySelectorAll(`[${attributeName}]`)) {
        if (element === baseTag) continue;

        const reference = (element.getAttribute(attributeName) || '').trim();
        // Empty values and same-document fragments are left untouched.
        if (reference === '' || reference.startsWith('#')) continue;

        const assetUrl = resolveUrl(reference, baseUrl);
        // Skips data:, mailto:, javascript:, blob:, tel:, ... and malformed URLs.
        if (!assetUrl || !DOWNLOADABLE_PROTOCOLS.has(assetUrl.protocol)) continue;

        if (NAVIGATION_TAGS.has(String(element.tagName).toLowerCase())) {
          // Keep links usable from disk by pointing them at the original site.
          element.setAttribute(attributeName, assetUrl.href);
          continue;
        }

        // The fragment is not sent to the server; keep it for the rewritten link only.
        const fragment = assetUrl.hash;
        assetUrl.hash = '';

        let asset = assets.get(assetUrl.href);
        if (!asset) {
          asset = { ...toLocalPaths(assetUrl), references: [] };
          assets.set(assetUrl.href, asset);
        }
        asset.references.push({ element, attributeName, fragment });
      }
    };

    // Download one asset to disk; returns true on success, logs and returns false otherwise.
    const downloadAsset = async (assetUrl, localPath) => {
      try {
        // The context's request API shares cookies with the page and does not
        // open (and leak) a browser page per asset.
        const response = await context.request.get(assetUrl);
        if (!response.ok()) {
          console.error(`Error fetching asset: ${assetUrl} - HTTP ${response.status()}`);
          return false;
        }
        await fs.outputFile(localPath, await response.body());
        return true;
      } catch (error) {
        console.error(`Error fetching asset: ${assetUrl} - ${error.message}`);
        return false;
      }
    };

    collectReferences('src');
    collectReferences('href');

    // downloadAsset never rejects, so Promise.all cannot fail fast here.
    await Promise.all(
      [...assets].map(async ([assetUrl, asset]) => {
        const isSaved = await downloadAsset(assetUrl, asset.localPath);
        const target = isSaved ? asset.relativePath : assetUrl;
        for (const { element, attributeName, fragment } of asset.references) {
          element.setAttribute(attributeName, `${target}${fragment}`);
        }
      })
    );

    // Written last so the HTML only points at files that really exist.
    await fs.outputFile(path.join(outputDir, 'index.html'), parsedHtml.toString());
  } finally {
    // Always release the browser, even when navigation or writing fails.
    await browser.close();
  }
}
// Example invocation: save https://example.com/ into ./saved-website and
// report (without rethrowing) any error raised while saving.
const urlToSave = 'https://example.com/';
const outputDir = 'saved-website';

(async () => {
  try {
    await saveWebpage(urlToSave, outputDir);
  } catch (error) {
    console.error('Error:', error);
  }
})();
Run Code Online (Sandbox Code Playgroud)
| 归档时间: |
|
| 查看次数: |
450 次 |
| 最近记录: |