sup*_*rio 5 javascript web-scraping google-chrome-headless puppeteer
我是网络抓取的新手,并想使用puppeteer在网页上下载所有图像:
const puppeteer = require('puppeteer');
/**
 * Opens a visible (non-headless) browser window and loads the blog to scrape.
 * Saving the images is not implemented yet, so the returned promise
 * resolves with `undefined`.
 */
let scrape = async function () {
  // Actual Scraping goes Here...
  const launchOptions = { headless: false };
  const browser = await puppeteer.launch(launchOptions);
  const page = await browser.newPage();
  await page.goto('https://memeculture69.tumblr.com/');
  // Right click and save images
};

// Print whatever scrape() resolved with once it has finished.
scrape().then(function (value) {
  console.log(value); // Success!
});
Run Code Online (Sandbox Code Playgroud)
我看过API文档,但不知道如何实现这一目标。因此,感谢您的帮助。
Gra*_*ler 11
您可以使用以下代码获取一个数组,其中包含页面上所有图像的 src 属性:
// The callback is handed to page.evaluate; it gathers the `src` of every
// entry in `document.images` into a plain array of strings.
const images = await page.evaluate(() => {
  return Array.from(document.images, (img) => img.src);
});
Run Code Online (Sandbox Code Playgroud)
然后您可以使用 Node 的文件系统(fs)模块以及 HTTP 或 HTTPS 模块来下载每个图像。
完整示例:
'use strict';
const fs = require('fs');
const https = require('https');
const puppeteer = require('puppeteer');
/* ============================================================
Promise-Based Download Function
============================================================ */
/**
 * Downloads `url` over HTTPS and writes the response body to `destination`.
 *
 * @param {string} url - HTTPS URL of the resource to fetch.
 * @param {string} destination - Path of the file to create.
 * @returns {Promise<boolean>} Resolves with `true` once the file has been
 *   completely written and closed. Rejects with the error message (a string,
 *   as before) when the request fails, the server answers with a non-2xx
 *   status, or the file cannot be written; the partial file is removed first.
 */
const download = (url, destination) => new Promise((resolve, reject) => {
  const file = fs.createWriteStream(destination);
  let settled = false;

  // Single failure path: release the file handle, delete the partial file,
  // then reject. Guarded so several error events settle the promise once.
  const fail = error => {
    if (settled) {
      return;
    }
    settled = true;
    // Unlink only after the stream has closed so a still-pending open
    // cannot re-create the file after it was deleted.
    file.once('close', () => {
      // fs.unlink needs a callback; a failed cleanup must not mask `error`.
      fs.unlink(destination, () => reject(error.message));
    });
    file.destroy();
  };

  file.on('error', fail);
  file.on('finish', () => {
    // Resolve only after the descriptor is really closed.
    file.close(() => {
      if (!settled) {
        settled = true;
        resolve(true);
      }
    });
  });

  try {
    https.get(url, response => {
      const { statusCode } = response;
      if (statusCode < 200 || statusCode >= 300) {
        // Drain the body so the socket is released, and do not store an
        // error page or redirect stub under an image file name.
        response.resume();
        fail(new Error(`Request for ${url} failed with status code ${statusCode}`));
        return;
      }
      response.on('error', fail);
      response.pipe(file);
    }).on('error', fail);
  } catch (error) {
    // https.get throws synchronously for malformed or non-https URLs.
    fail(error);
  }
});
/* ============================================================
Download All Images
============================================================ */
// Loads the page, collects the `src` of every image, and downloads them one
// after another as image-<index>.png (the name is fixed regardless of the
// real image format).
(async () => {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();
    await page.goto('https://www.example.com/');

    const images = await page.evaluate(() => Array.from(document.images, e => e.src));

    for (const [i, imageUrl] of images.entries()) {
      // download() signals failure by rejecting, so failures are handled in
      // the catch branch; one bad image must not abort the remaining ones.
      try {
        await download(imageUrl, `image-${i}.png`);
        console.log('Success:', imageUrl, 'has been downloaded successfully.');
      } catch (error) {
        console.log('Error:', imageUrl, 'was not downloaded.');
        console.error(error);
      }
    }
  } finally {
    // Always shut the browser down, even when navigation or evaluation throws.
    await browser.close();
  }
})().catch(error => {
  console.error(error);
  process.exitCode = 1;
});
Run Code Online (Sandbox Code Playgroud)
小智 5
我认为逻辑很简单。您只需要创建一个函数,用来接收图像的 URL 并将图像保存到目录中。Puppeteer 只负责抓取图片网址,并将其传递给下载函数。这是一个例子:
const puppeteer = require("puppeteer");
const fs = require("fs");
const request = require("request");
// This is main download function which takes the url of your image
/**
 * Streams the resource at `uri` into the file `filename`.
 *
 * The earlier HEAD request was dropped: its response was never used and its
 * error was ignored, so it only added a second round-trip.
 *
 * @param {string} uri - URL of the image to fetch.
 * @param {string} filename - Path of the file to write.
 * @param {function(Error=): void} callback - Called exactly once: without
 *   arguments after the file has been closed, or with the error if the
 *   request or the file write fails.
 */
function download(uri, filename, callback) {
  let finished = false;

  // Both streams may emit 'error' and 'close'; report only the first outcome.
  const done = (err) => {
    if (finished) {
      return;
    }
    finished = true;
    callback(err);
  };

  request(uri)
    .on("error", done)
    .pipe(fs.createWriteStream(filename))
    .on("error", done)
    .on("close", () => done());
}
/**
 * Opens the blog in a visible browser, reads the URL of the first element
 * matching `img.image`, closes the browser and hands the URL to download().
 *
 * @returns {Promise<void>} Settles once the URL has been read and the
 *   download has been started; rejects if the page cannot be loaded or no
 *   matching image appears.
 */
let scrape = async () => {
  const browser = await puppeteer.launch({ headless: false });
  let imageUrl;

  try {
    const page = await browser.newPage();
    await page.goto("https://memeculture69.tumblr.com/");

    // Wait for the image itself rather than sleeping for a fixed second
    // (page.waitFor is deprecated in newer Puppeteer releases).
    await page.waitForSelector("img.image");

    // Return the `src` string: a DOM element cannot be passed back out of
    // page.evaluate, only serializable values can.
    imageUrl = await page.evaluate(
      () => document.querySelector("img.image").src // image selector
    );
  } finally {
    // The browser is only needed to find the URL.
    await browser.close();
  }

  // Now simply pass the image url to the downloader function to download
  // the image.
  download(imageUrl, "image.png", function(err) {
    if (err) {
      console.error(err);
      return;
    }
    console.log("Image downloaded");
  });
};

scrape().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Run Code Online (Sandbox Code Playgroud)
这是另一个例子。转到Google中的常规搜索,然后下载左上方的Google图片。
const puppeteer = require('puppeteer');
const fs = require('fs');
/**
 * Runs a Google search, locates the logo image in the search form, opens the
 * image URL directly and saves the response body to disk.
 *
 * @returns {Promise<void>} Resolves after the file has been written and the
 *   browser has been closed; rejects if the image cannot be found, loaded or
 *   saved.
 */
async function run() {
  const browser = await puppeteer.launch({
    headless: false
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1200, height: 1200 });
    await page.goto('https://www.google.com/search?q=.net+core&rlz=1C1GGRV_enUS785US785&oq=.net+core&aqs=chrome..69i57j69i60l3j69i65j69i60.999j0j7&sourceid=chrome&ie=UTF-8');

    const IMAGE_SELECTOR = '#tsf > div:nth-child(2) > div > div.logo > a > img';

    // `img.src` is already an absolute URL, so no manual stripping of the
    // leading slash and re-prefixing with the host is needed.
    const imageUrl = await page.evaluate((sel) => {
      const img = document.querySelector(sel);
      return img ? img.src : null;
    }, IMAGE_SELECTOR);

    if (!imageUrl) {
      throw new Error(`No image matched selector: ${IMAGE_SELECTOR}`);
    }
    console.log(imageUrl);

    const viewSource = await page.goto(imageUrl);
    if (!viewSource) {
      throw new Error(`No response received for ${imageUrl}`);
    }

    await fs.promises.writeFile(".googles-20th-birthday-us-5142672481189888-s.png", await viewSource.buffer());
    console.log("The file was saved!");
  } finally {
    // Close the browser on success and on failure, and wait for it.
    await browser.close();
  }
}

run().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Run Code Online (Sandbox Code Playgroud)
如果您有一组要下载的图像,可以根据需要以编程方式更改选择器,然后遍历图像列表,逐个下载它们。
小智 5
如果要跳过手动的dom遍历,则可以直接从页面响应将图像写入磁盘。
例:
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
// Saves every image response received while the page loads into the
// directory of this script, named after the last path segment of its URL.
(async () => {
  const browser = await puppeteer.launch();
  // Writes still in flight; the browser must stay open until they finish,
  // because response.buffer() needs the live page.
  const pendingWrites = [];

  // Returns a file name for an http(s) URL, or '' when none can be derived.
  const fileNameFromUrl = url => {
    try {
      const parsed = new URL(url);
      if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
        return '';
      }
      // Use the pathname only so query strings do not end up in the name.
      return path.basename(parsed.pathname);
    } catch (error) {
      return '';
    }
  };

  try {
    const page = await browser.newPage();

    page.on('response', response => {
      if (response.request().resourceType() !== 'image') {
        return;
      }

      const url = response.url();
      const fileName = fileNameFromUrl(url);
      if (!fileName) {
        return;
      }

      const filePath = path.resolve(__dirname, fileName);
      pendingWrites.push(
        response.buffer()
          .then(file => fs.promises.writeFile(filePath, file))
          // One unreadable response must not take down the whole run.
          .catch(error => console.error(`Could not save ${url}:`, error.message))
      );
    });

    await page.goto('https://memeculture69.tumblr.com/');

    // Drain in rounds: more responses may arrive while earlier ones are awaited.
    while (pendingWrites.length > 0) {
      await Promise.all(pendingWrites.splice(0));
    }
  } finally {
    await browser.close();
  }
})().catch(error => {
  console.error(error);
  process.exitCode = 1;
});
Run Code Online (Sandbox Code Playgroud)
此代码将页面上找到的所有图像保存到 images 文件夹中:
// Saves every response whose URL path ends in .jpg/.png/.svg/.gif into the
// `images` directory, using the last path segment as the file name.
page.on('response', async (response) => {
  try {
    // Match against the path only, so URLs with a query string still count
    // and the file name contains no scheme, host or slashes.
    const { pathname } = new URL(response.url());
    const fileName = pathname.split('/').pop();
    if (!/\.(jpg|png|svg|gif)$/.test(fileName)) {
      return;
    }

    const buffer = await response.buffer();
    // Create the target directory on first use.
    await fs.promises.mkdir('images', { recursive: true });
    // `buffer` is already binary data, so no encoding argument is needed.
    await fs.promises.writeFile(`images/${fileName}`, buffer);
  } catch (error) {
    // A rejection inside an event listener would otherwise go unhandled.
    console.error('Could not save', response.url(), error);
  }
});
Run Code Online (Sandbox Code Playgroud)
| 归档时间: |
|
| 查看次数: |
10064 次 |
| 最近记录: |