add*_*ddy 4 javascript node.js puppeteer
下面是我尝试用于 Google 网络抓取的代码。当我传递特定请求时,它不会返回链接列表。我不明白是什么原因造成的。有人可以帮忙吗?
const puppeteer = require("puppeteer");
/**
 * Runs a Google search in a Puppeteer-driven browser and scrapes the results.
 *
 * @param {string} searchQuery - Text typed into Google's search box.
 * @returns {Promise<Array<{title: string, link: string}>>} One entry per node
 *   matched by the `.rc .r` selector that contains both a `.LC20lb` title and
 *   an anchor; an empty array when nothing matches.
 * @throws Re-throws any error raised by Puppeteer (launch, navigation, missing
 *   search input, ...) after the browser has been closed.
 */
const searchGoogle = async (searchQuery) => {
  /** Puppeteer launches headless by default; show the browser window instead. */
  const browser = await puppeteer.launch({
    headless: false,
  });

  // try/finally guarantees the browser is closed even when a step below throws.
  try {
    const page = await browser.newPage();
    await page.goto("https://www.google.com/");
    await page.type('input[aria-label="Search"]', searchQuery);
    await page.keyboard.press("Enter");

    /**
     * Wait while the results page loads, otherwise the evaluate call fails.
     * NOTE(review): a fixed delay is race-prone and `page.waitFor` is
     * deprecated in newer Puppeteer releases; prefer `page.waitForSelector`.
     */
    await page.waitFor(5000);

    // The scraped list must be returned, otherwise callers only get undefined.
    return await page.evaluate(() => {
      const data = [];
      /** This selector is site-specific and can be changed for other websites. */
      for (const result of document.querySelectorAll(".rc .r")) {
        const titleElement = result.querySelector(".LC20lb");
        const anchor = result.querySelector("a");
        // Skip nodes missing a title or link instead of throwing on null.
        if (!titleElement || !anchor) {
          continue;
        }
        data.push({
          title: titleElement.innerText.trim().replace(/(\r\n|\n|\r)/gm, " "),
          link: anchor.href,
        });
      }
      return data;
    });
  } finally {
    await browser.close();
  }
};
module.exports = searchGoogle;
Run Code Online (Sandbox Code Playgroud)
`await page.waitFor(5000);` 在这里会造成竞争条件。如果页面未能在 5 秒内加载完成,就可能出现漏报(拿不到本应存在的结果);如果页面不到 5 秒就加载完毕,那么剩余的等待时间就白白浪费了。只有在万不得已,或者延迟本身就是应用逻辑的预期组成部分时,才应使用任意的固定延迟。
更好的方法是使用 page.waitForSelector 或 page.waitForNavigation。
其次,选择器 .rc .r 在我这里没有匹配到任何结果。我不确定 Google 的 CSS 选择器有多稳定,但粗略来看,.LC20lb 目前似乎是可靠的。
把它放在一起给出:
const puppeteer = require("puppeteer"); // ^19.6.3
let browser;

/**
 * Searches Google for a fixed query with Puppeteer and logs an array of
 * `{title, link}` objects, one per `.LC20lb` heading found on the results page.
 * Any error is logged with `console.error`; the browser is closed either way.
 */
const main = async () => {
  const searchQuery = "stack overflow";

  try {
    browser = await puppeteer.launch();
    const openPages = await browser.pages();
    const page = openPages[0];

    // Only let document requests through; every other resource type is aborted.
    await page.setRequestInterception(true);
    page.on("request", (req) => {
      if (req.resourceType() === "document") {
        return req.continue();
      }
      return req.abort();
    });

    await page.goto("https://www.google.com/", {
      waitUntil: "domcontentloaded",
    });
    await page.waitForSelector('input[aria-label="Search"]', {
      visible: true,
    });
    await page.type('input[aria-label="Search"]', searchQuery);

    // Start waiting for the navigation before pressing Enter so it is not missed.
    const navigation = page.waitForNavigation({
      waitUntil: "domcontentloaded",
    });
    const submit = page.keyboard.press("Enter");
    await Promise.all([navigation, submit]);

    await page.waitForSelector(".LC20lb", { visible: true });
    const searchResults = await page.$$eval(".LC20lb", (headings) =>
      headings.map((heading) => {
        // The link is read from the heading's parent node.
        return { title: heading.innerText, link: heading.parentNode.href };
      })
    );
    console.log(searchResults);
  } catch (err) {
    console.error(err);
  } finally {
    await browser?.close();
  }
};

main();
Run Code Online (Sandbox Code Playgroud)
输出(您的输出可能会有所不同,具体取决于运行脚本时 Google 显示的内容):
[
{
title: 'Stack Overflow - Where Developers Learn, Share, & Build ...',
link: 'https://stackoverflow.com/'
},
{
title: 'Stack Overflow - Wikipedia',
link: 'https://en.wikipedia.org/wiki/Stack_Overflow'
},
{
title: 'Stack Overflow Blog - Essays, opinions, and advice on the act ...',
link: 'https://stackoverflow.blog/'
},
{
title: 'The Stack Overflow Podcast - Stack Overflow Blog',
link: 'https://stackoverflow.blog/podcast/'
},
{
title: 'Stack Overflow | LinkedIn',
link: 'https://www.linkedin.com/company/stack-overflow'
}
]
Run Code Online (Sandbox Code Playgroud)
另一种方法是将搜索词编码为 URL 查询参数,直接导航到 https://www.google.com/search?q=your+query+here,从而省去一次页面导航,并避免搜索框选择器可能带来的问题。
与许多抓取任务一样,由于目标只是从文档中获取简单的 href,您可以尝试改用 fetch/cheerio,直接处理静态 HTML。在我的机器上,下面的脚本比需要两次导航的 Puppeteer 方案快约 5 倍,比直接导航到搜索结果页的 Puppeteer 方案快约 3 倍。
const cheerio = require("cheerio"); // 1.0.0-rc.12
const query = "stack overflow";
const url = `https://www.google.com/search?q=${encodeURIComponent(query)}`;

/**
 * Fetches the static Google results page for `query`, then logs an array of
 * `{title, link}` objects, one per `.LC20lb` heading in the returned HTML.
 * A non-2xx response or any network/parsing error is logged and marks the
 * process as failed instead of surfacing as an unhandled rejection.
 */
fetch(url, { // Node 18 or install node-fetch
  headers: {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
  },
})
  .then(res => {
    // An error or rate-limit page would otherwise parse into an empty list.
    if (!res.ok) {
      throw new Error(
        `Google search request failed: ${res.status} ${res.statusText}`
      );
    }
    return res.text();
  })
  .then(html => {
    const $ = cheerio.load(html);
    const searchResults = [...$(".LC20lb")].map(e => ({
      title: $(e).text().trim(),
      // The link is read from the heading's parent node.
      link: e.parentNode.attribs.href,
    }));
    console.log(searchResults);
  })
  .catch(err => {
    console.error(err);
    // Keep a non-zero exit status so callers can still detect the failure.
    process.exitCode = 1;
  });
Run Code Online (Sandbox Code Playgroud)
另请参阅使用 Puppeteer 单击第一个 Google 搜索结果上的元素。
归档时间: |
|
查看次数: |
6429 次 |
最近记录: |