rin*_*mon 24 javascript node.js puppeteer
Google 检测到我的浏览器是由软件操纵/控制/自动化(manipulated/controlled/automated)的,因此弹出了 reCAPTCHA。当我手动启动 Chromium 并执行相同的步骤时,不会出现 reCAPTCHA。
Question 1)
在使用 puppeteer 时,是否可以以编程方式解决验证码,或者干脆避免它出现?有什么办法可以解决这个问题?
Question 2)
这是否仅在不使用 headless(无头)模式时才会发生,即:
// Start the browser with a visible window instead of headless mode.
const launchOptions = { headless: false };
const browser = await puppeteer.launch(launchOptions);
Run Code Online (Sandbox Code Playgroud)
或者这是我们必须接受并继续前进的事实?
rin*_*mon 31
尝试使用此npm 包生成随机用户代理。这通常可以解决基于用户代理的保护。
在 puppeteer 中,可以通过 page.setUserAgent 覆盖页面的浏览器用户代理:
// `user-agents` exports a class; an instance represents one randomly chosen user agent.
const UserAgent = require('user-agents');
// ... (launch the browser and open `page` here)
// Instantiate first: calling toString() on the class itself would not yield a user-agent string.
await page.setUserAgent(new UserAgent().toString());
Run Code Online (Sandbox Code Playgroud)
此外,您可以添加这两个额外的插件,
puppeteer-extra-plugin-recaptcha - 使用一行代码自动解决 reCAPTCHA:page.solveRecaptchas()
注意:puppeteer-extra-plugin-recaptcha使用付费服务2captcha
puppeteer-extra-plugin-stealth - 应用各种规避技术,使无头(headless)puppeteer 更难被检测到。
com*_*tos 19
以下是我为绕过验证码及类似拦截所采取的措施:
// Source of random user-agent strings (used via randomUseragent.getRandom()).
const randomUseragent = require('random-useragent');
// Enable stealth mode: register the stealth plugin on the puppeteer-extra wrapper.
const puppeteer = require('puppeteer-extra')
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
puppeteer.use(StealthPlugin())
// Fixed fallback user agent, used when no random user agent is available.
const USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36';
/**
 * Opens a new page on `browser`, applies a set of anti-bot-detection tweaks
 * (random viewport, user agent, patched `navigator` properties) and navigates
 * to `url`.
 *
 * If any setup step or the navigation fails, the page is closed before the
 * error is re-thrown, so a failed call does not leave an orphaned tab behind.
 *
 * @param {object} browser - Puppeteer browser; only `newPage()` is used.
 * @param {string} url - Address to navigate to.
 * @returns {Promise<object>} The configured page after navigation finished.
 * @throws Re-throws whatever error the page setup or `page.goto` raised.
 */
async function createPage(browser, url) {
  // Resource types that are aborted to speed up page loads.
  const blockedResourceTypes = new Set(['stylesheet', 'font', 'image']);

  // Randomize the user agent, falling back to a known-valid one.
  const userAgent = randomUseragent.getRandom() || USER_AGENT;

  const page = await browser.newPage();
  try {
    // Randomize the viewport size a little on every page.
    await page.setViewport({
      width: 1920 + Math.floor(Math.random() * 100),
      height: 3000 + Math.floor(Math.random() * 100),
      deviceScaleFactor: 1,
      hasTouch: false,
      isLandscape: false,
      isMobile: false,
    });
    await page.setUserAgent(userAgent);
    await page.setJavaScriptEnabled(true);
    await page.setDefaultNavigationTimeout(0);

    // Skip images/styles/fonts loading for performance.
    await page.setRequestInterception(true);
    page.on('request', (req) => {
      if (blockedResourceTypes.has(req.resourceType())) {
        req.abort();
      } else {
        req.continue();
      }
    });

    await page.evaluateOnNewDocument(() => {
      // Pass webdriver check.
      Object.defineProperty(navigator, 'webdriver', {
        get: () => false,
      });
    });

    await page.evaluateOnNewDocument(() => {
      // Pass chrome check.
      window.chrome = {
        runtime: {},
        // etc.
      };
    });

    await page.evaluateOnNewDocument(() => {
      // Pass notifications check.
      const { permissions } = window.navigator;
      // Keep the original method bound to its owner: invoking it detached
      // (as a bare function) makes every non-notification query throw.
      const originalQuery = permissions.query.bind(permissions);
      permissions.query = (parameters) => (
        parameters.name === 'notifications'
          ? Promise.resolve({ state: Notification.permission })
          : originalQuery(parameters)
      );
    });

    await page.evaluateOnNewDocument(() => {
      // Overwrite the `plugins` property to use a custom getter.
      Object.defineProperty(navigator, 'plugins', {
        // This just needs to have `length > 0` for the current test,
        // but we could mock the plugins too if necessary.
        get: () => [1, 2, 3, 4, 5],
      });
    });

    await page.evaluateOnNewDocument(() => {
      // Overwrite the `languages` property to use a custom getter.
      Object.defineProperty(navigator, 'languages', {
        get: () => ['en-US', 'en'],
      });
    });

    await page.goto(url, { waitUntil: 'networkidle2', timeout: 0 });
    return page;
  } catch (error) {
    // Best-effort cleanup: the original failure is what the caller needs to
    // see, so a secondary error while closing the page is ignored.
    await page.close().catch(() => {});
    throw error;
  }
}
您是否尝试过设置浏览器的用户代理(user agent)?
// A browser user-agent string starts with the "Mozilla/5.0" product token;
// without it the value does not look like a real Chrome user agent.
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36');
Run Code Online (Sandbox Code Playgroud)
经过几次测试后,我发现有几个软件包可以帮助避免触发 reCAPTCHA:
//const puppeteer = require('puppeteer');
const puppeteerExtra = require('puppeteer-extra');
const pluginStealth = require('puppeteer-extra-plugin-stealth');
const randomUseragent = require('random-useragent');
/**
 * Thin wrapper around one puppeteer-extra browser and one page, used to
 * fetch the HTML source of links one at a time.
 */
class PuppeteerService {
  constructor() {
    // Everything stays null until initiate() has been called.
    this.browser = null;
    this.page = null;
    this.pageOptions = null;
    this.waitForFunction = null;
    this.isLinkCrawlTest = null;
  }

  /**
   * Launches a non-headless browser with the stealth plugin, opens the page
   * used by crawl() and installs a request filter on it.
   *
   * @param {object} countsLimitsData - Its
   *   `millisecondsTimeoutSourceRequestCount` property is used as the
   *   navigation timeout.
   * @param {boolean} isLinkCrawlTest - When truthy, crawl() closes the
   *   browser after its first call.
   * @returns {Promise<void>}
   */
  async initiate(countsLimitsData, isLinkCrawlTest) {
    this.pageOptions = {
      waitUntil: 'networkidle2',
      timeout: countsLimitsData.millisecondsTimeoutSourceRequestCount,
    };
    this.waitForFunction = 'document.querySelector("body")';
    puppeteerExtra.use(pluginStealth());
    this.browser = await puppeteerExtra.launch({ headless: false });
    this.page = await this.browser.newPage();
    await this.page.setRequestInterception(true);
    this.page.on('request', (request) => {
      // Abort these resource types; every other request goes through.
      if (['image', 'stylesheet', 'font', 'script'].includes(request.resourceType())) {
        request.abort();
      } else {
        request.continue();
      }
    });
    this.isLinkCrawlTest = isLinkCrawlTest;
  }

  /**
   * Navigates the shared page to `link` with a random user agent and returns
   * its HTML source.
   *
   * @param {string} link - Address to load.
   * @returns {Promise<{isValidPage: boolean, pageSource: (string|null)}>}
   *   `isValidPage` is false (and `pageSource` stays null) when any step of
   *   loading the page threw.
   */
  async crawl(link) {
    const userAgent = randomUseragent.getRandom();
    const crawlResults = { isValidPage: true, pageSource: null };
    try {
      await this.page.setUserAgent(userAgent);
      await this.page.goto(link, this.pageOptions);
      await this.page.waitForFunction(this.waitForFunction);
      crawlResults.pageSource = await this.page.content();
    } catch (error) {
      // Deliberate best-effort: a failed load is reported through the result
      // flag instead of being thrown at the caller.
      crawlResults.isValidPage = false;
    }
    if (this.isLinkCrawlTest) {
      // Await the shutdown so the promise is not left floating.
      await this.close();
    }
    return crawlResults;
  }

  /**
   * Closes the browser if one was launched; does nothing otherwise. Safe to
   * call more than once.
   *
   * @returns {Promise<void>}
   */
  async close() {
    if (!this.browser) {
      return;
    }
    const browser = this.browser;
    // Drop the references first so a repeated call cannot close twice.
    this.browser = null;
    this.page = null;
    await browser.close();
  }
}
// Export a single PuppeteerService instance (not the class itself).
const puppeteerService = new PuppeteerService();
module.exports = puppeteerService;
Run Code Online (Sandbox Code Playgroud)
| 归档时间: |
|
| 查看次数: |
33636 次 |
| 最近记录: |