Puppeteer Bright Data 代理返回 ERR_NO_SUPPORTED_PROXY 或 CERT 错误

nic*_*ng2 9 javascript proxy node.js web-scraping puppeteer

我开始使用 Bright Data,注册了一个帐户,并获得了 Search Engine Crawler 代理。下面是我的抓取函数:

/**
 * Launches headless Chrome through the Bright Data proxy, applies a set of
 * headless-detection overrides to a new page, loads a Google search results
 * page and saves a screenshot to `screenshot.jpeg`.
 *
 * Any error is logged with `console.log` rather than rethrown, so the returned
 * promise never rejects. The browser and the local forwarding proxy created by
 * `proxyChain.anonymizeProxy` are always released, on success and on failure.
 *
 * @returns {Promise<void>}
 */
async function scrape() {
  // Declared outside the `try` so the `finally` block can release them.
  let browser;
  let newProxyUrl;

  try {
    /**
     * Registers scripts that run before any page script on every new document
     * and override properties commonly inspected to detect automation.
     *
     * @param {object} page - Puppeteer page to prepare.
     * @returns {Promise<void>}
     */
    const preparePageForTests = async (page) => {
      const userAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36';

      await page.setUserAgent(userAgent);

      // Report `navigator.webdriver` as false.
      await page.evaluateOnNewDocument(() => {
        Object.defineProperty(navigator, 'webdriver', {
          get: () => false,
        });
      });

      // Pass the Chrome Test.
      // NOTE(review): regular Chrome exposes this object as `window.chrome`;
      // confirm whether `navigator.chrome` is what the targeted check reads.
      await page.evaluateOnNewDocument(() => {
        // We can mock this in as much depth as we need for the test.
        window.navigator.chrome = {
          app: {
            isInstalled: false,
          },
          webstore: {
            onInstallStageChanged: {},
            onDownloadProgress: {},
          },
          runtime: {
            PlatformOs: {
              MAC: 'mac',
              WIN: 'win',
              ANDROID: 'android',
              CROS: 'cros',
              LINUX: 'linux',
              OPENBSD: 'openbsd',
            },
            PlatformArch: {
              ARM: 'arm',
              X86_32: 'x86-32',
              X86_64: 'x86-64',
            },
            PlatformNaclArch: {
              ARM: 'arm',
              X86_32: 'x86-32',
              X86_64: 'x86-64',
            },
            RequestUpdateCheckStatus: {
              THROTTLED: 'throttled',
              NO_UPDATE: 'no_update',
              UPDATE_AVAILABLE: 'update_available',
            },
            OnInstalledReason: {
              INSTALL: 'install',
              UPDATE: 'update',
              CHROME_UPDATE: 'chrome_update',
              SHARED_MODULE_UPDATE: 'shared_module_update',
            },
            OnRestartRequiredReason: {
              APP_UPDATE: 'app_update',
              OS_UPDATE: 'os_update',
              PERIODIC: 'periodic',
            },
          },
        };
      });

      // Answer `notifications` permission queries from Notification.permission
      // and delegate every other query to the native implementation.
      await page.evaluateOnNewDocument(() => {
        const permissions = window.navigator.permissions;
        // Bind the native method: calling it detached from its receiver
        // throws "Illegal invocation".
        const originalQuery = permissions.query.bind(permissions);
        permissions.query = (parameters) => (
          parameters.name === 'notifications' ?
            Promise.resolve({ state: Notification.permission }) :
            originalQuery(parameters)
        );
      });

      await page.evaluateOnNewDocument(() => {
        // Overwrite the `plugins` property to use a custom getter.
        Object.defineProperty(navigator, 'plugins', {
          // This just needs to have `length > 0` for the current test,
          // but we could mock the plugins too if necessary.
          get: () => [1, 2, 3, 4, 5],
        });
      });

      await page.evaluateOnNewDocument(() => {
        // Overwrite the `languages` property to use a custom getter.
        Object.defineProperty(navigator, 'languages', {
          get: () => ['en-US', 'en'],
        });
      });
    };

    // The below is the Search Engine Crawler proxy used from the luminati/bright data sign up.
    // This returns ERR_CERT_INVALID or ERR_CERT_AUTHORITY_INVALID.
    const oldProxyUrl = 'http://lum-customer-customerID-zone-zone1:customerPassword@zproxy.lum-superproxy.io:22225';
    // If this line is commented out, I get the ERR_NO_SUPPORTED_PROXY.
    newProxyUrl = await proxyChain.anonymizeProxy(oldProxyUrl);

    browser = await puppeteerExtra.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        `--proxy-server=${newProxyUrl}`,
        // If I add 'ignoreHTTPSErrors: true' here then I can bypass the CERT errors
        // but then it seems like I can't navigate the browser anymore to a different page.
      ],
    });

    const page = await browser.newPage();

    await preparePageForTests(page);

    await page.setViewport({ width: 1440, height: 1080 });

    await page.goto('https://www.google.com/search?q=concerts+near+new+york');

    await page.screenshot({ path: `screenshot.jpeg` });
  } catch (err) {
    console.log(err);
  } finally {
    // Release the Chrome process and the local forwarding proxy even when a
    // step above failed. Cleanup errors are logged, not thrown, so the
    // function keeps its "never rejects" behaviour.
    if (browser) {
      try {
        await browser.close();
      } catch (closeErr) {
        console.log(closeErr);
      }
    }
    if (newProxyUrl) {
      try {
        await proxyChain.closeAnonymizedProxy(newProxyUrl, true);
      } catch (closeErr) {
        console.log(closeErr);
      }
    }
  }
}
Run Code Online (Sandbox Code Playgroud)

我不知道该如何解决这个问题。我认为问题出在使用 ignoreHTTPSErrors 绕过 CERT 错误上。完全不使用代理时,我的解析函数(它基本上只取下面所示的第一个“ul”列表)工作正常;但使用代理时,不知为何它返回的却是第二页上的数据。

任何帮助将非常感激!

“ul”的格式很好,数据很容易获取:https://i.stack.imgur.com/RwiHM.jpg

只有少数几个“ul”元素是可见的,其余返回的都是一堆我不需要的内容。我尝试执行:

// Click the first element matching the selector from inside the page context.
const targetSelector = ".BXE0fe";
page.$eval(targetSelector, (node) => node.click());
Run Code Online (Sandbox Code Playgroud)

但不知为何,这并没有使页面跳转:https://i.stack.imgur.com/3DTay.png

小智 0

Bright Data 的支持团队肯定可以帮您解决这个问题——您可以通过控制面板中的聊天气泡或 Skype 与他们联系:luminati.io

此问题取决于您所定位的域(Bright Data 会阻止 Google 域,您应该使用 Google 的 SERP 区域)