如何使用 Selenium 从 devtools Network 面板中检索“Initiator”字段?

Adr*_*ian 5 python selenium google-chrome-devtools har selenium-webdriver

我正在尝试提取某个网站发出的所有网络请求 URL,并在它们之间建立层次关系,即确定某个 URL 请求是否触发了另一个请求,类似于一条请求链。

如您所知,在 Network 面板的请求表中有一个名为“Initiator”的字段,它会告诉您某个请求的来源或父请求(如果有的话)。手动操作时,我可以打开浏览器,进入开发人员工具中的 Network 面板,加载网站并下载生成的 HAR 文件。例如:

{
        "startedDateTime": "2019-11-05T17:38:46.775Z",
        "time": 15.676000155508518,
        "request": {
          "method": "POST",
          "url": "https://www.google.com/gen_204?oq=&gs_l=psy-ab.22...0.0..847450...0.0..0.0.0.......0......gws-wiz.",
          "httpVersion": "http/2.0",
          "headers": [
            {
              "name": ":path",
              "value": "/gen_204?oq=&gs_l=psy-ab.22...0.0..847450...0.0..0.0.0.......0......gws-wiz."
            },
            {
              "name": "sec-fetch-mode",
              "value": "no-cors"
            },
            {
              "name": "origin",
              "value": "https://www.google.com"
            },
            {
              "name": "accept-encoding",
              "value": "gzip, deflate, br"
            },
            {
              "name": "accept-language",
              "value": "en-GB,en;q=0.9,en-US;q=0.8,es-US;q=0.7,es;q=0.6"
            },
            {
              "name": "user-agent",
              "value": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/76.0.3809.100 Chrome/76.0.3809.100 Safari/537.36"
            },
            {
              "name": "content-type",
              "value": "text/plain;charset=UTF-8"
            },
            {
              "name": "accept",
              "value": "*/*"
            },
            {
              "name": "referer",
              "value": "https://www.google.com/"
            },
            {
              "name": ":authority",
              "value": "www.google.com"
            },
            {
              "name": "cookie",
              "value": "CONSENT=YES+GB.en+20160414-00-0; SEARCH_SAMESITE=CgQIg44B; ANID=AHWqTUlE3OPRfM5R1dtW0XvyIu2NOdLWoSHEgFemsFslXQTIzFKFCL-7kTtDZAr_; NID=190=Ezp7tXRaU_Rs2BS9RprlsS9QN9-PcwpYNSLwaOVGVFFp6pWepIjDqsYlgyLqb2eATn6HwUNs-SmgzAmtEm63fgX-YWVgbOyX7GU1esPamrN-GWXfwmXyrsqsTBOOQTzsHB3Q89tATDNQE_OKGd0YgCxMp9m9QXke2BJANdKdBYujl-g5tS8ZXcq0pw; 1P_JAR=2019-11-05-17; DV=o32RqCcqMlgsAJonGalrPPWlv0DK4xZ24gV5ztaaewMAAAA"
            },
            {
              "name": ":scheme",
              "value": "https"
            },
            {
              "name": "sec-fetch-site",
              "value": "same-origin"
            },
            {
              "name": "content-length",
              "value": "0"
            },
            {
              "name": ":method",
              "value": "POST"
            }
          ],
          "queryString": [
            {
              "name": "oq",
              "value": ""
            },
            {
              "name": "gs_l",
              "value": "psy-ab.22...0.0..847450...0.0..0.0.0.......0......gws-wiz."
            }
          ],
          "cookies": [
            {
              "name": "CONSENT",
              "value": "YES+GB.en+20160414-00-0",
              "expires": null,
              "httpOnly": false,
              "secure": false
            },
            {
              "name": "SEARCH_SAMESITE",
              "value": "CgQIg44B",
              "expires": null,
              "httpOnly": false,
              "secure": false
            },
            {
              "name": "ANID",
              "value": "AHWqTUlE3OPRfM5R1dtW0XvyIu2NOdLWoSHEgFemsFslXQTIzFKFCL-7kTtDZAr_",
              "expires": null,
              "httpOnly": false,
              "secure": false
            },
            {
              "name": "NID",
              "value": "190=Ezp7tXRaU_Rs2BS9RprlsS9QN9-PcwpYNSLwaOVGVFFp6pWepIjDqsYlgyLqb2eATn6HwUNs-SmgzAmtEm63fgX-YWVgbOyX7GU1esPamrN-GWXfwmXyrsqsTBOOQTzsHB3Q89tATDNQE_OKGd0YgCxMp9m9QXke2BJANdKdBYujl-g5tS8ZXcq0pw",
              "expires": null,
              "httpOnly": false,
              "secure": false
            },
            {
              "name": "1P_JAR",
              "value": "2019-11-05-17",
              "expires": null,
              "httpOnly": false,
              "secure": false
            },
            {
              "name": "DV",
              "value": "o32RqCcqMlgsAJonGalrPPWlv0DK4xZ24gV5ztaaewMAAAA",
              "expires": null,
              "httpOnly": false,
              "secure": false
            }
          ],
          "headersSize": -1,
          "bodySize": 0
        },
        "response": {
          "status": 204,
          "statusText": "",
          "httpVersion": "http/2.0",
          "headers": [
            {
              "name": "date",
              "value": "Tue, 05 Nov 2019 17:38:46 GMT"
            },
            {
              "name": "server",
              "value": "gws"
            },
            {
              "name": "x-frame-options",
              "value": "SAMEORIGIN"
            },
            {
              "name": "content-type",
              "value": "text/html; charset=UTF-8"
            },
            {
              "name": "status",
              "value": "204"
            },
            {
              "name": "alt-svc",
              "value": "quic=\":443\"; ma=2592000; v=\"46,43\",h3-Q050=\":443\"; ma=2592000,h3-Q049=\":443\"; ma=2592000,h3-Q048=\":443\"; ma=2592000,h3-Q046=\":443\"; ma=2592000,h3-Q043=\":443\"; ma=2592000"
            },
            {
              "name": "content-length",
              "value": "0"
            },
            {
              "name": "x-xss-protection",
              "value": "0"
            }
          ],
          "cookies": [],
          "content": {
            "size": 0,
            "mimeType": "text/html"
          },
          "redirectURL": "",
          "headersSize": -1,
          "bodySize": -1,
          "_transferSize": 54
        },
        "cache": {},
        "timings": {
          "blocked": 1.1320006029605865,
          "dns": -1,
          "ssl": -1,
          "connect": -1,
          "send": 0.16199999999999992,
          "wait": 14.122000366747379,
          "receive": 0.25999918580055237,
          "_blocked_queueing": 0.5990006029605865
        },
        "serverIPAddress": "216.58.204.68",
        "_initiator": {
          "type": "script",
          "stack": {
            "callFrames": [
              {
                "functionName": "s_1pb",
                "scriptId": "129",
                "url": "https://www.google.com/xjs/_/js/k=xjs.s.en_GB.UISl_YucLj8.O/ck=xjs.s.or8k_ixGu54.L.W.O/m=Fkg7bd,HcFEGb,IvlUe,MC8mtf,OF7gzc,RMhBfe,T4BAC,TJw5qb,TbaHGc,Y33vzc,cdos,hsm,iDPoPb,jsa,mvYTse,tg8oTe,uz938c,vWNDde,ws9Tlc,yQ43ff,d,csi/am=BAAAsAjYuwOC_L8VAAQAfAYAAAFuwQYLhCGhYqwOEAE/d=1/dg=2/br=1/ct=zgms/rs=ACT90oGdwE1ooFdbHyz-Vk2BhYjwAv-QDQ",
                "lineNumber": 2323,
                "columnNumber": 376
              },
Run Code Online (Sandbox Code Playgroud)

在这种情况下,URL `https://www.google.com/gen_204?oq=&gs_l=psy-ab.22...` 是由 URL `https://www.google.com/xjs/_/js/k=xjs.s.en_GB.UISl...` 发起的。您可以在键 "_initiator" -> "stack" -> "callFrames" -> "url" 中看到此信息。

我的想法是用 Selenium 获取这一信息(即发起其他请求的 URL),或者用 Selenium 自动下载 HAR 文件。我尝试过以下方法:

1. Selenium + browsermob 代理。问题:生成的 HAR 文件没有“Initiator”字段,因此无法把发起方请求与它所触发的请求关联起来。

2. Selenium 性能日志。我使用以下代码从 Selenium 获取性能日志:

{
        "startedDateTime": "2019-11-05T17:38:46.775Z",
        "time": 15.676000155508518,
        "request": {
          "method": "POST",
          "url": "https://www.google.com/gen_204?oq=&gs_l=psy-ab.22...0.0..847450...0.0..0.0.0.......0......gws-wiz.",
          "httpVersion": "http/2.0",
          "headers": [
            {
              "name": ":path",
              "value": "/gen_204?oq=&gs_l=psy-ab.22...0.0..847450...0.0..0.0.0.......0......gws-wiz."
            },
            {
              "name": "sec-fetch-mode",
              "value": "no-cors"
            },
            {
              "name": "origin",
              "value": "https://www.google.com"
            },
            {
              "name": "accept-encoding",
              "value": "gzip, deflate, br"
            },
            {
              "name": "accept-language",
              "value": "en-GB,en;q=0.9,en-US;q=0.8,es-US;q=0.7,es;q=0.6"
            },
            {
              "name": "user-agent",
              "value": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/76.0.3809.100 Chrome/76.0.3809.100 Safari/537.36"
            },
            {
              "name": "content-type",
              "value": "text/plain;charset=UTF-8"
            },
            {
              "name": "accept",
              "value": "*/*"
            },
            {
              "name": "referer",
              "value": "https://www.google.com/"
            },
            {
              "name": ":authority",
              "value": "www.google.com"
            },
            {
              "name": "cookie",
              "value": "CONSENT=YES+GB.en+20160414-00-0; SEARCH_SAMESITE=CgQIg44B; ANID=AHWqTUlE3OPRfM5R1dtW0XvyIu2NOdLWoSHEgFemsFslXQTIzFKFCL-7kTtDZAr_; NID=190=Ezp7tXRaU_Rs2BS9RprlsS9QN9-PcwpYNSLwaOVGVFFp6pWepIjDqsYlgyLqb2eATn6HwUNs-SmgzAmtEm63fgX-YWVgbOyX7GU1esPamrN-GWXfwmXyrsqsTBOOQTzsHB3Q89tATDNQE_OKGd0YgCxMp9m9QXke2BJANdKdBYujl-g5tS8ZXcq0pw; 1P_JAR=2019-11-05-17; DV=o32RqCcqMlgsAJonGalrPPWlv0DK4xZ24gV5ztaaewMAAAA"
            },
            {
              "name": ":scheme",
              "value": "https"
            },
            {
              "name": "sec-fetch-site",
              "value": "same-origin"
            },
            {
              "name": "content-length",
              "value": "0"
            },
            {
              "name": ":method",
              "value": "POST"
            }
          ],
          "queryString": [
            {
              "name": "oq",
              "value": ""
            },
            {
              "name": "gs_l",
              "value": "psy-ab.22...0.0..847450...0.0..0.0.0.......0......gws-wiz."
            }
          ],
          "cookies": [
            {
              "name": "CONSENT",
              "value": "YES+GB.en+20160414-00-0",
              "expires": null,
              "httpOnly": false,
              "secure": false
            },
            {
              "name": "SEARCH_SAMESITE",
              "value": "CgQIg44B",
              "expires": null,
              "httpOnly": false,
              "secure": false
            },
            {
              "name": "ANID",
              "value": "AHWqTUlE3OPRfM5R1dtW0XvyIu2NOdLWoSHEgFemsFslXQTIzFKFCL-7kTtDZAr_",
              "expires": null,
              "httpOnly": false,
              "secure": false
            },
            {
              "name": "NID",
              "value": "190=Ezp7tXRaU_Rs2BS9RprlsS9QN9-PcwpYNSLwaOVGVFFp6pWepIjDqsYlgyLqb2eATn6HwUNs-SmgzAmtEm63fgX-YWVgbOyX7GU1esPamrN-GWXfwmXyrsqsTBOOQTzsHB3Q89tATDNQE_OKGd0YgCxMp9m9QXke2BJANdKdBYujl-g5tS8ZXcq0pw",
              "expires": null,
              "httpOnly": false,
              "secure": false
            },
            {
              "name": "1P_JAR",
              "value": "2019-11-05-17",
              "expires": null,
              "httpOnly": false,
              "secure": false
            },
            {
              "name": "DV",
              "value": "o32RqCcqMlgsAJonGalrPPWlv0DK4xZ24gV5ztaaewMAAAA",
              "expires": null,
              "httpOnly": false,
              "secure": false
            }
          ],
          "headersSize": -1,
          "bodySize": 0
        },
        "response": {
          "status": 204,
          "statusText": "",
          "httpVersion": "http/2.0",
          "headers": [
            {
              "name": "date",
              "value": "Tue, 05 Nov 2019 17:38:46 GMT"
            },
            {
              "name": "server",
              "value": "gws"
            },
            {
              "name": "x-frame-options",
              "value": "SAMEORIGIN"
            },
            {
              "name": "content-type",
              "value": "text/html; charset=UTF-8"
            },
            {
              "name": "status",
              "value": "204"
            },
            {
              "name": "alt-svc",
              "value": "quic=\":443\"; ma=2592000; v=\"46,43\",h3-Q050=\":443\"; ma=2592000,h3-Q049=\":443\"; ma=2592000,h3-Q048=\":443\"; ma=2592000,h3-Q046=\":443\"; ma=2592000,h3-Q043=\":443\"; ma=2592000"
            },
            {
              "name": "content-length",
              "value": "0"
            },
            {
              "name": "x-xss-protection",
              "value": "0"
            }
          ],
          "cookies": [],
          "content": {
            "size": 0,
            "mimeType": "text/html"
          },
          "redirectURL": "",
          "headersSize": -1,
          "bodySize": -1,
          "_transferSize": 54
        },
        "cache": {},
        "timings": {
          "blocked": 1.1320006029605865,
          "dns": -1,
          "ssl": -1,
          "connect": -1,
          "send": 0.16199999999999992,
          "wait": 14.122000366747379,
          "receive": 0.25999918580055237,
          "_blocked_queueing": 0.5990006029605865
        },
        "serverIPAddress": "216.58.204.68",
        "_initiator": {
          "type": "script",
          "stack": {
            "callFrames": [
              {
                "functionName": "s_1pb",
                "scriptId": "129",
                "url": "https://www.google.com/xjs/_/js/k=xjs.s.en_GB.UISl_YucLj8.O/ck=xjs.s.or8k_ixGu54.L.W.O/m=Fkg7bd,HcFEGb,IvlUe,MC8mtf,OF7gzc,RMhBfe,T4BAC,TJw5qb,TbaHGc,Y33vzc,cdos,hsm,iDPoPb,jsa,mvYTse,tg8oTe,uz938c,vWNDde,ws9Tlc,yQ43ff,d,csi/am=BAAAsAjYuwOC_L8VAAQAfAYAAAFuwQYLhCGhYqwOEAE/d=1/dg=2/br=1/ct=zgms/rs=ACT90oGdwE1ooFdbHyz-Vk2BhYjwAv-QDQ",
                "lineNumber": 2323,
                "columnNumber": 376
              },
Run Code Online (Sandbox Code Playgroud)

在这些日志中,唯一能找到“initiator”字段的地方是“Network.requestWillBeSent”事件,其中带有一个 URL,但它与其他请求没有关联。似乎每条“message”记录都是独立的,无法把发起方请求与它所触发的请求关联起来。

3. driver.execute_script。我在这个问题中看到了这段代码:How to access Network panel on google chrome developer tools with selenium?
# Collect Chrome's 'performance' log through Selenium.
# NOTE(review): this snippet reads the 'performance' log, so it appears to belong
# to the "Selenium performance logs" attempt described in the surrounding text
# rather than to driver.execute_script — the page's code samples look shifted.

# Set the 'loggingPrefs' capability so the 'performance' log is enabled at level 'ALL'.
# NOTE(review): newer ChromeDriver releases may expect this capability under the
# key 'goog:loggingPrefs' — confirm against the ChromeDriver version in use.
caps = DesiredCapabilities.CHROME
caps['loggingPrefs'] = {'performance': 'ALL'}

# NOTE(review): chrome_options is created here but never passed to webdriver.Chrome below.
chrome_options = webdriver.ChromeOptions()
# driver_path is not defined in this snippet; presumably the path to the
# chromedriver executable — verify before running.
driver = webdriver.Chrome(driver_path,desired_capabilities=caps)
driver.get("http://google.com")

# Retrieve the entries recorded in the 'performance' log for the page load above.
browser_log = driver.get_log('performance')

Run Code Online (Sandbox Code Playgroud)

结果日志与 HAR 文件或性能日志完全不同,它没有任何可用于关联请求的信息。例子:

{
    "startTime": 0,
    "initiatorType": "navigation",
    "unloadEventStart": 0,
    "fetchStart": 69.29999962449074,
    "duration": 1311.3000001758337,
    "responseStart": 172.89999965578318,
    "nextHopProtocol": "h2",
    "transferSize": 68052,
    "connectStart": 70.19999995827675,
    "domainLookupStart": 70.19999995827675,
    "redirectStart": 0,
    "domContentLoadedEventEnd": 504.90000005811453,
    "responseEnd": 190.7999999821186,
    "requestStart": 100.49999970942736,
    "type": "navigate",
    "secureConnectionStart": 80.40000032633543,
    "connectEnd": 99.99999962747097,
    "redirectCount": 0,
    "workerStart": 0,
    "decodedBodySize": 233300,
    "loadEventStart": 1304.8000000417233,
    "encodedBodySize": 67329,
    "serverTiming": [],
    "entryType": "navigation",
    "domInteractive": 487.699999473989,
    "domContentLoadedEventStart": 487.80000023543835,
    "redirectEnd": 0,
    "name": "https://www.google.com/?gws_rd=ssl",
    "domainLookupEnd": 70.19999995827675,
    "unloadEventEnd": 0,
    "loadEventEnd": 1311.3000001758337,
    "domComplete": 1303.3999996259809,
    "toJSON": {}
  },
  {
    "initiatorType": "img",
    "fetchStart": 298.20000007748604,
    "duration": 14.100000262260437,
    "responseStart": 310.70000026375055,
    "responseEnd": 312.3000003397465,
    "transferSize": 6146,
    "connectStart": 298.20000007748604,
    "domainLookupStart": 298.20000007748604,
    "redirectStart": 0,
    "toJSON": {},
    "requestStart": 299.80000015348196,
    "secureConnectionStart": 0,
    "connectEnd": 298.20000007748604,
    "workerStart": 0,
    "decodedBodySize": 5969,
    "startTime": 298.20000007748604,
    "encodedBodySize": 5969,
    "serverTiming": [],
    "entryType": "resource",
    "redirectEnd": 0,
    "name": "https://www.google.com/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png",
    "domainLookupEnd": 298.20000007748604,
    "nextHopProtocol": "h2"
  },
Run Code Online (Sandbox Code Playgroud)

因此,我认为从开发者工具网络面板下载的 HAR 文件是允许在 URL 请求之间建立关系的唯一信息来源。由于我正在抓取许多网站,因此我需要使该过程自动化,但我找不到方法来做到这一点。

任何关于自动下载 HAR 文件以直接使用 Selenium 提取我需要的信息的想法都将不胜感激。

fee*_*ine 1

您可以从页面中每个条目的 raw_entry 里获取 _initiator。

# Build a mapping "initiator URL -> list of request URLs it triggered" from a
# HAR file exported from the Chrome DevTools Network panel.
import json
from collections import defaultdict

from haralyzer import HarParser  # third-party HAR parser this snippet relies on

# Load the HAR file (HAR is JSON text encoded as UTF-8).
with open('at.har', 'r', encoding='utf-8') as f:
    har_parser = HarParser(json.load(f))

# Only the first page recorded in the HAR file is analysed.
pages_root = har_parser.pages[0]

# Maps each request URL to the URL of the resource that initiated it.
initiator_dict = {}
for entry in pages_root.entries:
    # "_initiator" is a Chrome-specific extension field; HAR files produced by
    # other tools may omit it, so fall back to an empty dict instead of failing.
    initiator = entry.raw_entry.get('_initiator', {})
    if "url" in initiator:
        initiator_dict[entry.request.url] = initiator['url']
    else:
        # Script-initiated requests have no top-level "url"; the initiating
        # script is recorded in the call stack ("stack" -> "callFrames" -> "url").
        call_frames = initiator.get('stack', {}).get('callFrames', [])
        if call_frames and call_frames[0].get('url'):
            initiator_dict[entry.request.url] = call_frames[0]['url']

# Invert the mapping: initiator URL -> request URLs it triggered, in sorted order.
res = defaultdict(list)
for key, val in sorted(initiator_dict.items()):
    res[val].append(key)
Run Code Online (Sandbox Code Playgroud)