如何使用 R 中的网络抓取从 Power BI 仪表板获取表格

use*_*007 3 r rselenium rvest

我正在使用 R 进行数据提取任务。数据发布在 Power BI 仪表板中,所以获取起来非常麻烦。我在这里找到了解决方案:

使用 R 抓取网站的 Power BI 仪表板

但我不确定如何在页面中导航以获取组件并提取表格。我的代码如下:

library(wdman)
library(RSelenium)
library(xml2)
library(selectr)
library(tidyverse)
library(rvest)

# Start a Selenium server together with a Firefox client.
# rsDriver() returns a list holding both the server process and an
# already-opened client, so keep the whole handle (to be able to stop the
# server later) and reuse its client instead of creating a second
# remoteDriver and opening another browser session.
rD <- rsDriver(
  port = 4445L,
  browser = "firefox"
)
remDr <- rD[["client"]]

# navigate to the site you wish to analyze
report_url <- "https://app.powerbi.com/view?r=eyJrIjoiOGI5Yzg2MGYtZmNkNy00ZjA5LTlhYTYtZTJjNjg2NTY2YTlmIiwidCI6ImI1NDE0YTdiLTcwYTYtNGUyYi05Yzc0LTM1Yjk0MDkyMjk3MCJ9"
remDr$navigate(report_url)

# fetch the data
# NOTE(review): the report is presumably rendered asynchronously, so the
# page source read right after navigate() may not contain the table yet --
# confirm whether a wait is needed here.
data_table <- read_html(remDr$getPageSource()[[1]]) %>%
  querySelector("div.pivotTable")

# When finished, release the browser and the server:
# remDr$close()
# rD[["server"]]$stop()
Run Code Online (Sandbox Code Playgroud)

虽然 Selenium 进程可以正常运行,但我不知道如何获取我想要的表格:

在此输入图像描述

蓝色箭头显示了我想要的表格,然后我需要移动到其他页面来提取剩余的表格。但我想如果第一页能做到的话,其他页面也会一样。

非常感谢!

dcs*_*uka 8

这些表有点棘手,因为只有滚动激活它们后,新行才会出现在页面源代码中。我的解决方案是一个函数,可以逐行抓取并单独添加到整体数据帧中,必要时滚动。它将 Power BI 表的可视容器编号作为输入。

\n

R 中的解决方案如下:

\n
library(wdman)
library(RSelenium)

# Start a standalone Selenium server on port 4444.
selServ <- selenium(
  port = 4444L,
  version = "latest",
  chromever = "103.0.5060.134" # set to available
)

# Client that talks to the server started above.
remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4444L,
  browserName = "chrome"
)

#' Scrape one Power BI table visual into a data frame
#'
#' Rows are read one at a time by their `aria-rowindex`. When a row is not
#' present in the page, the table's scroll button is clicked to make further
#' rows appear; scraping stops at the first row that is still missing.
#'
#' @param container_number Position of the table's `visual-container` element
#'   on the current report page.
#' @param driver An opened RSelenium remote driver. Defaults to the global
#'   `remDr`, so existing calls `scrape_powerbi_table(n)` keep working.
#' @return A data frame with one column per table header and one row per
#'   scraped table row; cell values are the raw `innerHTML` strings.
scrape_powerbi_table <- function(container_number, driver = remDr) {
  table_xpath <- paste0(
    "//*[@id='pvExplorationHost']/div/div/exploration/div/explore-canvas",
    "/div/div[2]/div/div[2]/div[2]/visual-container-repeat/visual-container[",
    container_number, "]/transform/div/div[3]/div/visual-modern"
  )

  # Read the cells of one row; the first cell is dropped, as in the original
  # code (presumably a row-header/selector cell -- TODO confirm).
  # `[-1]` is used instead of `[2:length(x)]`, which would return NA padding
  # for rows with fewer than two cells.
  read_row <- function(row_index) {
    cells <- driver$findElements(
      "xpath",
      paste0(
        table_xpath,
        "/div/div/div[2]/div[1]/div[4]/div/div[@aria-rowindex='",
        row_index, "']/div"
      )
    )
    values <- vapply(
      cells,
      function(x) x$getElementAttribute("innerHTML")[[1]],
      character(1)
    )
    values[-1]
  }

  # A row counts as missing when nothing (or only NA) was read for it.
  is_blank_row <- function(row) {
    length(row) == 0L || all(is.na(row))
  }

  Sys.sleep(1)

  # The scroll button is optional: if it cannot be located, scraping simply
  # stops at the first row that is not already in the page.
  scroll_button <- tryCatch(
    driver$findElement(
      "xpath",
      paste0(table_xpath, "/div/div/div[2]/div[4]/div[2]")
    ),
    error = function(e) NULL
  )
  if (!is.null(scroll_button)) {
    try(driver$mouseMoveToLocation(webElement = scroll_button), silent = TRUE)
  }

  header_cells <- driver$findElements(
    "xpath",
    paste0(table_xpath, "/div/div/div[2]/div[1]/div[2]/div[2]/div/div")
  )
  # Keep only the text that precedes the first nested tag of each header cell.
  col_names <- vapply(
    header_cells,
    function(x) {
      stringr::str_split(x$getElementAttribute("innerHTML")[[1]], "<")[[1]][1]
    },
    character(1)
  )

  df <- data.frame(matrix(ncol = length(col_names), nrow = 0))
  colnames(df) <- col_names

  # Data rows are addressed starting at aria-rowindex 2 (index 1 is
  # presumably the header row -- TODO confirm).
  row_count <- 2
  repeat {
    current_row <- read_row(row_count)
    if (is_blank_row(current_row) && !is.null(scroll_button)) {
      # Scroll and retry once. Any Selenium error yields NULL, which ends the
      # loop below; `break` cannot be called from inside a handler function.
      current_row <- tryCatch(
        {
          for (i in seq_len(10L)) scroll_button$click()
          read_row(row_count)
        },
        error = function(e) NULL
      )
    }
    if (is_blank_row(current_row)) {
      break
    }
    df[nrow(df) + 1L, ] <- current_row
    row_count <- row_count + 1
  }
  df
}

remDr$open()

remDr$navigate("https://app.powerbi.com/view?r=eyJrIjoiOGI5Yzg2MGYtZmNkNy00ZjA5LTlhYTYtZTJjNjg2NTY2YTlmIiwidCI6ImI1NDE0YTdiLTcwYTYtNGUyYi05Yzc0LTM1Yjk0MDkyMjk3MCJ9")
# Give the report time to render before looking for elements.
Sys.sleep(15)
next_button <- remDr$findElement("xpath", "//*[@id='embedWrapperID']/div[2]/logo-bar/div/div/div/logo-bar-navigation/span/button[2]")

# Walk through the report pages; the container numbers identify the table
# visuals on each page.
df1 <- scrape_powerbi_table(8)
remDr$mouseMoveToLocation(webElement = next_button)
next_button$click()
df2 <- scrape_powerbi_table(8)
df3 <- scrape_powerbi_table(9)
remDr$mouseMoveToLocation(webElement = next_button)
next_button$click()
next_button$click()
df4 <- scrape_powerbi_table(5)
df5 <- scrape_powerbi_table(7)
remDr$mouseMoveToLocation(webElement = next_button)
next_button$click()
df6 <- scrape_powerbi_table(9)
df7 <- scrape_powerbi_table(10)
remDr$mouseMoveToLocation(webElement = next_button)
next_button$click()
next_button$click()
df8 <- scrape_powerbi_table(2)
remDr$mouseMoveToLocation(webElement = next_button)
next_button$click()
df9 <- scrape_powerbi_table(5)
df10 <- scrape_powerbi_table(6)

# > df9
#      Zona                      Provincia Total Establecimientos
# 1  ZONA 1                         CARCHI                      8
# 2  ZONA 1                     ESMERALDAS                      6
# 3  ZONA 1                       IMBABURA                     32
# 4  ZONA 1                      SUCUMBIOS                     27
# 5  ZONA 2                           NAPO                      9
# 6  ZONA 2                       ORELLANA                     30
# 7  ZONA 2                      PICHINCHA                     63
# 8  ZONA 3                     CHIMBORAZO                     56
# 9  ZONA 3                       COTOPAXI                     54
# 10 ZONA 3                        PASTAZA                     13
# 11 ZONA 3                     TUNGURAHUA                    122
# 12 ZONA 4                         MANABI                    127
# 13 ZONA 4 SANTO DOMINGO DE LOS TSACHILAS                     49
# 14 ZONA 5                        BOLIVAR                     24
# 15 ZONA 5                      GALAPAGOS                      5
# 16 ZONA 5                         GUAYAS                     27
# 17 ZONA 5                       LOS RIOS                     53
# 18 ZONA 5                    SANTA ELENA                     18
# 19 ZONA 6                          AZUAY                    182
# 20 ZONA 6                          CAÑAR                     35
# 21 ZONA 6                MORONA SANTIAGO                     23
# 22 ZONA 7                         EL ORO                     65
# 23 ZONA 7                           LOJA                     48
# 24 ZONA 7               ZAMORA CHINCHIPE                     16
# 25 ZONA 8                         GUAYAS                     86
# 26 ZONA 9                      PICHINCHA                    309
Run Code Online (Sandbox Code Playgroud)\n

在 Python 中:

\n
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd


def scrape_powerbi_table(visual_container_number):
    """Scrape one Power BI table visual into a DataFrame.

    Rows are read one at a time by their ``aria-rowindex``. When a row is not
    present in the page, the table's scroll button is clicked to make further
    rows appear; scraping stops at the first row that is still missing.

    Uses the module-level ``driver``.

    Args:
        visual_container_number: Position of the table's ``visual-container``
            element on the current report page.

    Returns:
        pandas.DataFrame with one column per table header and one row per
        scraped table row; cell values are the raw ``innerHTML`` strings.

    Raises:
        selenium.common.exceptions.TimeoutException: if the table's scroll
            button does not appear within 10 seconds.
    """
    table_xpath = (
        "//*[@id='pvExplorationHost']/div/div/exploration/div/explore-canvas"
        "/div/div[2]/div/div[2]/div[2]/visual-container-repeat/visual-container["
        + str(visual_container_number)
        + "]/transform/div/div[3]/div/visual-modern"
    )
    scroll_button = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(
            (By.XPATH, table_xpath + "/div/div/div[2]/div[4]/div[2]")
        )
    )
    col_names = [
        cell.text
        for cell in driver.find_elements(
            By.XPATH, table_xpath + "/div/div/div[2]/div[1]/div[2]/div[2]/div/div"
        )
    ]

    def read_row(row_index):
        # The first cell of each row is dropped, as in the original code
        # (presumably a row-header/selector cell -- TODO confirm).
        cells = driver.find_elements(
            By.XPATH,
            table_xpath
            + "/div/div/div[2]/div[1]/div[4]/div/div[@aria-rowindex='"
            + str(row_index)
            + "']/div",
        )
        return [cell.get_attribute("innerHTML") for cell in cells][1:]

    # Collect rows in a list and build the frame once, instead of growing the
    # DataFrame row by row.
    rows = []
    # Data rows are addressed starting at aria-rowindex 2 (index 1 is
    # presumably the header row -- TODO confirm).
    row_count = 2
    while True:
        current_row = read_row(row_count)
        if not current_row:
            # Row not in the page yet: scroll and retry once.
            try:
                for _ in range(10):
                    scroll_button.click()
                current_row = read_row(row_count)
            except Exception:
                break
        if not current_row:
            break
        rows.append(current_row)
        row_count += 1
    return pd.DataFrame(rows, columns=col_names)


driver = webdriver.Chrome()
try:
    driver.get("https://app.powerbi.com/view?r=eyJrIjoiOGI5Yzg2MGYtZmNkNy00ZjA5LTlhYTYtZTJjNjg2NTY2YTlmIiwidCI6ImI1NDE0YTdiLTcwYTYtNGUyYi05Yzc0LTM1Yjk0MDkyMjk3MCJ9")

    next_button = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(
            (
                By.XPATH,
                "//*[@id='embedWrapperID']/div[2]/logo-bar/div/div/div/logo-bar-navigation/span/button[2]",
            )
        )
    )

    # Walk through the report pages; the container numbers identify the table
    # visuals on each page.
    df1 = scrape_powerbi_table(8)
    next_button.click()
    df2 = scrape_powerbi_table(8)
    df3 = scrape_powerbi_table(9)
    next_button.click()
    next_button.click()
    df4 = scrape_powerbi_table(5)
    df5 = scrape_powerbi_table(7)
    next_button.click()
    df6 = scrape_powerbi_table(9)
    df7 = scrape_powerbi_table(10)
    next_button.click()
    next_button.click()
    df8 = scrape_powerbi_table(2)
    next_button.click()
    df9 = scrape_powerbi_table(5)
    df10 = scrape_powerbi_table(6)
finally:
    # Always release the browser, even when scraping fails part-way.
    driver.quit()
Run Code Online (Sandbox Code Playgroud)\n

另外,为了方便起见,这里还有 10 个 csv 格式的文件。

\n

https://mega.nz/folder/LtVDiCyQ#5iW1mkd1VVTmcPApeqfGFA

\n