I am working on a data-extraction task in R. The data is published in a Power BI dashboard, which makes it quite cumbersome to retrieve. I found an approach here:
However, I am not sure how to navigate the page to reach the right components and extract the tables. My code so far:
library(wdman)
library(RSelenium)
library(xml2)
library(selectr)
library(tidyverse)
library(rvest)
# use wdman/RSelenium to start a Selenium server plus a Firefox driver
driver <- rsDriver(
  port = 4445L,
  browser = "firefox"
)
# connect a remote driver to the server started above
remDr <- remoteDriver(port = 4445L, browserName = "firefox")
# open a new browser session
remDr$open()
# navigate to the site you wish to analyze
report_url <- "https://app.powerbi.com/view?r=eyJrIjoiOGI5Yzg2MGYtZmNkNy00ZjA5LTlhYTYtZTJjNjg2NTY2YTlmIiwidCI6ImI1NDE0YTdiLTcwYTYtNGUyYi05Yzc0LTM1Yjk0MDkyMjk3MCJ9"
remDr$navigate(report_url)
# fetch the data
data_table <- read_html(remDr$getPageSource()[[1]]) %>%
  querySelector("div.pivotTable")
The Selenium session works, but I don't know how to get hold of my table:
The blue arrow shows the table I want; after that I need to move to the other pages to extract the remaining tables. But I assume that whatever works on the first page will work the same way on the others.
Thanks a lot!
These tables are a bit tricky because new rows only appear in the page source once you activate them by scrolling. My solution is a function that scrapes the table row by row, appending each row to an overall data frame and scrolling when necessary. It takes the visual-container number of the Power BI table as input.
The solution in R:
library(wdman)
library(RSelenium)

# start a Selenium server via wdman
selServ <- selenium(
  port = 4444L,
  version = 'latest',
  chromever = '103.0.5060.134' # set to an available chromedriver version
)

remDr <- remoteDriver(
  remoteServerAddr = 'localhost',
  port = 4444L,
  browserName = 'chrome'
)

# scrape one Power BI table, identified by its visual-container number
scrape_powerbi_table <- function(container_number) {
  table_xpath <- paste("//*[@id='pvExplorationHost']/div/div/exploration/div/explore-canvas/div/div[2]/div/div[2]/div[2]/visual-container-repeat/visual-container[",
                       container_number, "]/transform/div/div[3]/div/visual-modern", sep = "")
  Sys.sleep(1)
  # grab the table's scroll handle, if it has one
  try({
    scroll_button <- remDr$findElement("xpath", paste(table_xpath, "/div/div/div[2]/div[4]/div[2]", sep = ""))
    remDr$mouseMoveToLocation(webElement = scroll_button)
  }, silent = TRUE)
  # column headers
  col_names <- remDr$findElements("xpath", paste(table_xpath, "/div/div/div[2]/div[1]/div[2]/div[2]/div/div", sep = ""))
  col_names <- vapply(col_names, function(x) stringr::str_split(x$getElementAttribute('innerHTML')[[1]], "<")[[1]][1], character(1))
  df <- data.frame(matrix(ncol = length(col_names), nrow = 0))
  colnames(df) <- col_names
  more_rows_left <- TRUE
  row_count <- 2
  while (more_rows_left == TRUE) {
    data <- remDr$findElements("xpath", paste(table_xpath, "/div/div/div[2]/div[1]/div[4]/div/div[@aria-rowindex='", row_count, "']/div", sep = ""))
    current_row <- vapply(data, function(x) x$getElementAttribute('innerHTML')[[1]], character(1))
    current_row <- current_row[2:length(current_row)]
    if (length(current_row) == 0 | all(is.na(current_row))) {
      # row not rendered yet: scroll down and retry;
      # stop if scrolling fails (e.g. the table has no scroll handle)
      scrolled <- tryCatch({
        for (i in seq(10)) scroll_button$click()
        data <- remDr$findElements("xpath", paste(table_xpath, "/div/div/div[2]/div[1]/div[4]/div/div[@aria-rowindex='", row_count, "']/div", sep = ""))
        current_row <- vapply(data, function(x) x$getElementAttribute('innerHTML')[[1]], character(1))
        current_row <- current_row[2:length(current_row)]
        TRUE
      }, error = function(e) FALSE)
      if (!scrolled) break
    }
    if (length(current_row) == 0 | all(is.na(current_row))) break
    df[nrow(df) + 1, ] <- current_row
    row_count <- row_count + 1
  }
  df
}

remDr$open()

remDr$navigate("https://app.powerbi.com/view?r=eyJrIjoiOGI5Yzg2MGYtZmNkNy00ZjA5LTlhYTYtZTJjNjg2NTY2YTlmIiwidCI6ImI1NDE0YTdiLTcwYTYtNGUyYi05Yzc0LTM1Yjk0MDkyMjk3MCJ9")
Sys.sleep(15)
# button that moves to the next report page
next_button <- remDr$findElement("xpath", "//*[@id='embedWrapperID']/div[2]/logo-bar/div/div/div/logo-bar-navigation/span/button[2]")

df1 <- scrape_powerbi_table(8)
remDr$mouseMoveToLocation(webElement = next_button)
next_button$click()
df2 <- scrape_powerbi_table(8)
df3 <- scrape_powerbi_table(9)
remDr$mouseMoveToLocation(webElement = next_button)
next_button$click()
next_button$click()
df4 <- scrape_powerbi_table(5)
df5 <- scrape_powerbi_table(7)
remDr$mouseMoveToLocation(webElement = next_button)
next_button$click()
df6 <- scrape_powerbi_table(9)
df7 <- scrape_powerbi_table(10)
remDr$mouseMoveToLocation(webElement = next_button)
next_button$click()
next_button$click()
df8 <- scrape_powerbi_table(2)
remDr$mouseMoveToLocation(webElement = next_button)
next_button$click()
df9 <- scrape_powerbi_table(5)
df10 <- scrape_powerbi_table(6)

# > df9
#    Zona Provincia Total Establecimientos
# 1  ZONA 1 CARCHI 8
# 2  ZONA 1 ESMERALDAS 6
# 3  ZONA 1 IMBABURA 32
# 4  ZONA 1 SUCUMBIOS 27
# 5  ZONA 2 NAPO 9
# 6  ZONA 2 ORELLANA 30
# 7  ZONA 2 PICHINCHA 63
# 8  ZONA 3 CHIMBORAZO 56
# 9  ZONA 3 COTOPAXI 54
# 10 ZONA 3 PASTAZA 13
# 11 ZONA 3 TUNGURAHUA 122
# 12 ZONA 4 MANABI 127
# 13 ZONA 4 SANTO DOMINGO DE LOS TSACHILAS 49
# 14 ZONA 5 BOLIVAR 24
# 15 ZONA 5 GALAPAGOS 5
# 16 ZONA 5 GUAYAS 27
# 17 ZONA 5 LOS RIOS 53
# 18 ZONA 5 SANTA ELENA 18
# 19 ZONA 6 AZUAY 182
# 20 ZONA 6 CAÑAR 35
# 21 ZONA 6 MORONA SANTIAGO 23
# 22 ZONA 7 EL ORO 65
# 23 ZONA 7 LOJA 48
# 24 ZONA 7 ZAMORA CHINCHIPE 16
# 25 ZONA 8 GUAYAS 86
# 26 ZONA 9 PICHINCHA 309
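The visual-container numbers hard-coded above (8, 9, 5, and so on) are specific to this particular report and to whichever report page is currently shown. If you adapt the function to another dashboard, one way to find the right number is to enumerate the visual containers on the current page and look at the text each one exposes. A minimal sketch, assuming the same remDr session with the report already loaded:

# list the visual containers on the current report page and print a short
# text preview for each, so an index can be matched to the table you want
containers <- remDr$findElements(
  "xpath",
  "//*[@id='pvExplorationHost']//visual-container-repeat/visual-container"
)
for (i in seq_along(containers)) {
  preview <- tryCatch(containers[[i]]$getElementText()[[1]], error = function(e) "")
  cat(i, ": ", substr(gsub("\\s+", " ", preview), 1, 60), "\n", sep = "")
}

A container whose preview shows the column headers you are after is a candidate index for scrape_powerbi_table().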
In Python:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

driver = webdriver.Chrome()

driver.get("https://app.powerbi.com/view?r=eyJrIjoiOGI5Yzg2MGYtZmNkNy00ZjA5LTlhYTYtZTJjNjg2NTY2YTlmIiwidCI6ImI1NDE0YTdiLTcwYTYtNGUyYi05Yzc0LTM1Yjk0MDkyMjk3MCJ9")

def scrape_powerbi_table(visual_container_number):
    table_xpath = "//*[@id='pvExplorationHost']/div/div/exploration/div/explore-canvas/div/div[2]/div/div[2]/div[2]/visual-container-repeat/visual-container[" + str(visual_container_number) + "]/transform/div/div[3]/div/visual-modern"
    scroll_button = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, table_xpath + "/div/div/div[2]/div[4]/div[2]")))
    col_names = [i.text for i in driver.find_elements(By.XPATH, table_xpath + "/div/div/div[2]/div[1]/div[2]/div[2]/div/div")]
    df = pd.DataFrame(columns = col_names)
    more_rows_left = True
    row_count = 2
    while more_rows_left == True:
        data = driver.find_elements(By.XPATH, table_xpath + "/div/div/div[2]/div[1]/div[4]/div/div[@aria-rowindex='" + str(row_count) + "']/div")
        current_row = [i.get_attribute("innerHTML") for i in data][1:]
        if not current_row:
            # row not rendered yet: scroll down and retry
            try:
                for i in range(10):
                    scroll_button.click()
                data = driver.find_elements(By.XPATH, table_xpath + "/div/div/div[2]/div[1]/div[4]/div/div[@aria-rowindex='" + str(row_count) + "']/div")
                current_row = [i.get_attribute("innerHTML") for i in data][1:]
            except Exception:
                break
        if not current_row:
            break
        df.loc[len(df)] = current_row
        row_count += 1
    return df

next_button = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//*[@id='embedWrapperID']/div[2]/logo-bar/div/div/div/logo-bar-navigation/span/button[2]")))

df1 = scrape_powerbi_table(8)
next_button.click()
df2 = scrape_powerbi_table(8)
df3 = scrape_powerbi_table(9)
next_button.click()
next_button.click()
df4 = scrape_powerbi_table(5)
df5 = scrape_powerbi_table(7)
next_button.click()
df6 = scrape_powerbi_table(9)
df7 = scrape_powerbi_table(10)
next_button.click()
next_button.click()
df8 = scrape_powerbi_table(2)
next_button.click()
df9 = scrape_powerbi_table(5)
df10 = scrape_powerbi_table(6)

driver.quit()

Also, for convenience, here are the same 10 tables as csv files:
https://mega.nz/folder/LtVDiCyQ#5iW1mkd1VVTmcPApeqfGFA
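If you prefer to export the scraped tables yourself rather than download them, here is a minimal sketch (assuming df1 through df10 from the R session above) that writes each data frame to its own csv file:

# collect the scraped data frames by name and write each one to <name>.csv
dfs <- mget(paste0("df", 1:10))
for (nm in names(dfs)) {
  write.csv(dfs[[nm]], paste0(nm, ".csv"), row.names = FALSE)
}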