使用 Python 和 Selenium 抓取网页时打印出的 PDF 为空

Question

使用 Python 和 Selenium 抓取网页时打印出的 PDF 为空

jat*_*ani 1 python webdriver web-scraping pandas selenium-chromedriver

我在尝试使用 Python 中的 Selenium 从网页打印 PDF 时遇到问题。有问题的网页是 https://jamabandi.nic.in/land%20records/NakalRecord。我尝试从每个下拉列表中选择第一条记录，然后单击“Nakal”按钮生成 PDF。

但是，即使网页上存在表格，生成的 PDF 始终为空。我尝试过手动打印到 PDF 操作和使用 Selenium 自动打印，但在这两种情况下，生成的 PDF 都是空的。

# Script: open the Jamabandi "Nakal" page, pick one entry from each dropdown,
# follow the resulting record link and send the opened window to Chrome's
# print-to-PDF.
import json
import urllib.request

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager

# Set up preferences for printing to PDF
settings = {
    "recentDestinations": [{"id": "Save as PDF", "origin": "local", "account": ""}],
    "selectedDestinationId": "Save as PDF",
    "version": 2
}
prefs = {
    'printing.print_preview_sticky_settings.appState': json.dumps(settings),
    'printing.print_to_file': True,
    'printing.print_to_file.path': '/Users/jatin/Downloads/output.pdf'  # Specify the desired output path
}

# Set up Chrome options. The options object must exist before the prefs are
# attached to it, and this same object must be the one handed to the driver;
# otherwise the kiosk-printing flag and the PDF prefs are silently ignored.
chrome_options = Options()
# chrome_options.add_argument('--headless')  # Optional: Run Chrome in headless mode
chrome_options.add_argument('--kiosk-printing')
chrome_options.add_experimental_option('prefs', prefs)

try:
    service = Service(ChromeDriverManager().install())
except ValueError:
    # Fallback: ask the chromedriver storage endpoint for the latest release
    # and request that version explicitly.
    latest_chromedriver_version_url = "https://chromedriver.storage.googleapis.com/LATEST_RELEASE"
    with urllib.request.urlopen(latest_chromedriver_version_url) as response:
        latest_chromedriver_version = response.read().decode('utf-8')
    service = Service(ChromeDriverManager(version=latest_chromedriver_version).install())

url = 'https://jamabandi.nic.in/land%20records/NakalRecord'
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.get(url)

# Index 1 is used for each dropdown (index 0 is presumably a placeholder
# entry -- confirm against the live page).
dropdown_district = Select(driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_ddldname"]'))
dropdown_district.select_by_index(1)
# Select the tehsil dropdown element and choose the first option,we will loop here for multiple anchals
drop_down_tehsil = Select(driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_ddltname"]'))
drop_down_tehsil.select_by_index(1)
drop_down_vill = Select(driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_ddlvname"]'))
drop_down_vill.select_by_index(1)
drop_down_year = Select(driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_ddlPeriod"]'))
drop_down_year.select_by_index(1)
drop_down_owner = Select(driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_ddlOwner"]'))
drop_down_owner.select_by_index(1)
owner_names = Select(driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_ListBox1"]'))
owner_names.select_by_index(2)

# Parse the rendered page and locate the container holding the record links.
page_source = BeautifulSoup(driver.page_source, 'html.parser')
div_col_lg_12 = page_source.find('div', class_='col-lg-12')
if div_col_lg_12 is None:
    raise RuntimeError("No <div class='col-lg-12'> found on the page")

# Walk every table cell in the container and remember the href of the first
# anchor in each cell; the last cell that contains an anchor wins.
new_link = None
for cell in div_col_lg_12.find_all('td'):
    anchors = cell.find_all('a')
    if anchors:
        new_link = anchors[0]['href']
if new_link is None:
    raise RuntimeError("No link found inside the 'col-lg-12' container")

# The href is run as a script (presumably a "javascript:" postback URL --
# confirm against the page markup).
javascript_code = str(new_link)

# Execute the JavaScript code
driver.execute_script(javascript_code)

# Switch to the most recently opened window before printing.
window_handles = driver.window_handles
driver.switch_to.window(window_handles[-1])

# Open the print dialog using JavaScript
driver.execute_script('window.print();')

Run Code Online (Sandbox Code Playgroud)

Answer 1

Cor*_*ien 6

生成的 PDF 之所以是空的，是因为该页面的文档中包含以下样式：

<style>
    @media print
    {
        html,body
        {
            display:none;
        }
    }
</style>

Run Code Online (Sandbox Code Playgroud)

因此文档内容在打印时是隐藏的。您需要先删除这个 <style> 标签，然后再打印：

# Remove the first <style> element in the document (per the surrounding
# explanation, the one carrying the "@media print { display: none }" rule),
# then trigger printing.
driver.execute_script("document.querySelector('style').remove()")
driver.execute_script("window.print()")

Run Code Online (Sandbox Code Playgroud)

编辑：

要直接从页面源码中提取数据，可以使用 pd.read_html：

import io

import pandas as pd

# Wrap the page source in a file-like object before handing it to read_html.
page_source = io.StringIO(driver.page_source)
# read_html returns a list of DataFrames; keep the first table whose id
# attribute is "GridView1".
df = pd.read_html(page_source, attrs={'id': 'GridView1'})[0]

Run Code Online (Sandbox Code Playgroud)

输出：

>>> df
    खेवट या जमाबंदी न.  खतौनी न.  ... माल और सवाई के ब्यौरे सहित मांग            अभियुक्ति
0                    7      10.0  ...                             NaN        बरूऎ रपट न. 1
1                   //       NaN  ...                             NaN  तिथी 26-09-2012 रजि
2                    7       NaN  ...                             NaN          न.4173 तिथी
3                  NaN       NaN  ...                             NaN   24/09/2012 4:27:00
4                  NaN       NaN  ...                             NaN     PM के अनुसार मिन
..                 ...       ...  ...                             ...                  ...
124                NaN       NaN  ...                             NaN                  NaN
125                NaN       NaN  ...                             NaN                  NaN
126                NaN       NaN  ...                             NaN                  NaN
127                NaN       NaN  ...                             NaN                  NaN
128                NaN       NaN  ...                             NaN                  NaN

[129 rows x 12 columns]

Run Code Online (Sandbox Code Playgroud)

归档时间：	1 年，11 月前
查看次数：	329 次
最近记录：	1 年，10 月前