使用PyQt/QtWebKit抓取多个链接的网页

tch*_*ore 2 python qt qt4 web-scraping

我正在尝试抓取一个大型政府记录网站,它需要采用"滚雪球"的方法,即从主搜索页面开始,然后跟随爬虫找到的每个链接进入下一页.

按照这个SiteScraper教程,我已经能够使用PyQt加载主页.

import sys
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *
from BeautifulSoup import BeautifulSoup

class Render(QWebPage):
      """Load one URL in a QWebPage and block until loading has finished.

      Once the constructor returns, the loaded frame is stored in
      ``self.frame``.
      """

      def __init__(self, url):
           """Create a QApplication, load ``url`` and run the event loop.

           NOTE(review): a new QApplication is constructed on every
           instantiation. Creating a second Render in the same process is
           what produces "RuntimeError: A QApplication instance already
           exists."
           """
           self.app = QApplication(sys.argv)
           QWebPage.__init__(self)
           # Connect the slot before starting the load.
           self.loadFinished.connect(self._loadFinished)
           self.mainFrame().load(QUrl(url))
           # Blocks here until _loadFinished calls self.app.quit().
           self.app.exec_()

      def _loadFinished(self, result):
           """Slot for loadFinished: keep the main frame and stop the loop.

           ``result`` is received from the signal but not inspected.
           """
           self.frame = self.mainFrame()
           self.app.quit()

def main():
    """Render the search page, then render every page it links to.

    The search page is loaded with Render, the anchors inside the
    ``x-grid3-body`` div are collected with BeautifulSoup, and each linked
    page is then loaded with a further Render call.
    """
    baseUrl = 'http://www.thesite.gov'
    url = 'http://www.thesite.gov/search'
    r = Render(url)
    html = r.frame.toHtml()
    # use BeautifulSoup to cycle through each regulation
    soup = BeautifulSoup(html)

    # Everything below uses main()'s locals (soup, baseUrl), so it has to
    # live inside the function body; at module level it raises NameError.
    regs = soup.find('div', {'class': 'x-grid3-body'}).findAll('a')

    # cycle through list and call up each page separately
    for reg in regs:
        # Prefix the href with the site root and pass Render a plain str.
        link = str(baseUrl + reg['href'])
        # use Qt to load each regulation page
        r = Render(link)

        html = r.frame.toHtml()  # get actual rendered web page
Run Code Online (Sandbox Code Playgroud)

问题是当我尝试渲染新网页时出现此错误:

RuntimeError: A QApplication instance already exists.
Run Code Online (Sandbox Code Playgroud)

我知道这是因为该函数试图再创建一个QApplication实例.但是如何使用同一个实例导航到新页面?


class Render(QWebPage):
    """Load one URL in a QWebPage using an existing QApplication.

    The constructor blocks in ``app.exec_()`` until the page has finished
    loading; afterwards the loaded frame is stored in ``self.frame``.
    """

    def __init__(self, app, url):
        """Load ``url`` and run ``app``'s event loop until loading finishes.

        Args:
            app: the already-created QApplication shared by all Render
                instances.
            url: address of the page to load.
        """
        QWebPage.__init__(self)
        # Keep the application so _loadFinished can stop its event loop;
        # without that, exec_() below would never return.
        self.app = app
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        app.exec_()

    def _loadFinished(self, result):
        """Slot for loadFinished: keep the main frame and leave exec_()."""
        self.frame = self.mainFrame()
        self.app.quit()

def main():
    """Create the single QApplication and render the search page with it."""
    # One application object for the whole run, handed to Render instead
    # of letting Render construct its own.
    app = QApplication(sys.argv)

    baseUrl = 'http://www.thesite.gov'
    url = baseUrl + '/search'

    page = Render(app, url)
    html = page.frame.toHtml()
Run Code Online (Sandbox Code Playgroud)

小智 6

我遇到了同样的问题(需要用QWebPage加载多个页面),但这些答案对我都不起作用.下面是对我有效的做法,关键是使用QEventLoop并将loadFinished连接到loop.quit:

from PySide import QtCore, QtGui, QtWebKit
import sys

def loadPage(url):
    """Load ``url`` in a fresh QWebPage and return the main frame's HTML.

    Blocks in a local QEventLoop until the main frame emits loadFinished.
    """
    web_page = QtWebKit.QWebPage()
    main_frame = web_page.mainFrame()
    wait_loop = QtCore.QEventLoop()
    # Leave the local loop as soon as the frame reports loadFinished.
    main_frame.loadFinished.connect(wait_loop.quit)
    main_frame.load(url)
    wait_loop.exec_()
    return main_frame.toHtml()

# A single QApplication must exist before any QWebPage is created; every
# loadPage() call below shares it.
app = QtGui.QApplication(sys.argv)

urls = ['https://google.com', 'http://reddit.com', 'http://wikipedia.org']
for url in urls:
    # Single-argument print(...) produces the same output on Python 2 and
    # Python 3, unlike the print statement, which only parses on Python 2.
    print('-----------------------------------------------------')
    print('Loading ' + url)
    html = loadPage(url)
    print(html)

app.exit()
Run Code Online (Sandbox Code Playgroud)

与OP相比,这里发布了一个简化示例,以演示基本问题和解决方案.