如何使用PyQt5的QWebEngineView"渲染"HTML

Six*_*Six 10 python pyqt pyqt5

如何使用PyQt5 v5.6 QWebEngineView"渲染"HTML?

我之前使用PyQt5 v5.4.1 QWebPage执行了该任务,但建议尝试使用较新的QWebEngineView.

这是当时的实现(它通常能按预期工作,但在某些站点和情况下会无限期挂起):

def render(source_html):
    """Load ``source_html`` into a QtWebKit page and return the page's HTML.

    The markup is set on the page's main frame and a Qt event loop is run
    until the ``loadFinished`` signal fires; at that point the main frame is
    serialised with ``toHtml()`` and the event loop is stopped.

    Args:
        source_html: HTML markup to load.

    Returns:
        The string produced by ``mainFrame().toHtml()`` after loading.
    """
    import sys
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebKitWidgets import QWebPage

    class _PageRenderer(QWebPage):
        """Web page that records its own HTML once loading has finished."""

        def __init__(self, markup):
            # Create the application object before initialising the Qt base.
            self.app = QApplication(sys.argv)
            super().__init__()
            self.html = None
            self.loadFinished.connect(self._on_load_finished)
            self.mainFrame().setHtml(markup)
            # Blocks until _on_load_finished() asks the application to quit.
            self.app.exec_()

        def _on_load_finished(self, ok):
            # The frame is serialised regardless of the reported load status.
            self.html = self.mainFrame().toHtml()
            self.app.quit()

    renderer = _PageRenderer(source_html)
    return renderer.html

import requests
# NOTE: dummy_url is not defined in this snippet; presumably a placeholder
# for the address of the page to fetch.
sample_html = requests.get(dummy_url).text
print(render(sample_html))
Run Code Online (Sandbox Code Playgroud)

以下是我尝试使用QWebEngineView.首先,在Ubuntu上安装和设置PyQt5 v5.6:

# Install the PyQt5 v5.6 wheel from PyPI. The version is pinned because the
# resource links created below were written for this release.
pip3 install --user pyqt5==5.6

# Link the resources that the wheel does not place in libexec/. The targets
# are relative paths, so they are resolved from the libexec/ directory.
ln -s ../resources/icudtl.dat ../resources/qtwebengine_resources.pak ../resources/qtwebengine_resources_100p.pak ../resources/qtwebengine_resources_200p.pak ../translations/qtwebengine_locales ~/.local/lib/python3.5/site-packages/PyQt5/Qt/libexec/
Run Code Online (Sandbox Code Playgroud)

现在对于Python ...以下导致分段错误:

# NOTE: this is the failing attempt; it is reported to end in a
# segmentation fault.
def render(source_html):
    """Fully render HTML, JavaScript and all."""

    import sys
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebEngineWidgets import QWebEngineView

    class Render(QWebEngineView):
        def __init__(self, html):
            # Stays None until callable() has been invoked with the page HTML.
            self.html = None
            self.app = QApplication(sys.argv)
            QWebEngineView.__init__(self)
            self.loadFinished.connect(self._loadFinished)
            self.setHtml(html)
            # Runs the event loop until quit() is called in _loadFinished().
            self.app.exec_()

        def _loadFinished(self, result):
            # what's going on here? how can I get the HTML from toHtml?
            # NOTE(review): toHtml() takes a callback instead of returning the
            # HTML, and quit() is requested right after the call is issued --
            # apparently before the callback has stored anything.
            self.page().toHtml(self.callable)
            self.app.quit()

        def callable(self, data):
            # Receives the page HTML from toHtml().
            self.html = data

    return Render(source_html).html

import requests
# NOTE: dummy_url is not defined in this snippet; presumably a placeholder
# for the address of the page to fetch.
sample_html = requests.get(dummy_url).text
print(render(sample_html))
Run Code Online (Sandbox Code Playgroud)

问题似乎在于toHtml()是异步调用.它看起来应该相当简单,但我不知道该如何处理.我看到它已在C++的上下文中讨论过,但我不确定如何将其转换为Python.如何获取HTML?

Six*_*Six 8

以下邮件列表主题中对该问题进行了相当多的讨论:https://riverbankcomputing.com/pipermail/pyqt/2015-January/035324.html

新的QWebEngine接口考虑了底层Chromium引擎是异步的这一事实.因此,我们必须将异步API转换为同步API.

这是看起来如何:

def render(source_html):
    """Render ``source_html`` with QtWebEngine and return the resulting HTML.

    ``QWebEnginePage.toHtml()`` hands its result to a callback instead of
    returning it, so this function processes Qt events until that callback
    has stored the HTML, which turns the asynchronous API into a blocking
    call.

    Args:
        source_html: HTML markup to load into the web view.

    Returns:
        The HTML string passed to the ``toHtml()`` callback.
    """
    import sys
    from PyQt5.QtCore import QEventLoop
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebEngineWidgets import QWebEngineView

    class Render(QWebEngineView):
        """Web view that loads markup and captures the page's HTML."""

        def __init__(self, html):
            # Reuse the process's QApplication when one already exists;
            # constructing a second instance is not supported by Qt.
            app = QApplication.instance()
            if app is None:
                app = QApplication(sys.argv)
            self.app = app
            super().__init__()
            self.html = None
            self.loadFinished.connect(self._loadFinished)
            self.setHtml(html)
            wait_flags = (
                QEventLoop.ExcludeUserInputEvents
                | QEventLoop.ExcludeSocketNotifiers
                | QEventLoop.WaitForMoreEvents
            )
            # Pump events until _callable() has delivered the HTML. No
            # exec_() loop is started here, so there is nothing to quit
            # afterwards (and a shared application must not be quit).
            while self.html is None:
                self.app.processEvents(wait_flags)

        def _callable(self, data):
            # Invoked by toHtml() with the page's HTML.
            self.html = data

        def _loadFinished(self, result):
            # The HTML is requested regardless of the reported load status.
            self.page().toHtml(self._callable)

    return Render(source_html).html

import requests
# NOTE: dummy_url is not defined in this snippet; presumably a placeholder
# for the address of the page to fetch.
sample_html = requests.get(dummy_url).text
print(render(sample_html))
Run Code Online (Sandbox Code Playgroud)


小智 6

Six&Veehmot的答案很好,但我发现就我的目的而言这还不够,因为它没有展开我要抓取的页面中的下拉菜单元素。稍作修改(直接加载URL,而不是传入已下载的HTML)即可解决此问题:

def render(url):
    """Load ``url`` in QtWebEngine and return the rendered page's HTML.

    The page is loaded by the web view itself rather than from downloaded
    markup. ``QWebEnginePage.toHtml()`` hands its result to a callback, so
    this function processes Qt events until that callback has stored the
    HTML.

    Args:
        url: Address of the page to load; it is passed to ``QUrl``.

    Returns:
        The HTML string passed to the ``toHtml()`` callback.
    """
    import sys
    from PyQt5.QtCore import QEventLoop, QUrl
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebEngineWidgets import QWebEngineView

    class Render(QWebEngineView):
        """Web view that loads a URL and captures the page's HTML."""

        def __init__(self, url):
            # Reuse the process's QApplication when one already exists;
            # constructing a second instance is not supported by Qt.
            app = QApplication.instance()
            if app is None:
                app = QApplication(sys.argv)
            self.app = app
            super().__init__()
            self.html = None
            self.loadFinished.connect(self._loadFinished)
            self.load(QUrl(url))
            wait_flags = (
                QEventLoop.ExcludeUserInputEvents
                | QEventLoop.ExcludeSocketNotifiers
                | QEventLoop.WaitForMoreEvents
            )
            # Pump events until _callable() has delivered the HTML. No
            # exec_() loop is started here, so there is nothing to quit
            # afterwards (and a shared application must not be quit).
            while self.html is None:
                self.app.processEvents(wait_flags)

        def _callable(self, data):
            # Invoked by toHtml() with the page's HTML.
            self.html = data

        def _loadFinished(self, result):
            # The HTML is requested regardless of the reported load status.
            self.page().toHtml(self._callable)

    return Render(url).html


# NOTE: dummy_url is not defined in this snippet; presumably a placeholder
# for the address of the page to render.
print(render(dummy_url))
Run Code Online (Sandbox Code Playgroud)


Jor*_*jon 5

正如您所指出的,Qt5.4依赖于异步调用。不必使用循环(如您的答案所示),因为您唯一的错误是在toHtml调用完成之前就调用了quit。

def render(source_html):
    """Render ``source_html`` with QtWebEngine and return the resulting HTML.

    ``QWebEnginePage.toHtml()`` hands its result to a callback instead of
    returning it. The event loop started here is therefore stopped from
    inside that callback, after the HTML has been stored, and not from the
    ``loadFinished`` handler.

    Args:
        source_html: HTML markup to load into the web view.

    Returns:
        The HTML string passed to the ``toHtml()`` callback.
    """
    import sys
    from PyQt5.QtCore import QEventLoop
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebEngineWidgets import QWebEngineView

    class Render(QWebEngineView):
        """Web view that loads markup and captures the page's HTML."""

        def __init__(self, html):
            # Reuse the process's QApplication when one already exists;
            # constructing a second instance is not supported by Qt.
            app = QApplication.instance()
            if app is None:
                app = QApplication(sys.argv)
            self.app = app
            super().__init__()
            self.html = None
            # A private loop is used so that stopping it does not quit an
            # application-wide event loop owned by the caller.
            self._loop = QEventLoop()
            self.loadFinished.connect(self._loadFinished)
            self.setHtml(html)
            # Blocks until _store_html() stops the loop.
            self._loop.exec_()

        def _loadFinished(self, result):
            # This is an async call, you need to wait for the callback
            # to run before leaving the event loop.
            self.page().toHtml(self._store_html)

        def _store_html(self, data):
            self.html = data
            # Data has been stored, it's safe to stop waiting.
            self._loop.quit()

    return Render(source_html).html

import requests
# NOTE: dummy_url is not defined in this snippet; presumably a placeholder
# for the address of the page to fetch.
sample_html = requests.get(dummy_url).text
print(render(sample_html))
Run Code Online (Sandbox Code Playgroud)