如何将抓取的项目放入 Pyqt5 小部件中?

Art*_*iom 3 python pyqt scrapy pyqt5

我正在尝试为 Scrapy 爬虫制作一个简单的 GUI,用户可以按“开始”按钮来运行抓取并在 textBrowser (或其他 qt 小部件,请告知)中查看抓取的结果。

我的蜘蛛:

import scrapy, json


class CarSpider(scrapy.Spider):
    """Spider named ``car`` that yields one dict per entry of a JSON ``items`` list.

    NOTE(review): this listing is a sketch -- ``url``, ``make``, ``model`` and
    ``year`` are not defined in the visible code and the body of ``parse`` is
    elided, so it does not run as written.
    """

    name = 'car'
    start_urls = ["https://www.target-website.com/"]

    def parse(self, response):
        # The string below is a placeholder for code that was left out of the
        # listing; presumably it computes ``url`` -- TODO confirm.
        """some code """
            # Schedule a follow-up request whose response goes to parse_page().
            yield scrapy.Request(url=url, callback=self.parse_page)

    def parse_page(self, response):
        """Decode the response body as JSON and yield one dict per item."""
        # NOTE(review): body_as_unicode() looks deprecated in recent Scrapy
        # releases in favour of ``response.text`` -- confirm against the
        # installed version.
        items = json.loads(response.body_as_unicode())['items']
        for i in items:
            # Elided code; presumably extracts make/model/year from ``i``.
            ...
            scraped_item = {
                'Make': make,
                'Model': model,
                'Year': year,                    
            }
            yield scraped_item
Run Code Online (Sandbox Code Playgroud)

应用程序设计是在 Qt Designer 中完成的:

图形用户界面:

from PyQt5 import QtCore, QtGui, QtWidgets


class Ui_MainWindow(object):
    """Widget layout for the scraper window (designed in Qt Designer).

    After ``setupUi`` the instance exposes ``centralwidget``, ``pushButton``,
    ``pushButton_2``, ``textBrowser`` and ``statusbar``.
    """

    def setupUi(self, MainWindow):
        """Create the child widgets on ``MainWindow`` and keep references on ``self``."""
        fixed = QtWidgets.QSizePolicy.Fixed

        # Main window: fixed size policy in both directions.
        MainWindow.setObjectName("MainWindow")
        MainWindow.resize(801, 612)
        window_policy = QtWidgets.QSizePolicy(fixed, fixed)
        window_policy.setHorizontalStretch(0)
        window_policy.setVerticalStretch(0)
        window_policy.setHeightForWidth(MainWindow.sizePolicy().hasHeightForWidth())
        MainWindow.setSizePolicy(window_policy)

        # Central container that parents the buttons and the text browser.
        central = self.centralwidget = QtWidgets.QWidget(MainWindow)
        central_policy = QtWidgets.QSizePolicy(fixed, fixed)
        central_policy.setHorizontalStretch(0)
        central_policy.setVerticalStretch(0)
        central_policy.setHeightForWidth(central.sizePolicy().hasHeightForWidth())
        central.setSizePolicy(central_policy)
        central.setObjectName("centralwidget")

        # First button (labelled "Run Scraper" in retranslateUi).
        run_button = self.pushButton = QtWidgets.QPushButton(central)
        run_button.setGeometry(QtCore.QRect(10, 10, 211, 41))
        run_button.setObjectName("pushButton")

        # Second button (labelled "Stop"); created disabled.
        stop_button = self.pushButton_2 = QtWidgets.QPushButton(central)
        stop_button.setEnabled(False)
        stop_button.setGeometry(QtCore.QRect(10, 60, 211, 41))
        stop_button.setObjectName("pushButton_2")

        # Text area to the right of the buttons.
        output_view = self.textBrowser = QtWidgets.QTextBrowser(central)
        output_view.setGeometry(QtCore.QRect(240, 10, 551, 571))
        output_view.setObjectName("textBrowser")

        MainWindow.setCentralWidget(central)

        status_bar = self.statusbar = QtWidgets.QStatusBar(MainWindow)
        status_bar.setObjectName("statusbar")
        MainWindow.setStatusBar(status_bar)

        self.retranslateUi(MainWindow)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)

    def retranslateUi(self, MainWindow):
        """Set the window title and the button captions through Qt's translate()."""
        tr = QtCore.QCoreApplication.translate
        context = "MainWindow"
        MainWindow.setWindowTitle(tr(context, "MainWindow"))
        self.pushButton.setText(tr(context, "Run Scraper"))
        self.pushButton_2.setText(tr(context, "Stop"))
Run Code Online (Sandbox Code Playgroud)

这是我尝试编写的处理数据的代码:

数据处理程序.py:

from PyQt5 import QtWidgets
from PyQt5.QtCore import pyqtSignal, QThread
from my_gui import Ui_MainWindow 
import sys, os 
import subprocess


class SpiderThread(QThread):
    """Worker thread that runs ``scrapy crawl car`` and streams its output.

    Signals:
        signal: emitted with no arguments once the child process has exited.
        output_signal: emitted with one ``bytes`` object per output line
            (line terminator removed) while the process is still running.

    Attributes:
        proc_id: PID of the running child process, or ``None`` when no child
            is running.
    """

    signal = pyqtSignal()
    output_signal = pyqtSignal('PyQt_PyObject')

    def __init__(self):
        QThread.__init__(self)
        # Define the attribute up front so reading it before run() has
        # spawned the process cannot raise AttributeError.
        self.proc_id = None

    def __del__(self):
        self.wait()

    def run(self):
        """Spawn the crawler and forward each output line as it is produced."""
        # EAFP: remove the previous result file without an exists()/remove() race.
        try:
            os.remove('result.csv')
        except FileNotFoundError:
            pass

        cmd = "scrapy crawl car"
        proc = subprocess.Popen(
            cmd.split(),
            stdout=subprocess.PIPE,
            # Merge stderr into stdout so a single pipe carries everything and
            # a full, unread stderr pipe can never stall the child.
            stderr=subprocess.STDOUT,
            # The child gets no input; avoid leaving an open, unused pipe.
            stdin=subprocess.DEVNULL,
        )
        self.proc_id = proc.pid
        print(self.proc_id)

        # Read line by line instead of communicate(), which only returns after
        # the process has exited and therefore cannot show live progress.
        with proc.stdout:
            for line in proc.stdout:
                self.output_signal.emit(line.rstrip(b'\r\n'))
        proc.wait()

        # The PID is no longer ours once the child has been reaped.
        self.proc_id = None
        self.signal.emit()


class mywindow(QtWidgets.QMainWindow):
    """Main window that starts and stops the crawl thread and shows its output."""

    def __init__(self):
        super(mywindow, self).__init__()
        self.ui = Ui_MainWindow()
        self.ui.setupUi(self)
        self.ui.pushButton.clicked.connect(self.slot_method)

        self.crawling_thread = SpiderThread()
        self.crawling_thread.signal.connect(self.finished)
        self.crawling_thread.output_signal.connect(self.update_text)
        self.ui.pushButton_2.clicked.connect(self.stop)

    def slot_method(self):
        """Handle the run button: reset the view and start the crawl thread."""
        self.ui.pushButton.setEnabled(False)
        self.ui.pushButton_2.setEnabled(True)
        self.ui.textBrowser.setText('')
        self.ui.textBrowser.append('started scraping...')
        self.crawling_thread.start()

    def finished(self):
        """Handle the thread's completion signal and restore the buttons."""
        self.ui.textBrowser.append('finished scraping')  # Show the output to the user
        self.ui.pushButton.setEnabled(True)  # Enable the pushButton
        self.ui.pushButton_2.setEnabled(False)

    def update_text(self, signal):
        """Append one chunk of crawler output to the text browser.

        Args:
            signal: raw ``bytes`` emitted by the crawl thread.
        """
        # Process output is not guaranteed to be valid UTF-8; substitute bad
        # bytes instead of raising inside a Qt slot.
        self.ui.textBrowser.append(signal.decode("utf-8", errors="replace"))

    def stop(self):
        """Handle the stop button: terminate the crawler process if one is running."""
        # Local import under an alias: ``signal`` is also a parameter name in
        # update_text, and the file's import block is left untouched.
        import signal as signal_module

        # The attribute may be absent or None when no child process is running.
        pid = getattr(self.crawling_thread, 'proc_id', None)
        print(pid)
        if pid is not None:
            try:
                # os.kill() requires the signal number as a second argument.
                os.kill(pid, signal_module.SIGTERM)
            except ProcessLookupError:
                # The process already exited on its own; nothing left to stop.
                pass
        self.ui.textBrowser.append('Scraping stopped...')
        self.ui.pushButton.setEnabled(True)  # Enable the pushButton
        self.ui.pushButton_2.setEnabled(False)


def main():
    """Create the Qt application, show the main window and exit with Qt's return code."""
    qt_app = QtWidgets.QApplication([])
    window = mywindow()
    window.show()
    exit_code = qt_app.exec()
    sys.exit(exit_code)


# Launch the GUI only when this file is run as a script, not when it is imported.
if __name__ == '__main__':
    main()
Run Code Online (Sandbox Code Playgroud)

使用此代码,我只能以文本形式获取 stdout,并且只有在抓取完成后才能将其放入 textBrowser。如果抓取需要 20-30 分钟,这期间我在 textBrowser 中看不到任何变化。有没有办法获取抓取到的项目并实时显示它们?也许还有办法用第二个按钮停止/暂停抓取过程?

eyl*_*esc 5

您应该使用 QProcess,而不是 subprocess.Popen() + QThread,因为通过信号获得任务的通知会更容易。

\n\n

我创建了一个应用程序,它扫描项目中的所有蜘蛛,在 QComboBox 中显示它们,您可以在其中选择要运行的蜘蛛,然后有一个按钮,允许您通过在 QTextBrowser 中显示日志来启动或停止应用程序。

\n\n

假设scrapy项目具有以下结构(该项目是scrapy的一个示例,你可以在这里找到它):

\n\n
tutorial
├── scrapy.cfg
└── tutorial
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        ├── toscrape-css.py
        └── toscrape-xpath.py
Run Code Online (Sandbox Code Playgroud)\n\n

用户必须选择 .cfg 文件,这将显示可用的蜘蛛,然后根据需要按启动/停止按钮。

\n\n
from functools import partial
from PyQt5 import QtCore, QtGui, QtWidgets


class ScrapyWorker(QtCore.QObject):
    """Run the ``scrapy`` command line tool in a QProcess and relay its output.

    Signals:
        logChanged(str): text read from the process (stdout and stderr merged).
        started: forwarded from the underlying process when it starts.
        finished: forwarded from the underlying process when it exits.
    """

    logChanged = QtCore.pyqtSignal(str)
    started = QtCore.pyqtSignal()
    finished = QtCore.pyqtSignal()

    def __init__(self, parent=None):
        super(ScrapyWorker, self).__init__(parent)
        self._process = QtCore.QProcess(self)
        # Merge stderr into stdout so one readyRead signal covers all output.
        self._process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        self._process.readyReadStandardOutput.connect(self.on_readyReadStandardOutput)
        self._process.setProgram('scrapy')
        self._process.started.connect(self.started)
        self._process.finished.connect(self.finished)

    def run(self, project, spider):
        """Start ``scrapy crawl <spider>`` with ``project`` as working directory."""
        self._process.setWorkingDirectory(project)
        self._process.setArguments(['crawl', spider])
        self._process.start()

    @QtCore.pyqtSlot()
    def on_readyReadStandardOutput(self):
        """Read whatever output is currently available and emit it as text."""
        # A read can end in the middle of a multi-byte character; replace
        # undecodable bytes instead of raising inside the slot.
        data = self._process.readAllStandardOutput().data().decode(errors="replace")
        self.logChanged.emit(data)

    @QtCore.pyqtSlot()
    def stop(self):
        """Kill the crawl process."""
        self._process.kill()

    def spiders(self, project):
        """Return the whitespace-separated output of ``scrapy list`` run in ``project``.

        A local event loop is executed until the helper process finishes, so
        this call does not return before then.
        """
        process = QtCore.QProcess()
        process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        process.setWorkingDirectory(project)
        loop = QtCore.QEventLoop()
        process.finished.connect(loop.quit)
        process.start('scrapy', ['list'])
        loop.exec_()
        return process.readAllStandardOutput().data().decode(errors="replace").split()


class MainWindow(QtWidgets.QMainWindow):
    """Window for picking a Scrapy project and spider, running it and viewing its log."""

    def __init__(self, parent=None):
        super(MainWindow, self).__init__(parent)

        self.project_le = QtWidgets.QLineEdit()
        self.project_button = QtWidgets.QPushButton('Select Project')
        self.spider_combobox = QtWidgets.QComboBox()
        self.start_stop_button = QtWidgets.QPushButton("Start", checkable=True)
        self.text_edit = QtWidgets.QTextBrowser()
        central_widget = QtWidgets.QWidget()
        self.setCentralWidget(central_widget)

        # Rows: project path + button, spider selector, start/stop, log view.
        lay = QtWidgets.QVBoxLayout(central_widget)
        hlay = QtWidgets.QHBoxLayout()
        hlay.addWidget(self.project_le)
        hlay.addWidget(self.project_button)
        lay.addLayout(hlay)
        hlay2 = QtWidgets.QHBoxLayout()
        hlay2.addWidget(QtWidgets.QLabel("spiders:"))
        hlay2.addWidget(self.spider_combobox, 1)
        lay.addLayout(hlay2)
        lay.addWidget(self.start_stop_button)
        lay.addWidget(self.text_edit)

        # Nothing can be started until a project with spiders is selected.
        self.start_stop_button.setEnabled(False)

        self.scrapy_worker = ScrapyWorker(self)
        self.scrapy_worker.logChanged.connect(self.insert_log)
        self.scrapy_worker.started.connect(self.text_edit.clear)
        # Un-check the toggle button when the process ends.
        self.scrapy_worker.finished.connect(partial(self.start_stop_button.setChecked, False))

        self.start_stop_button.toggled.connect(self.on_checked)
        self.project_button.clicked.connect(self.select_project)
        self.resize(640, 480)

    @QtCore.pyqtSlot(bool)
    def on_checked(self, state):
        """Start the selected spider when checked; stop the worker when unchecked."""
        if state:
            filename = self.project_le.text()
            finfo = QtCore.QFileInfo(filename)
            directory = finfo.dir().absolutePath()
            self.scrapy_worker.run(directory, self.spider_combobox.currentText())
            self.start_stop_button.setText('Stop')
        else:
            self.start_stop_button.setText('Start')
            self.scrapy_worker.stop()

    @QtCore.pyqtSlot()
    def select_project(self):
        """Ask for a .cfg file and fill the combo box with the project's spiders."""
        filename, _ = QtWidgets.QFileDialog.getOpenFileName(
            self,
            "Select .cfg file",
            QtCore.QDir.currentPath(),
            "Configure File (*.cfg)"
        )
        if filename:
            self.project_le.setText(filename)
            finfo = QtCore.QFileInfo(filename)
            directory = finfo.dir().absolutePath()
            spiders = self.scrapy_worker.spiders(directory)
            self.spider_combobox.clear()
            self.spider_combobox.addItems(spiders)
            self.start_stop_button.setEnabled(bool(spiders))

    @QtCore.pyqtSlot(str)
    def insert_log(self, text):
        """Append ``text`` at the end of the log, then restore the previous cursor."""
        prev_cursor = self.text_edit.textCursor()
        self.text_edit.moveCursor(QtGui.QTextCursor.End)
        self.text_edit.insertPlainText(text)
        self.text_edit.setTextCursor(prev_cursor)


if __name__ == '__main__':
    import sys
    app = QtWidgets.QApplication(sys.argv)
    app.setStyle('fusion')
    w = MainWindow()
    w.show()
    sys.exit(app.exec_())
Run Code Online (Sandbox Code Playgroud)\n\n

输出:

\n\n

在此输入图像描述

\n