Art*_*iom 3 python pyqt scrapy pyqt5
我正在尝试为 Scrapy 爬虫制作一个简单的 GUI,用户可以按“开始”按钮来运行抓取并在 textBrowser (或其他 qt 小部件,请告知)中查看抓取的结果。
import scrapy, json
class CarSpider(scrapy.Spider):
    """Spider that requests a JSON endpoint and yields one dict per car.

    Each yielded item has the keys ``'Make'``, ``'Model'`` and ``'Year'``.
    Parts of the original post are elided; they are kept as placeholders
    below and must be filled in before the spider can run.
    """

    name = 'car'
    start_urls = ["https://www.target-website.com/"]

    def parse(self, response):
        """Handle the start page and schedule the request for the item data."""
        # NOTE: the code that builds ``url`` is elided in the original post.
        yield scrapy.Request(url=url, callback=self.parse_page)

    def parse_page(self, response):
        """Decode the JSON body and yield one item dict per entry in 'items'."""
        # ``response.text`` is the decoded body; it replaces the deprecated
        # ``response.body_as_unicode()`` call.
        items = json.loads(response.text)['items']
        for i in items:
            # NOTE: the code that extracts ``make``, ``model`` and ``year``
            # from ``i`` is elided in the original post.
            ...
            scraped_item = {
                'Make': make,
                'Model': model,
                'Year': year,
            }
            yield scraped_item
Run Code Online (Sandbox Code Playgroud)
应用程序设计是在 Qt Designer 中完成的:
from PyQt5 import QtCore, QtGui, QtWidgets
class Ui_MainWindow(object):
    """Form class that builds the scraper window.

    The window holds two push buttons ("Run Scraper" and an initially
    disabled "Stop"), a text browser and a status bar.
    """

    def setupUi(self, MainWindow):
        """Create all child widgets and install them on ``MainWindow``."""
        # --- main window -------------------------------------------------
        MainWindow.setObjectName("MainWindow")
        MainWindow.resize(801, 612)
        window_policy = QtWidgets.QSizePolicy(
            QtWidgets.QSizePolicy.Fixed,
            QtWidgets.QSizePolicy.Fixed,
        )
        window_policy.setHorizontalStretch(0)
        window_policy.setVerticalStretch(0)
        window_policy.setHeightForWidth(
            MainWindow.sizePolicy().hasHeightForWidth()
        )
        MainWindow.setSizePolicy(window_policy)

        # --- central widget ----------------------------------------------
        self.centralwidget = QtWidgets.QWidget(MainWindow)
        central_policy = QtWidgets.QSizePolicy(
            QtWidgets.QSizePolicy.Fixed,
            QtWidgets.QSizePolicy.Fixed,
        )
        central_policy.setHorizontalStretch(0)
        central_policy.setVerticalStretch(0)
        central_policy.setHeightForWidth(
            self.centralwidget.sizePolicy().hasHeightForWidth()
        )
        self.centralwidget.setSizePolicy(central_policy)
        self.centralwidget.setObjectName("centralwidget")

        # --- first button (labelled in retranslateUi) ----------------------
        self.pushButton = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton.setGeometry(QtCore.QRect(10, 10, 211, 41))
        self.pushButton.setObjectName("pushButton")

        # --- second button, created disabled -------------------------------
        self.pushButton_2 = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton_2.setEnabled(False)
        self.pushButton_2.setGeometry(QtCore.QRect(10, 60, 211, 41))
        self.pushButton_2.setObjectName("pushButton_2")

        # --- text browser --------------------------------------------------
        self.textBrowser = QtWidgets.QTextBrowser(self.centralwidget)
        self.textBrowser.setGeometry(QtCore.QRect(240, 10, 551, 571))
        self.textBrowser.setObjectName("textBrowser")

        MainWindow.setCentralWidget(self.centralwidget)

        # --- status bar ----------------------------------------------------
        self.statusbar = QtWidgets.QStatusBar(MainWindow)
        self.statusbar.setObjectName("statusbar")
        MainWindow.setStatusBar(self.statusbar)

        self.retranslateUi(MainWindow)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)

    def retranslateUi(self, MainWindow):
        """Set the window title and the button captions."""
        tr = QtCore.QCoreApplication.translate
        MainWindow.setWindowTitle(tr("MainWindow", "MainWindow"))
        self.pushButton.setText(tr("MainWindow", "Run Scraper"))
        self.pushButton_2.setText(tr("MainWindow", "Stop"))
Run Code Online (Sandbox Code Playgroud)
这是我尝试编写的处理数据的代码:
from PyQt5 import QtWidgets
from PyQt5.QtCore import pyqtSignal, QThread
from my_gui import Ui_MainWindow
import sys, os
import subprocess
class SpiderThread(QThread):
    """Thread that runs ``scrapy crawl car`` and streams its output.

    Signals:
        signal: emitted once, after the crawl process has exited.
        output_signal: emitted with one line of process output (``bytes``,
            without the trailing line break) as soon as it is produced.

    Attributes:
        proc_id: PID of the running crawl process, or ``None`` before the
            first run has started.
    """

    signal = pyqtSignal()
    output_signal = pyqtSignal('PyQt_PyObject')

    def __init__(self):
        QThread.__init__(self)
        # Defined up front so readers never hit AttributeError before run().
        self.proc_id = None

    def __del__(self):
        # Block until the thread has finished before the object goes away.
        self.wait()

    def run(self):
        """Start the crawl and forward every output line while it runs."""
        # Remove the result of a previous run, if there is one.
        try:
            os.remove('result.csv')
        except FileNotFoundError:
            pass

        cmd = "scrapy crawl car"
        # stderr is merged into stdout so that all process output arrives on
        # the one pipe read below. stdin is not needed, so it is pointed at
        # the null device rather than left as an unused pipe.
        with subprocess.Popen(
            cmd.split(),
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            stdin=subprocess.DEVNULL,
        ) as proc:
            self.proc_id = proc.pid
            print(self.proc_id)
            # Iterating the pipe yields each line as soon as it is written,
            # instead of waiting for the process to exit as communicate()
            # does.
            for line in proc.stdout:
                self.output_signal.emit(line.rstrip(b'\r\n'))
        # Leaving the ``with`` block closes the pipe and waits for the exit.
        self.signal.emit()
class mywindow(QtWidgets.QMainWindow):
    """Main window: starts/stops the crawl thread and shows its output."""

    def __init__(self):
        super(mywindow, self).__init__()
        self.ui = Ui_MainWindow()
        self.ui.setupUi(self)
        self.ui.pushButton.clicked.connect(self.slot_method)
        self.crawling_thread = SpiderThread()
        self.crawling_thread.signal.connect(self.finished)
        self.crawling_thread.output_signal.connect(self.update_text)
        self.ui.pushButton_2.clicked.connect(self.stop)

    def slot_method(self):
        """Handle a click on the run button: reset the view, start the thread."""
        self.ui.pushButton.setEnabled(False)
        self.ui.pushButton_2.setEnabled(True)
        self.ui.textBrowser.setText('')
        self.ui.textBrowser.append('started scraping...')
        self.crawling_thread.start()

    def finished(self):
        """Handle the thread's completion signal and restore the buttons."""
        self.ui.textBrowser.append('finished scraping')  # Show the output to the user
        self.ui.pushButton.setEnabled(True)  # Enable the pushButton
        self.ui.pushButton_2.setEnabled(False)

    def update_text(self, signal):
        """Append one chunk of process output to the text browser.

        Args:
            signal: the ``bytes`` payload carried by ``output_signal``.
        """
        # errors="replace" keeps a stray undecodable byte from raising here.
        self.ui.textBrowser.append(signal.decode("utf-8", errors="replace"))

    def stop(self):
        """Handle a click on the stop button: terminate the crawl process."""
        # Imported here because this block cannot edit the file's import
        # section; aliased to avoid confusion with update_text's parameter.
        import signal as signal_module

        # The attribute may be missing or None if the thread has not yet
        # launched the process.
        pid = getattr(self.crawling_thread, 'proc_id', None)
        print(pid)
        if pid is not None:
            try:
                # os.kill() requires the signal to send as second argument.
                os.kill(pid, signal_module.SIGTERM)
            except ProcessLookupError:
                # The process has already exited; nothing left to stop.
                pass
        self.ui.textBrowser.append('Scraping stopped...')
        self.ui.pushButton.setEnabled(True)  # Enable the pushButton
        self.ui.pushButton_2.setEnabled(False)
def main():
    """Create the Qt application, show the main window, run the event loop.

    The process exits with the status returned by the event loop.
    """
    qt_app = QtWidgets.QApplication([])
    window = mywindow()
    window.show()
    exit_status = qt_app.exec()
    sys.exit(exit_status)


if __name__ == '__main__':
    main()
Run Code Online (Sandbox Code Playgroud)
使用此代码,我只能以文本形式获取 stdout,并且只能在抓取完成后才将其放入 textBrowser。如果抓取需要 20-30 分钟,这期间我在 textBrowser 中看不到任何变化。有没有办法获得抓取的条目并实时显示它们?也许还有一个解决方案可以用第二个按钮停止/暂停抓取过程?
您应该使用 QProcess,而不是使用 subprocess.Popen() + QThread,因为通过信号来获得任务通知会更容易。
我创建了一个应用程序,它扫描项目中的所有蜘蛛,在 QComboBox 中显示它们,您可以在其中选择要运行的蜘蛛,然后有一个按钮,允许您通过在 QTextBrowser 中显示日志来启动或停止应用程序。
假设 scrapy 项目具有以下结构(该项目是 scrapy 的一个示例,你可以在这里找到它):
tutorial
├── scrapy.cfg
└── tutorial
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        ├── toscrape-css.py
        └── toscrape-xpath.py
Run Code Online (Sandbox Code Playgroud)

用户必须选择 .cfg 文件,这将显示可用的蜘蛛,然后根据需要按启动/停止按钮。
import codecs
from functools import partial

from PyQt5 import QtCore, QtGui, QtWidgets


class ScrapyWorker(QtCore.QObject):
    """Runs ``scrapy`` through a QProcess and reports progress via signals.

    Signals:
        logChanged(str): a chunk of text read from the process output.
        started: the crawl process has started.
        finished: the crawl process has ended, or could not be started.
    """

    logChanged = QtCore.pyqtSignal(str)
    started = QtCore.pyqtSignal()
    finished = QtCore.pyqtSignal()

    def __init__(self, parent=None):
        super(ScrapyWorker, self).__init__(parent)
        # Output arrives in arbitrary chunks, so a multi-byte UTF-8 character
        # can be split across two reads. An incremental decoder buffers the
        # partial bytes instead of raising UnicodeDecodeError.
        self._decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')
        self._process = QtCore.QProcess(self)
        self._process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        self._process.readyReadStandardOutput.connect(self.on_readyReadStandardOutput)
        self._process.setProgram('scrapy')
        self._process.started.connect(self.started)
        self._process.finished.connect(self.finished)
        self._process.errorOccurred.connect(self.on_errorOccurred)

    def run(self, project, spider):
        """Start ``scrapy crawl <spider>`` with ``project`` as working directory."""
        self._decoder.reset()
        self._process.setWorkingDirectory(project)
        self._process.setArguments(['crawl', spider])
        self._process.start()

    @QtCore.pyqtSlot()
    def on_readyReadStandardOutput(self):
        """Forward newly available process output through ``logChanged``."""
        raw = self._process.readAllStandardOutput().data()
        text = self._decoder.decode(raw)
        if text:
            self.logChanged.emit(text)

    def on_errorOccurred(self, error):
        """Report a process that could not be started.

        QProcess does not emit ``finished`` when the program fails to start,
        so ``finished`` is emitted here to let listeners reset their state.
        """
        if error == QtCore.QProcess.FailedToStart:
            self.logChanged.emit(self._process.errorString() + '\n')
            self.finished.emit()

    @QtCore.pyqtSlot()
    def stop(self):
        """Kill the crawl process."""
        self._process.kill()

    def spiders(self, project):
        """Return the spider names printed by ``scrapy list`` in ``project``.

        Returns an empty list when the command cannot be started or does not
        exit successfully.
        """
        process = QtCore.QProcess()
        process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        process.setWorkingDirectory(project)
        loop = QtCore.QEventLoop()
        process.finished.connect(loop.quit)
        # Without this the loop would never end if scrapy fails to start,
        # because ``finished`` is not emitted in that case.
        process.errorOccurred.connect(loop.quit)
        process.start('scrapy', ['list'])
        # A start failure may already have been reported inside start(); in
        # that case nothing would ever quit the loop, so do not enter it.
        if process.state() != QtCore.QProcess.NotRunning:
            loop.exec_()
        if (process.exitStatus() != QtCore.QProcess.NormalExit
                or process.exitCode() != 0):
            # The output is an error message, not a list of spiders.
            return []
        output = process.readAllStandardOutput().data()
        return output.decode(errors='replace').split()


class MainWindow(QtWidgets.QMainWindow):
    """Window for picking a Scrapy project and spider and viewing the crawl log."""

    def __init__(self, parent=None):
        super(MainWindow, self).__init__(parent)

        self.project_le = QtWidgets.QLineEdit()
        self.project_button = QtWidgets.QPushButton('Select Project')
        self.spider_combobox = QtWidgets.QComboBox()
        self.start_stop_button = QtWidgets.QPushButton("Start", checkable=True)
        self.text_edit = QtWidgets.QTextBrowser()
        central_widget = QtWidgets.QWidget()
        self.setCentralWidget(central_widget)

        lay = QtWidgets.QVBoxLayout(central_widget)
        hlay = QtWidgets.QHBoxLayout()
        hlay.addWidget(self.project_le)
        hlay.addWidget(self.project_button)
        lay.addLayout(hlay)
        hlay2 = QtWidgets.QHBoxLayout()
        hlay2.addWidget(QtWidgets.QLabel("spiders:"))
        hlay2.addWidget(self.spider_combobox, 1)
        lay.addLayout(hlay2)
        lay.addWidget(self.start_stop_button)
        lay.addWidget(self.text_edit)

        # Nothing can be started until a project with spiders is selected.
        self.start_stop_button.setEnabled(False)

        self.scrapy_worker = ScrapyWorker(self)
        self.scrapy_worker.logChanged.connect(self.insert_log)
        self.scrapy_worker.started.connect(self.text_edit.clear)
        self.scrapy_worker.finished.connect(partial(self.start_stop_button.setChecked, False))

        self.start_stop_button.toggled.connect(self.on_checked)
        self.project_button.clicked.connect(self.select_project)
        self.resize(640, 480)

    @QtCore.pyqtSlot(bool)
    def on_checked(self, state):
        """Start the selected spider when checked, stop it when unchecked."""
        if state:
            filename = self.project_le.text()
            finfo = QtCore.QFileInfo(filename)
            directory = finfo.dir().absolutePath()
            # Label first: if the worker reports completion while run() is
            # still executing, the resulting 'Start' label must not be
            # overwritten afterwards.
            self.start_stop_button.setText('Stop')
            self.scrapy_worker.run(directory, self.spider_combobox.currentText())
        else:
            self.start_stop_button.setText('Start')
            self.scrapy_worker.stop()

    @QtCore.pyqtSlot()
    def select_project(self):
        """Let the user pick a .cfg file and list that project's spiders."""
        filename, _ = QtWidgets.QFileDialog.getOpenFileName(
            self,
            "Select .cfg file",
            QtCore.QDir.currentPath(),
            "Configure File (*.cfg)"
        )
        if filename:
            self.project_le.setText(filename)
            finfo = QtCore.QFileInfo(filename)
            directory = finfo.dir().absolutePath()
            spiders = self.scrapy_worker.spiders(directory)
            self.spider_combobox.clear()
            self.spider_combobox.addItems(spiders)
            self.start_stop_button.setEnabled(bool(spiders))

    @QtCore.pyqtSlot(str)
    def insert_log(self, text):
        """Append ``text`` at the end of the log, keeping the user's cursor."""
        prev_cursor = self.text_edit.textCursor()
        self.text_edit.moveCursor(QtGui.QTextCursor.End)
        self.text_edit.insertPlainText(text)
        self.text_edit.setTextCursor(prev_cursor)


if __name__ == '__main__':
    import sys
    app = QtWidgets.QApplication(sys.argv)
    app.setStyle('fusion')
    w = MainWindow()
    w.show()
    sys.exit(app.exec_())
\n\n\n