inv*_*101 4 python scrapy web-scraping
所以我正在做一个项目,使用多个蜘蛛抓取不同的网站。我想让蜘蛛在用户要求继续时说“是”时再次运行。
keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True
while flag:
    # fixed: `CrawlProcess` does not exist in Scrapy — the traceback shows
    # `CrawlerProcess` was actually used.
    process = CrawlerProcess()
    process.crawl(crawler1, keyword, page_range)
    process.crawl(crawler2, keyword, page_range)
    process.crawl(crawler3, keyword, page_range)
    # Blocks until all spiders finish. Twisted's reactor cannot be started
    # twice in one process, so the second loop iteration raises
    # ReactorNotRestartable — this is the bug the question is about.
    process.start()

    isContinue = input("Do you want to continue? (y/n): ")
    if isContinue == 'n':
        flag = False
Run Code Online (Sandbox Code Playgroud)
但我收到一条错误消息,说反应堆无法重新启动。
Traceback (most recent call last):
File "/Users/user/Desktop/programs/eshopSpider/eshopSpider.py", line 47, in <module>
process.start()
File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/scrapy/crawler.py", line 327, in start
reactor.run(installSignalHandlers=False) # blocking call
File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 1317, in run
self.startRunning(installSignalHandlers=installSignalHandlers)
File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 1299, in startRunning
ReactorBase.startRunning(cast(ReactorBase, self))
File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 843, in startRunning
raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable
Run Code Online (Sandbox Code Playgroud)
所以我想使用 while 循环是行不通的。我甚至不知道从哪里开始……
方法一:
Scrapy 创建的 Reactor 在 stop 之后不能重用;但如果您在单独的进程中运行 Crawler,则新进程必然会创建新的 Reactor。
import multiprocessing


def run_crawler(keyword, page_range):
    """Run all three crawlers once.

    Executed in a child process (see main loop below) so every run creates
    a brand-new Twisted reactor, avoiding ReactorNotRestartable.
    """
    process = CrawlerProcess()  # fixed: `CrawlProcess` does not exist
    process.crawl(crawler1, keyword, page_range)
    process.crawl(crawler2, keyword, page_range)
    process.crawl(crawler3, keyword, page_range)
    process.start()

# --- main ---

keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True
while flag:
    # fixed: `multiprocessing` is a module, not a callable —
    # a worker is created with multiprocessing.Process(...).
    p = multiprocessing.Process(target=run_crawler, args=(keyword, page_range))
    p.start()
    p.join()  # wait for the child (and its reactor) to finish

    isContinue = input("Do you want to continue? (y/n): ")
    if isContinue == 'n':
        flag = False
Run Code Online (Sandbox Code Playgroud)
如果您使用 threading 而不是 multiprocessing,这个办法将不起作用,因为线程共享变量,新线程会使用与前一个线程相同的 Reactor。
最少的工作代码(在 Linux 上测试)。
import scrapy
class MySpider(scrapy.Spider):
    """Scrape book titles and image URLs from one books.toscrape.com page."""

    name = 'myspider'
    #start_urls = ['https://books.toscrape.com/']

    def __init__(self, keyword, page, *args, **kwargs):
        '''generate start_urls list'''
        super().__init__(*args, **kwargs)
        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)
        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            image_url = response.urljoin(book.css('img::attr(src)').get())
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': image_url}
# --- run without project and save in `output.csv` ---
import multiprocessing
from scrapy.crawler import CrawlerProcess
def run_crawler(keyword, page_range):
    """Crawl three consecutive pages starting at ``page_range``, saving to output.csv.

    Meant to run in a child process so each call gets a fresh Twisted reactor.
    """
    #from scrapy.crawler import CrawlerProcess
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })
    # fixed: the body referenced an undefined name `page` — the parameter
    # is called `page_range` (the starting page number).
    c.crawl(MySpider, keyword, page_range)
    c.crawl(MySpider, keyword, int(page_range) + 1)
    c.crawl(MySpider, keyword, int(page_range) + 2)
    c.start()
# --- main ---
if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page = input("enter page: ")

    running = True
    while running:
        # A fresh process per run means a fresh Twisted reactor per run.
        worker = multiprocessing.Process(target=run_crawler, args=(keyword, page))
        worker.start()
        worker.join()

        answer = input('Repeat [Y/n]? ').strip().lower()
        running = answer != 'n'
Run Code Online (Sandbox Code Playgroud)
方法二:
在 Google 中找到:重新启动 Twisted Reactor。
这是一个旧帖子的做法:用 del 从内存中删除 twisted 的 reactor 模块,然后再次 import 并安装新的 reactor。
keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True
while flag:
    process = CrawlerProcess()  # fixed: `CrawlProcess` does not exist
    process.crawl(crawler1, keyword, page_range)
    process.crawl(crawler2, keyword, page_range)
    process.crawl(crawler3, keyword, page_range)
    process.start()

    isContinue = input("Do you want to continue? (y/n): ")
    if isContinue == 'n':
        flag = False

    # The old trick: drop the used-up reactor module from memory and
    # install a fresh one, so the next loop iteration can start it again.
    # NOTE(review): this must run inside the loop, before the next start().
    import sys
    del sys.modules['twisted.internet.reactor']
    from twisted.internet import reactor
    from twisted.internet import default
    default.install()
Run Code Online (Sandbox Code Playgroud)
最少的工作代码(在 Linux 上测试)
import scrapy
class MySpider(scrapy.Spider):
    """Spider for one catalogue page of books.toscrape.com (titles + images)."""

    name = 'myspider'
    #start_urls = ['https://books.toscrape.com/']

    def __init__(self, keyword, page, *args, **kwargs):
        '''generate start_urls list'''
        super().__init__(*args, **kwargs)
        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)
        for product in response.css('article.product_pod'):
            yield {
                'page': self.page,
                'keyword': self.keyword,
                'title': product.css('h3 a::text').get(),
                'image': response.urljoin(product.css('img::attr(src)').get()),
            }
# --- run without project and save in `output.csv` ---
from scrapy.crawler import CrawlerProcess
def run_crawler(keyword, page):
    """Crawl pages ``page`` .. ``page``+2 in one process, appending to output.csv."""
    proc = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })
    proc.crawl(MySpider, keyword, page)
    proc.crawl(MySpider, keyword, int(page) + 1)
    proc.crawl(MySpider, keyword, int(page) + 2)
    proc.start()
# --- main ---
if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page = input("enter page: ")

    running = True
    while running:
        run_crawler(keyword, page)

        answer = input('Repeat [Y/n]? ').strip().lower()
        if answer == 'n':
            running = False

        # Forget the used-up reactor and install a fresh one, so the next
        # iteration's c.start() can call reactor.run() again.
        import sys
        del sys.modules['twisted.internet.reactor']
        from twisted.internet import reactor
        from twisted.internet import default
        default.install()
Run Code Online (Sandbox Code Playgroud)
方法三:
看来你可以使用 CrawlerRunner 代替 CrawlerProcess —— 但我还没有测试它。
基于文档中在同一进程中运行多个蜘蛛的最后一个示例,我创建了while在反应器内运行 -loop 的代码(因此不必停止它),但它首先启动一个蜘蛛,接下来运行第二个蜘蛛,接下来它要求继续,它再次运行第一个蜘蛛,接下来运行第二个蜘蛛。它不会同时运行两个蜘蛛,但也许可以以某种方式进行更改。
import scrapy
class MySpider(scrapy.Spider):
    """Collect title/image pairs from a single books.toscrape.com page."""

    name = 'myspider'
    #start_urls = ['https://books.toscrape.com/']

    def __init__(self, keyword, page, *args, **kwargs):
        '''generate start_urls list'''
        super().__init__(*args, **kwargs)
        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)
        books = response.css('article.product_pod')
        for book in books:
            title = book.css('h3 a::text').get()
            src = book.css('img::attr(src)').get()
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': response.urljoin(src)}
# --- run without project and save in `output.csv` ---
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
@defer.inlineCallbacks
def run_crawler():
    """Re-run the three spiders (one after another) until the user answers 'n'.

    Lives inside the reactor, so the reactor itself is never restarted.
    Reads the module-level ``runner``, ``keyword`` and ``page``.
    """
    while True:
        # Each `yield` waits for that crawl to finish before the next starts.
        yield runner.crawl(MySpider, keyword, page)
        yield runner.crawl(MySpider, keyword, int(page) + 1)
        yield runner.crawl(MySpider, keyword, int(page) + 2)

        answer = input('Repeat [Y/n]? ').strip().lower()
        if answer == 'n':
            reactor.stop()
            #return
            break
# --- main ---
if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page = input("enter page: ")

    configure_logging()
    runner = CrawlerRunner({
        'USER_AGENT': 'Mozilla/5.0',
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })

    # Schedule the crawl loop, then hand control to the (single) reactor.
    run_crawler()
    reactor.run()
Run Code Online (Sandbox Code Playgroud)
编辑:
相同但现在所有爬虫同时运行
@defer.inlineCallbacks
def run_crawler():
    """Re-run all three spiders concurrently until the user answers 'n'."""
    while True:
        # Start all three crawls without yielding, so they run in parallel.
        runner.crawl(MySpider, keyword, page)
        runner.crawl(MySpider, keyword, int(page) + 1)
        runner.crawl(MySpider, keyword, int(page) + 2)
        # join() fires once every scheduled crawl has finished.
        yield runner.join()

        answer = input('Repeat [Y/n]? ').strip().lower()
        if answer == 'n':
            reactor.stop()
            #return
            break
Run Code Online (Sandbox Code Playgroud)
| 归档时间: |
|
| 查看次数: |
1462 次 |
| 最近记录: |