How to run Scrapy in a while loop

inv*_*101 4 python scrapy web-scraping

So I'm working on a project that uses multiple spiders to scrape different websites. I want the spiders to run again if the user answers "yes" when asked whether to continue.

keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True

while flag:

   process = CrawlerProcess()
   process.crawl(crawler1, keyword, page_range)
   process.crawl(crawler2, keyword, page_range)
   process.crawl(crawler3, keyword, page_range)
   process.start()

   isContinue = input("Do you want to continue? (y/n): ")

   if isContinue == 'n':
      flag = False

But I get an error message saying the reactor can't be restarted.

Traceback (most recent call last):
  File "/Users/user/Desktop/programs/eshopSpider/eshopSpider.py", line 47, in <module>
    process.start()
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/scrapy/crawler.py", line 327, in start
    reactor.run(installSignalHandlers=False)  # blocking call
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 1317, in run
    self.startRunning(installSignalHandlers=installSignalHandlers)
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 1299, in startRunning
    ReactorBase.startRunning(cast(ReactorBase, self))
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 843, in startRunning
    raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable

So I guess using a while loop won't work. I don't even know where to start...

fur*_*ras 5

Method 1:

Scrapy can't reuse the Reactor after it has been stopped, but if you run the crawlers in a separate process, then the new process creates its own new Reactor.

import multiprocessing
from scrapy.crawler import CrawlerProcess

def run_crawler(keyword, page_range):
   process = CrawlerProcess()
   process.crawl(crawler1, keyword, page_range)
   process.crawl(crawler2, keyword, page_range)
   process.crawl(crawler3, keyword, page_range)
   process.start()

# --- main ---

keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True

while flag:

   p = multiprocessing.Process(target=run_crawler, args=(keyword, page_range))
   p.start()
   p.join()

   isContinue = input("Do you want to continue? (y/n): ")

   if isContinue == 'n':
      flag = False

It will not work if you use threading instead of multiprocessing, because threads share variables, so a new thread would use the same Reactor as the previous thread.
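
A quick sketch to illustrate the sharing (my own demonstration, not part of the solution): both threads below resolve the import to the very same Reactor object.

import threading

reactor_ids = []

def record_reactor_id():
    # every thread gets the same module-level reactor object
    from twisted.internet import reactor
    reactor_ids.append(id(reactor))

for _ in range(2):
    t = threading.Thread(target=record_reactor_id)
    t.start()
    t.join()

print(reactor_ids[0] == reactor_ids[1])  # True - one shared Reactor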


Minimal working code (tested on Linux):

import scrapy

class MySpider(scrapy.Spider):

    name = 'myspider'

    #start_urls = ['https://books.toscrape.com/']

    def __init__(self, keyword, page, *args, **kwargs):
        '''generate start_urls list'''
        super().__init__(*args, **kwargs)
        
        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)

        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            url = book.css('img::attr(src)').get()
            url = response.urljoin(url)
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}

# --- run without project and save in `output.csv` ---

import multiprocessing
from scrapy.crawler import CrawlerProcess

def run_crawler(keyword, page):
    #from scrapy.crawler import CrawlerProcess

    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })
    c.crawl(MySpider, keyword, page)
    c.crawl(MySpider, keyword, int(page)+1)
    c.crawl(MySpider, keyword, int(page)+2)
    c.start()
    
# --- main ---

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page    = input("enter page: ")
        
    running = True
    while running:

        p = multiprocessing.Process(target=run_crawler, args=(keyword, page))
        p.start()
        p.join()
        
        answer = input('Repeat [Y/n]? ').strip().lower()
        
        if answer == 'n':
            running = False

Method 2:

Found via Google: Restarting a Twisted Reactor.

It is an old post that uses del to remove the twisted reactor module from memory and then imports it again.
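
To show the trick in isolation, here is a minimal standalone sketch (my own illustration, plain Twisted without Scrapy) that runs the reactor twice by re-installing it between runs:

import sys

from twisted.internet import reactor

reactor.callLater(0, reactor.stop)
reactor.run()   # first run ends at once; this Reactor can't be started again

# forget the stopped reactor and install a brand-new one
del sys.modules['twisted.internet.reactor']
from twisted.internet import default
default.install()

from twisted.internet import reactor  # re-import now gives the new reactor
reactor.callLater(0, reactor.stop)
reactor.run()   # works - this is a different Reactor instance

Applied to the original loop: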

keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True

while flag:

   process = CrawlerProcess()
   process.crawl(crawler1, keyword, page_range)
   process.crawl(crawler2, keyword, page_range)
   process.crawl(crawler3, keyword, page_range)
   process.start()

   isContinue = input("Do you want to continue? (y/n): ")

   if isContinue == 'n':
      flag = False
           
   import sys
   del sys.modules['twisted.internet.reactor']  # forget the stopped reactor
   from twisted.internet import reactor
   from twisted.internet import default
   default.install()                            # install a fresh reactor for the next run

Minimal working code (tested on Linux):

import scrapy

class MySpider(scrapy.Spider):

    name = 'myspider'

    #start_urls = ['https://books.toscrape.com/']

    def __init__(self, keyword, page, *args, **kwargs):
        '''generate start_urls list'''
        super().__init__(*args, **kwargs)
        
        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)

        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            url = book.css('img::attr(src)').get()
            url = response.urljoin(url)
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}

# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

def run_crawler(keyword, page):

    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })
    c.crawl(MySpider, keyword, page)
    c.crawl(MySpider, keyword, int(page)+1)
    c.crawl(MySpider, keyword, int(page)+2)
    c.start()
    
# --- main ---

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page    = input("enter page: ")
        
    running = True
    while running:
    
        run_crawler(keyword, page)
        
        answer = input('Repeat [Y/n]? ').strip().lower()
        
        if answer == 'n':
            running = False
            
        import sys
        del sys.modules['twisted.internet.reactor']  # forget the stopped reactor
        from twisted.internet import reactor
        from twisted.internet import default
        default.install()            # install a fresh reactor for the next run

Method 3:

It seems you can use CrawlerRunner instead of CrawlerProcess - but I haven't tested it.
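
The difference, in short (a sketch based on the Scrapy docs; settings and MySpider stand for your own objects, and the two halves are alternatives, not one script):

from twisted.internet import reactor
from scrapy.crawler import CrawlerProcess, CrawlerRunner

# CrawlerProcess manages the reactor for you:
process = CrawlerProcess(settings)
process.crawl(MySpider)
process.start()   # starts the reactor and blocks until all crawls finish

# CrawlerRunner leaves the reactor to you:
runner = CrawlerRunner(settings)
d = runner.crawl(MySpider)
d.addBoth(lambda _: reactor.stop())
reactor.run()     # you start (and stop) the reactor yourself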

Based on the last example in the documentation, Running multiple spiders in the same process, I created code that runs the while-loop inside the reactor, so the reactor doesn't have to be stopped. It starts one spider first, then runs the next one, then asks whether to continue, and then it runs the first spider again, followed by the next. It doesn't run the spiders at the same time, but maybe that can be changed somehow (see the EDIT below).

import scrapy

class MySpider(scrapy.Spider):

    name = 'myspider'

    #start_urls = ['https://books.toscrape.com/']

    def __init__(self, keyword, page, *args, **kwargs):
        '''generate start_urls list'''
        super().__init__(*args, **kwargs)
        
        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)

        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            url = book.css('img::attr(src)').get()
            url = response.urljoin(url)
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}

# --- run without project and save in `output.csv` ---

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

@defer.inlineCallbacks
def run_crawler():

    running = True
    while running:

        # yielding each crawl's Deferred waits for that spider to finish,
        # so the spiders run one after another
        yield runner.crawl(MySpider, keyword, page)
        yield runner.crawl(MySpider, keyword, int(page)+1)
        yield runner.crawl(MySpider, keyword, int(page)+2)

        answer = input('Repeat [Y/n]? ').strip().lower()
    
        if answer == 'n':
            running = False
            reactor.stop()
            #return

# --- main ---

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page    = input("enter page: ")

    configure_logging()        
    
    runner = CrawlerRunner({
        'USER_AGENT': 'Mozilla/5.0',
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })

    run_crawler()

    reactor.run()     

EDIT:

The same, but now all crawlers run at the same time:

@defer.inlineCallbacks
def run_crawler():

    running = True
    while running:
    
        # schedule all three crawls without yielding, so they run at the same time
        runner.crawl(MySpider, keyword, page)
        runner.crawl(MySpider, keyword, int(page)+1)
        runner.crawl(MySpider, keyword, int(page)+2)

        # runner.join() returns a Deferred that fires when all running crawls finish
        d = runner.join()
        yield d

        answer = input('Repeat [Y/n]? ').strip().lower()
    
        if answer == 'n':
            running = False
            reactor.stop()
            #return