Scrapy：如何使用 meta 在方法之间传递 item

Question

Scrapy：如何使用 meta 在方法之间传递 item

我是 scrapy 和 python 的新手，我试图将 parse_quotes 中的项目 item['author'] 传递给下一个解析方法 parse_bio

我尝试了 request.meta 和 response.meta 方法，如 scrapy 文档中所示，但没有成功。请参阅下面的代码。

import scrapy
from tutorial.items import QuotesItem

class QuotesSpider(scrapy.Spider):
    """Spider that submits the login form and then scrapes quotes and author bios.

    ``parse`` submits the login form, ``parse_quotes`` yields one item per
    quote and follows each author link to ``parse_bio``, and ``parse_bio``
    yields the author details and follows the ``li.next`` link back to
    ``parse_quotes``.
    """

    # Must be indented into the class body; at column 0 the class has no
    # body and Python raises IndentationError.
    name = "quotes"

    start_urls = [
        'http://quotes.toscrape.com/login',
        #'http://quotes.toscrape.com/page/2',
    ]

    # Scraping a site with login
    # Important: Cookie settings must be "True" to keep the login session alive

    custom_settings = {'COOKIES_ENABLED': True}

    def parse(self, response):
        """Submit the login form found in ``response``.

        The page returned after the form submission is handed to
        ``parse_quotes``.
        """
        return scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'john', 'password': 'secret'},
            callback=self.parse_quotes,
        )

    def parse_quotes(self, response):
        """Yield one item per ``div.quote`` and a request per author-bio link."""
        for sel in response.css('div.quote'):
            item = QuotesItem()
            item['text'] = sel.css('span.text::text').get()
            item['author'] = sel.css('small.author::text').get()
            item['tags'] = sel.css('div.tags a.tag::text').getall()
            item['quotelink'] = sel.css('small.author ~ a[href*="goodreads.com"]::attr(href)').get()
            # NOTE(review): this stores the selector result itself, not the
            # href string -- confirm that is what the item field should hold.
            item['author_bio_link'] = sel.css('.author + a')
            yield item

            # follow the detail links @ shortcut
            # vertical crawling
            # NOTE: item['author'] is not forwarded to parse_bio here; the
            # request carries no extra data.
            for a in item['author_bio_link']:
                yield response.follow(a, callback=self.parse_bio)

    def parse_bio(self, response):
        """Yield an item with the author details scraped from ``response``."""
        item = QuotesItem()
        item['author_born'] = response.css('p span::text').getall()
        # keep only the first two matches (presumably birth date and location)
        item['author_born'] = item['author_born'][:2]
        item['author_bio'] = response.css('div.author-description ::text').get().strip()
        yield item

        # follow pagination links @ shortcut
        # horizontal crawling
        # NOTE(review): this loop runs on the response passed to parse_bio,
        # not on the quotes listing -- confirm `li.next` is expected there.
        for a in response.css('li.next a'):
            yield response.follow(a, callback=self.parse_quotes)

Run Code Online (Sandbox Code Playgroud)

我希望把 parse_quotes 中的 item['author'] 传递给 parse_bio，并在 parse_bio 中获取它。

Answer 1

vez*_*hik 8

我建议你这样使用 meta：

def parse_quotes(self, response):
    """Yield one item per quote, then one request per author-bio link.

    Each bio request carries the quote's author in ``meta`` under the key
    ``'author'`` so that ``parse_bio`` can read it from ``response.meta``.
    """
    for quote in response.css('div.quote'):
        item = QuotesItem()
        item['text'] = quote.css('span.text::text').get()
        item['author'] = quote.css('small.author::text').get()
        item['tags'] = quote.css('div.tags a.tag::text').getall()
        goodreads_href = 'small.author ~ a[href*="goodreads.com"]::attr(href)'
        item['quotelink'] = quote.css(goodreads_href).get()
        item['author_bio_link'] = quote.css('.author + a')
        yield item

        # vertical crawling: follow the detail links
        for bio_anchor in item['author_bio_link']:
            yield response.follow(
                bio_anchor,
                self.parse_bio,
                meta={'author': item['author']},  # <- you set it here
            )

def parse_bio(self, response):
    """Yield an author item from the bio page, then follow pagination links.

    The author's name is not scraped from this page; it is read from
    ``response.meta['author']``, which the request built in ``parse_quotes``
    carries.
    """
    item = QuotesItem()
    # keep only the first two matches (presumably birth date and location)
    item['author_born'] = response.css('p span::text').getall()[:2]
    item['author_data'] = response.meta.get('author')  # <- you get it here
    # default='' keeps .strip() from raising AttributeError when the
    # selector matches nothing (.get() would otherwise return None)
    item['author_bio'] = response.css('div.author-description ::text').get(default='').strip()
    yield item

    # follow pagination links @ shortcut
    # horizontal crawling
    for a in response.css('li.next a'):
        yield response.follow(a, callback=self.parse_quotes)

Run Code Online (Sandbox Code Playgroud)

归档时间：	6 年 9 个月前
查看次数：	6136 次
最近记录：	4 年 3 个月前