如何告诉 python scrapy 移动到下一个起始 URL

mad*_*med 5 python scrapy web-scraping

我编写了一个 scrapy 蜘蛛,它有许多 start_url 并在这些 url 中提取电子邮件地址。该脚本需要很长时间才能执行,因此我想告诉 Scrapy 在发现电子邮件时停止抓取特定网站并移至下一个网站。

编辑:添加代码

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
import csv
from urlparse import urlparse

from entreprise.items import MailItem

class MailSpider(CrawlSpider):
    """Crawl every start URL and extract e-mail addresses from page text.

    NOTE(review): this is the problematic version from the question -- it
    has no mechanism to stop crawling a site once an e-mail is found.
    """
    name = "mail"
    start_urls = []
    allowed_domains = []
    # Runs at class-definition (import) time: populate start_urls and
    # allowed_domains from column 5 of the CSV file.
    with open('scraped_data.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)  # skip the CSV header row
        for row in reader:
            url = row[5].strip()
            if (url.strip() != ""):
                start_urls.append(url)
                fragments = urlparse(url).hostname.split(".")
                # Keep the last 3 labels when the second-level label is
                # short (e.g. "example.co.uk"), otherwise the last 2
                # (e.g. "example.com").
                hostname = ".".join(len(fragments[-2]) < 4 and fragments[-3:] or fragments[-2:])
                allowed_domains.append(hostname)

    # Two rules with identical extractors; the second one (follow is
    # False by default when a callback is given) is redundant, since the
    # first already follows links and calls parse_item.
    rules = [
        Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item'),
        Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item')
    ]

    def parse_item(self, response):
        """Return a MailItem for each e-mail-like string in the body text."""
        hxs = HtmlXPathSelector(response)
        items = []
        for mail in hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+'):
            item = MailItem()
            item['url'] = response.url
            item['mail'] = mail
            items.append(item)
        return items
Run Code Online (Sandbox Code Playgroud)

ale*_*cxe 2

这个想法是使用 start_requests 方法来决定接下来要抓取哪些 url。此外,我们会在类级别的 parsed_hostnames 集合中记录那些已经解析出电子邮件的主机名。

另外,我还更改了从 url 获取主机名的方式——现在直接使用 urlparse 来提取。

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import csv
from urlparse import urlparse


class MailItem(Item):
    """A scraped result: one e-mail address and the page it was found on."""
    mail = Field()
    url = Field()


class MailSpider(CrawlSpider):
    """Crawl each start URL, stopping work on a site once an e-mail is found.

    start_requests() is a generator, so it is consumed lazily while the
    crawl runs: hostnames recorded in parsed_hostnames by parse_item()
    cause later CSV rows for the same host to be skipped, and the host is
    removed from the allowed/extractable domain sets so pending links to
    it are dropped.
    """
    name = "mail"

    # Hostnames for which at least one e-mail has already been scraped.
    parsed_hostnames = set()
    allowed_domains = []

    rules = [
        Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item'),
        Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item')
    ]

    def start_requests(self):
        """Yield one request per CSV row, skipping already-parsed hosts."""
        with open('scraped_data.csv', 'rb') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            next(reader)  # skip the CSV header row

            for row in reader:
                url = row[5].strip()
                if url:
                    hostname = urlparse(url).hostname
                    if hostname not in self.parsed_hostnames:
                        if hostname not in self.allowed_domains:
                            self.allowed_domains.append(hostname)
                            self.rules[0].link_extractor.allow_domains.add(hostname)
                            self.rules[1].link_extractor.allow_domains.add(hostname)

                        yield self.make_requests_from_url(url)
                    else:
                        # BUGFIX: the original removed unconditionally,
                        # which raises ValueError/KeyError when the same
                        # hostname occurs in several CSV rows and was
                        # already removed on an earlier iteration.
                        if hostname in self.allowed_domains:
                            self.allowed_domains.remove(hostname)
                        self.rules[0].link_extractor.allow_domains.discard(hostname)
                        self.rules[1].link_extractor.allow_domains.discard(hostname)

    def parse_item(self, response):
        """Collect e-mail addresses from body text and mark the host parsed."""
        hxs = HtmlXPathSelector(response)
        items = []
        for mail in hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+'):
            item = MailItem()
            item['url'] = response.url
            item['mail'] = mail
            items.append(item)

        # Record the host so start_requests() skips its remaining rows
        # and strips it from the link extractors.
        hostname = urlparse(response.url).hostname
        self.parsed_hostnames.add(hostname)

        return items
Run Code Online (Sandbox Code Playgroud)

理论上应该有效。希望有帮助。