xlm*_*ter 5 python caching scrapy slowdown
我有一个刮刀机器人,效果很好。但随着时间的推移,刮擦时速度会下降。我添加了concurrent request, download_delay:0,'AUTOTHROTTLE_ENABLED':False但结果是一样的。它开始时速度很快,但速度会变慢。我想这与缓存有关,但不知道我是否必须清理缓存,或者为什么会这样?代码如下希望听到评论;
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd
import scrapy_xlsx
itemList=[]
class plateScraper(scrapy.Spider):
name = 'scrapePlate'
allowed_domains = ['dvlaregistrations.dvla.gov.uk']
FEED_EXPORTERS = {'xlsx': 'scrapy_xlsx.XlsxItemExporter'}
custom_settings = {'FEED_EXPORTERS' :FEED_EXPORTERS,'FEED_FORMAT': 'xlsx','FEED_URI': 'output_r00.xlsx', 'LOG_LEVEL':'INFO','DOWNLOAD_DELAY': 0,'CONCURRENT_ITEMS':300,'CONCURRENT_REQUESTS':30,'AUTOTHROTTLE_ENABLED':False}
def start_requests(self):
df=pd.read_excel('data.xlsx')
columnA_values=df['PLATE']
for row in columnA_values:
global plate_num_xlsx
plate_num_xlsx=row
base_url =f"https://dvlaregistrations.dvla.gov.uk/search/results.html?search={plate_num_xlsx}&action=index&pricefrom=0&priceto=&prefixmatches=¤tmatches=&limitprefix=&limitcurrent=&limitauction=&searched=true&openoption=&language=en&prefix2=Search&super=&super_pricefrom=&super_priceto="
url=base_url
yield scrapy.Request(url,callback=self.parse, cb_kwargs={'plate_num_xlsx': plate_num_xlsx})
def parse(self, response, plate_num_xlsx=None):
plate = response.xpath('//div[@class="resultsstrip"]/a/text()').extract_first()
price = response.xpath('//div[@class="resultsstrip"]/p/text()').extract_first()
try:
a = plate.replace(" ", "").strip()
if plate_num_xlsx == plate.replace(" ", "").strip():
item = {"plate": plate_num_xlsx, "price": price.strip()}
itemList.append(item)
print(item)
yield item
else:
item = {"plate": plate_num_xlsx, "price": "-"}
itemList.append(item)
print(item)
yield item
except:
item = {"plate": plate_num_xlsx, "price": "-"}
itemList.append(item)
print(item)
yield item
process = CrawlerProcess()
process.crawl(plateScraper)
process.start()
import winsound
winsound.Beep(555,333)
Run Code Online (Sandbox Code Playgroud)
编辑:“日志统计”
{'downloader/request_bytes': 1791806,
'downloader/request_count': 3459,
'downloader/request_method_count/GET': 3459,
'downloader/response_bytes': 38304184,
'downloader/response_count': 3459,
'downloader/response_status_count/200': 3459,
'dupefilter/filtered': 6,
'elapsed_time_seconds': 3056.810985,
'feedexport/success_count/FileFeedStorage': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2023, 1, 27, 22, 31, 17, 17188),
'httpcompression/response_bytes': 238767410,
'httpcompression/response_count': 3459,
'item_scraped_count': 3459,
'log_count/INFO': 61,
'log_count/WARNING': 2,
'response_received_count': 3459,
'scheduler/dequeued': 3459,
'scheduler/dequeued/memory': 3459,
'scheduler/enqueued': 3459,
'scheduler/enqueued/memory': 3459,
'start_time': datetime.datetime(2023, 1, 27, 21, 40, 20, 206203)}
2023-01-28 02:31:17 [scrapy.core.engine] INFO: Spider closed (finished)
Process finished with exit code 0
Run Code Online (Sandbox Code Playgroud)
乍一看,代码看起来不错。然而,我在这里看到了可以提高抓取速度的几点:
CONCURRENT_REQUESTS_PER_DOMAIN设置 - 因为它没有更改,所以保留默认值 8(同时不超过 8 个请求)。建议将其增加到 的值CONCURRENT_REQUESTS。CONCURRENT_ITEMS设置 - 我们有几份报告称增加此设置的值可能会导致性能下降 - /scrapy/issues/5182。建议保持默认。.xlsx文件 - 是包含 xml 文档的压缩存档。它使用openpyxl将所有文件内容和已解析的 xml 树保存在 RAM 内存中。.xlsx由于添加每个新行,每个添加的行都会增加创建文件的 xml 树的大小- 可能会占用更多的 CPU 资源。- 建议将抓取速度与 scrapy 内置提要导出器(csv 或 json 行)的使用进行比较。