And*_*ani 6 python performance python-asyncio aiohttp
我正在使用 asyncio 和 aiohttp 编写一个异步爬虫。不知为何,在发出 150 个以上的请求之后,它开始变慢。第一个异步阶段(获取链接)运行良好;变慢的问题出现在第二个阶段(抓取章节):大约 200 个请求之后,单个请求就需要 1 分钟。有人知道原因吗?是我对 asyncio 或 aiohttp 的用法有误吗?编辑:我是在一台有 7 GB 内存的本地机器上运行的,所以我认为不是内存不足的问题。
import aiohttp
import asyncio
import async_timeout
import re
from lxml import html
import timeit
from os import makedirs,chmod
def slugify(title):
    """Return *title* reduced to ASCII letters, digits and hyphens.

    Every character that is not an ASCII letter, digit or space is removed,
    then each remaining space is replaced with a hyphen.
    """
    # The previous pattern ended in a stray "/" and therefore only removed
    # punctuation that happened to be followed by a slash.
    cleaned = re.sub(r"[^a-zA-Z0-9 ]+", "", title)
    return cleaned.replace(" ", "-")


# Root directory under which the per-novel folders are created.
basepath = ""
# Wall-clock reference used for the elapsed-time report at the end of the run.
start = timeit.default_timer()
# Novel title, normalised for use in directory names and in the site URL.
novel = slugify("")
# Filled in by scrape_links() with the scraped chapter data.
novel_url = {}
@asyncio.coroutine
def get(*args, **kwargs):
    """Perform a GET request and return the response text.

    Positional and keyword arguments are forwarded unchanged to
    ``aiohttp.request`` after the ``'GET'`` method argument.
    """
    resp = yield from aiohttp.request('GET', *args, **kwargs)
    body = yield from resp.text()
    return body
def scrape_links(page):
    """Parse *page* and merge the scraped entries into ``novel_url``.

    Two XPath queries are run against the parsed document; their results are
    paired positionally, with the second query supplying the keys and the
    first the values. Nothing is returned.
    """
    # NOTE(review): both XPath expressions are empty in this listing; they
    # look like redacted placeholders -- confirm the real selectors.
    tree = html.fromstring(page)
    hrefs = tree.xpath("")
    labels = tree.xpath("")
    # Build the whole mapping first so novel_url is updated in one step.
    found = {label: href for label, href in zip(labels, hrefs)}
    novel_url.update(found)
@asyncio.coroutine
def print_links(query):
    """Prepare the output folders for *query* and collect its chapter links.

    Creates ``<basepath>/<query>`` and its ``img`` subfolder (existing
    folders are accepted), sets mode ``0o765`` on both, downloads the index
    page for *query* and passes it to ``scrape_links``.

    Decorated with ``asyncio.coroutine`` so that it matches ``get`` and is
    explicitly marked as a coroutine for the event loop.
    """
    novel_dir = '%s/%s' % (basepath, query)
    img_dir = '%s/%s/img' % (basepath, query)

    # Make the directories, then apply chmod.
    makedirs(novel_dir, exist_ok=True)
    makedirs(img_dir, exist_ok=True)
    chmod(novel_dir, 0o765)
    chmod(img_dir, 0o765)

    url = 'https://www.examplesite.org/' + query
    page = yield from get(url, compress=True)
    # scrape_links() works by side effect and returns nothing worth keeping.
    scrape_links(page)
# Phase 1: fetch the index page and collect the chapter links.
loop = asyncio.get_event_loop()
f = asyncio.wait([print_links(novel)])
# asyncio.wait() only reports which tasks finished; it never re-raises what
# they raised. Without the check below a failed index download would pass
# silently and the chapter phase would start with no links.
done, _pending = loop.run_until_complete(f)
for task in done:
    # result() re-raises the exception stored in the task, if any.
    task.result()
##### now getting chapters from links array
def scrape_chapters(page, i):
    """Parse a chapter page and print its title.

    *i* is only referenced by the disabled file-writing code kept below.
    Nothing is returned.
    """
    # NOTE(review): the XPath expressions and the regex are empty in this
    # listing; they look like redacted placeholders -- confirm the real ones.
    tree = html.fromstring(page)
    heading = re.sub(r"", "", ''.join(tree.xpath("")))
    paragraphs = tree.xpath("")
    # Use this to join them instead of looping through if it doesn't work in
    # the epub maker:
    # paragraphs = '\n'.join(paragraphs)
    print(heading)
    # Disabled: write the chapter out as an HTML file.
    # file = open("%s/%s/%s-%s.html" % (basepath, novel, novel, i), 'w+')
    # file.write("<h1>%s</h1>" % heading)
    # for x in paragraphs:
    #     file.write("\n<p>%s</p>" % x)
    # file.close()
@asyncio.coroutine
def print_chapters(query):
    """Download one chapter page and hand it to ``scrape_chapters``.

    *query* is an indexable pair: element 0 is the chapter label and
    element 1 is the URL to fetch. Any ``"CH "`` substring is removed from
    the label before it is passed on as the chapter identifier.

    Decorated with ``asyncio.coroutine`` so that it matches ``get`` and is
    explicitly marked as a coroutine for the event loop.
    """
    label = str(query[0])
    chapter_count = re.sub(r"CH ", "", label)
    page = yield from get(query[1], compress=True)
    # scrape_chapters() works by side effect and returns nothing worth keeping.
    scrape_chapters(page, chapter_count)
# Phase 2: download every chapter collected in novel_url.
loop = asyncio.get_event_loop()
if novel_url:
    f = asyncio.wait([print_chapters(d) for d in novel_url.items()])
    done, _pending = loop.run_until_complete(f)
    for task in done:
        # asyncio.wait() never re-raises task exceptions, so report them
        # here; one bad chapter should not abort the rest of the run.
        error = task.exception()
        if error is not None:
            print("chapter download failed: %r" % (error,))
else:
    # asyncio.wait() raises ValueError when given no coroutines at all.
    print("no chapter links were collected; nothing to download")
stop = timeit.default_timer()
print("\n")
print(stop - start)
Run Code Online (Sandbox Code Playgroud)
小智 0
会不会是 aiohttp.ClientSession 的连接数限制造成的?
https://docs.aiohttp.org/en/latest/http_request_lifecycle.html#how-to-use-the-clientsession
可以尝试传入一个连接数限制更大的连接器(connector):https://docs.aiohttp.org/en/latest/client_advanced.html#limiting-connection-pool-size