How to make simple, fast requests with Python's requests module?

kat*_*lax 4 python performance beautifulsoup python-requests

I'm a beginner in Python, and I just want to scrape the site I need using the requests module and BeautifulSoup.

Here is my simple code:

import requests, time, re, json
from bs4 import BeautifulSoup as BS

url = "https://www.jobstreet.co.id/en/job-search/job-vacancy.php?ojs=6"

def list_jobs():
    try:
        with requests.Session() as s:
            st = time.time()
            s.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0'}
            req = s.get(url)
            soup = BS(req.text,'html.parser')
            attr = soup.findAll('div',class_='position-title header-text')
            pttr = r".?(.*)Rank=\d+"
            lists = {"status":200,"result":[]}
            for a in attr:
                sr = re.search(pttr, a.find("a")["href"])
                if sr:
                    title = a.find('a')['title'].replace("Lihat detil lowongan -","").replace("\r","").replace("\n","")
                    # use a different name here so it doesn't shadow the
                    # module-level `url` (shadowing it makes `s.get(url)`
                    # above raise UnboundLocalError)
                    job_url = a.find('a')['href']
                    lists["result"].append({
                        "title":title,
                        "url":job_url,
                        "detail":detail_jobs(job_url)
                    })
            print(json.dumps(lists, indent=4))
            end = time.time() - st
            print(f"\n{end} second")
    except Exception:
        # note: silently swallowing every error makes debugging hard
        pass

def detail_jobs(find_url):
    try:
        with requests.Session() as s:
            s.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0'}
            req = s.get(find_url)
            soup = BS(req.text,'html.parser')
            position = soup.find('h1',class_='job-position').text
            name = soup.find('div',class_='company_name').text.strip("\t")
            try:
                addrs = soup.find('div',class_='map-col-wraper').find('p',{'id':'address'}).text
            except Exception:
                addrs = "Unknown"
            try:
                loct = soup.find('span',{'id':'single_work_location'}).text
            except Exception:
                loct = soup.find('span',{'id':'multiple_work_location_list'}).find('span',{'class':'show'}).text        
            dests = soup.findAll('div',attrs={'id':'job_description'})
            for select in dests:
                txt = select.text if not select.text.startswith("\n") or not select.text.endswith("\n") else select.text.replace("\n","")
                result = {
                    "name":name,
                    "location":loct,
                    "position":position,
                    "description":txt,
                    "address":addrs
                }
                return result
    except Exception:
        pass

Both of them work fine, but it takes a very long time to show the results; the time is always 13-17 seconds or more.

I don't know how to make the requests faster.

I tried searching on Stack and Google, and people said to use asyncio, but that way is too hard for me.

If someone has a simple trick for how to increase the speed in a simple way, I would appreciate it.

Sorry for my bad English.

Pra*_*iel 6

Learning Python through projects such as web scraping is awesome; that is how I was introduced to Python. That said, there are three things you can do to speed up your scraping:

  1. Change the HTML parser to a faster one. html.parser is the slowest of them all. Try switching to 'lxml' or 'html5lib' (read https://www.crummy.com/software/BeautifulSoup/bs4/doc/, and see the short parser sketch after this list):

[Image: parser comparison table from the BeautifulSoup documentation]

  2. Drop the loops and regex, as they slow your script down. Just use the BeautifulSoup tools, text and strip, and find the right tags (see the script below).

  3. Since the bottleneck in web scraping is usually IO, waiting to get data from the webpage, using async or multithreading will boost speed. In the script below, I used multithreading, where the aim is to pull data from multiple pages at the same time.

So, if we know the maximum number of pages, we can chunk our requests into different ranges and pull them in batches :)
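
As a quick illustration of point 1, here is a minimal sketch that times the three parsers on the same page (the numbers will vary by machine and page, and lxml and html5lib must be installed separately, e.g. pip install lxml html5lib):

import timeit

import requests
from bs4 import BeautifulSoup

html = requests.get('https://www.jobstreet.co.id/en/job-search/job-vacancy.php?ojs=6').text

# Time how long each parser takes to build the soup for the same document
for parser in ('html.parser', 'lxml', 'html5lib'):
    t = timeit.timeit(lambda: BeautifulSoup(html, parser), number=10)
    print(f'{parser}: {t:.2f}s for 10 parses')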

Code example:

from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import requests
from bs4 import BeautifulSoup as bs

data = defaultdict(list)

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0'}

def get_data(data, headers, page=1):

    # Get start time
    start_time = datetime.now()
    url = f'https://www.jobstreet.co.id/en/job-search/job-vacancy/{page}/?src=20&srcr=2000&ojs=6'
    r = requests.get(url, headers=headers)

    # If the request is fine, proceed
    if r.ok:
        jobs = bs(r.content,'lxml').find('div',{'id':'job_listing_panel'})
        data['title'].extend([i.text.strip() for i in jobs.find_all('div',{'class':'position-title header-text'})])
        data['company'].extend([i.text.strip() for i in jobs.find_all('h3',{'class':'company-name'})])
        data['location'].extend([i['title'] for i in jobs.find_all('li',{'class':'job-location'})] )
        data['desc'].extend([i.text.strip() for i in jobs.find_all('ul',{'class':'list-unstyled hidden-xs '})])
    else:
        print('connection issues')
    print(f'Page: {page} | Time taken {datetime.now()-start_time}')
    return data


def multi_get_data(data,headers,start_page=1,end_page=20,workers=20):
    start_time = datetime.now()
    # Execute our get_data in multiple threads each having a different page number
    with ThreadPoolExecutor(max_workers=workers) as executor:
        [executor.submit(get_data, data=data,headers=headers,page=i) for i in range(start_page,end_page+1)]

    print(f'Page {start_page}-{end_page} | Time taken {datetime.now() - start_time}')
    return data


# Test page 10-15
k = multi_get_data(data,headers,start_page=10,end_page=15)

Result: [Image: console output showing the per-page timings]

An explanation of the multi_get_data function:

This function calls the get_data function in different threads, passing the desired arguments. At the moment, each thread gets a different page number to call. The maximum number of workers is set to 20, that is, 20 threads. You can increase or decrease it accordingly.
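
A variation on the same idea, reusing get_data and the imports from the script above: instead of every thread writing into one shared dictionary, each call can fill and return its own dict, and the main thread merges them as the futures complete (this sketch uses only the standard concurrent.futures API):

from concurrent.futures import as_completed

def multi_get_data_merged(headers, start_page=1, end_page=20, workers=20):
    merged = defaultdict(list)
    with ThreadPoolExecutor(max_workers=workers) as executor:
        # Submit one task per page; each task fills and returns its own dict
        futures = [executor.submit(get_data, defaultdict(list), headers, page=i)
                   for i in range(start_page, end_page + 1)]
        for future in as_completed(futures):
            for key, values in future.result().items():
                merged[key].extend(values)
    return merged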

We have already created the data variable, a defaultdict that holds lists. All the threads will fill this data. This variable can then be converted to JSON or a Pandas DataFrame :)
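
For instance, once the test call above has filled data, a minimal sketch of both conversions (assuming pandas is installed; pd.DataFrame needs all the lists in data to have the same length, which can break when a page is missing a field):

import json

import pandas as pd

# After multi_get_data has run, `data` holds all the scraped columns
print(json.dumps(data, indent=4))

# Or as a DataFrame; every list in `data` must have the same length
df = pd.DataFrame(data)
print(df.head())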

As you can see, we made 5 requests, each of them taking under 2 seconds, and yet the total is still under 2 seconds ;)
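
And since the question mentioned asyncio: here is a hedged, minimal sketch of the same idea with the third-party aiohttp package (pip install aiohttp). It is a little more code than the threaded version, which is why multithreading is the simpler first step:

import asyncio

import aiohttp
from bs4 import BeautifulSoup as bs

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0'}

async def fetch_page(session, page):
    # Fetch a single result page and return its HTML
    url = f'https://www.jobstreet.co.id/en/job-search/job-vacancy/{page}/?src=20&srcr=2000&ojs=6'
    async with session.get(url) as r:
        return await r.text()

async def main(start_page=10, end_page=15):
    # One session for all requests; gather runs the fetches concurrently
    async with aiohttp.ClientSession(headers=headers) as session:
        pages = await asyncio.gather(*(fetch_page(session, p)
                                       for p in range(start_page, end_page + 1)))
    for html in pages:
        jobs = bs(html, 'lxml').find('div', {'id': 'job_listing_panel'})
        titles = [i.text.strip() for i in jobs.find_all('div', {'class': 'position-title header-text'})]
        print(f'{len(titles)} titles on this page')

asyncio.run(main())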

Enjoy web scraping!

Update: 22/12/2019

We can also gain some speed by using a single session with a single headers update. So we don't have to start a session with every call.

from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import requests
from bs4 import BeautifulSoup as bs

data = defaultdict(list)

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0'}

# One shared session: headers are set once and TCP connections are reused
session = requests.Session()
session.headers.update(headers)

def get_data(data, session, page=1):

    # Get start time
    start_time = datetime.now()
    url = f'https://www.jobstreet.co.id/en/job-search/job-vacancy/{page}/?src=20&srcr=2000&ojs=6'
    r = session.get(url)

    # If the request is fine, proceed
    if r.ok:
        jobs = bs(r.content,'lxml').find('div',{'id':'job_listing_panel'})
        data['title'].extend([i.text.strip() for i in jobs.find_all('div',{'class':'position-title header-text'})])
        data['company'].extend([i.text.strip() for i in jobs.find_all('h3',{'class':'company-name'})])
        data['location'].extend([i['title'] for i in jobs.find_all('li',{'class':'job-location'})])
        data['desc'].extend([i.text.strip() for i in jobs.find_all('ul',{'class':'list-unstyled hidden-xs '})])
    else:
        print('connection issues')
    print(f'Page: {page} | Time taken {datetime.now()-start_time}')
    return data


def multi_get_data(data,session,start_page=1,end_page=20,workers=20):
    start_time = datetime.now()
    # Execute our get_data in multiple threads, each with a different page number
    with ThreadPoolExecutor(max_workers=workers) as executor:
        [executor.submit(get_data, data=data,session=session,page=i) for i in range(start_page,end_page+1)]

    print(f'Page {start_page}-{end_page} | Time taken {datetime.now() - start_time}')
    return data


# Test page 10-15
k = multi_get_data(data,session,start_page=10,end_page=15)
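
One caveat with a shared session and 20 workers: requests keeps a pool of 10 connections per host by default, so some threads will wait for a free connection. Here is a small sketch of one way to enlarge the pool, using requests' HTTPAdapter:

from requests.adapters import HTTPAdapter

session = requests.Session()
# requests defaults to 10 pooled connections per host; match the worker count
session.mount('https://', HTTPAdapter(pool_connections=20, pool_maxsize=20))
session.headers.update(headers)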