在python中加速随机加权选择而无需替换

Nik*_*Nik 5 python random numpy python-3.x

我想采样~10?来自约 10 人的次数?没有替换和权重的整数,每次选择 10 个元素。每次采样后,我都会改变权重。我在以下脚本中为两种方法(python3 和 numpy)计时。这两种方法对我来说似乎都非常缓慢,你有没有办法加快速度?

import numpy as np
import random

@profile
def test_choices():
    population = list(range(10**7))
    weights = np.random.uniform(size=10**7)
    np_weights = np.array(weights)

    def numpy_choice():
        np_w = np_weights / sum(np_weights)
        c = np.random.choice(population, size=10, replace=False, p=np_w)

    def python_choice():
        c = []
        while len(c) < 10:
            c += random.choices(population=population, weights=weights, k=10 - len(c))
            c = list(set(c))

    for i in range(10**1):

        numpy_choice()
        python_choice()

        add_weight = np.random.uniform()
        random_element = random.randint(0, 10**7)
        weights[random_element] += add_weight
        np_weights[random_element] += add_weight


test_choices()
Run Code Online (Sandbox Code Playgroud)

随着计时器结果:

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    24        10   20720062.0 2072006.2     56.6          numpy_choice()
    25        10   15593925.0 1559392.5     42.6          python_choice()
Run Code Online (Sandbox Code Playgroud)

max*_*111 3

这只是对 jdhesa 答案的评论。问题是考虑仅增加一个权重的情况是否有用 -> 是的!

\n

例子

\n
@nb.njit(parallel=True)\ndef numba_choice_opt(population, weights, k,wc,b_full_wc_calc,ind,value):\n    # Get cumulative weights\n    if b_full_wc_calc:\n        acc=0\n        for i in range(weights.shape[0]):\n            acc+=weights[i]\n            wc[i]=acc\n    #Increase only one weight (faster than recalculating the cumulative  weight)\n    else:\n        weights[ind]+=value\n        for i in nb.prange(ind,wc.shape[0]):\n            wc[i]+=value\n\n    # Total of weights\n    m = wc[-1]\n    # Arrays of sample and sampled indices\n    sample = np.empty(k, population.dtype)\n    sample_idx = np.full(k, -1, np.int32)\n    # Sampling loop\n    i = 0\n    while i < k:\n        # Pick random weight value\n        r = m * np.random.rand()\n        # Get corresponding index\n        idx = np.searchsorted(wc, r, side='right')\n        # Check index was not selected before\n        # If not using Numba you can just do `np.isin(idx, sample_idx)`\n        for j in range(i):\n            if sample_idx[j] == idx:\n                continue\n        # Save sampled value and index\n        sample[i] = population[idx]\n        sample_idx[i] = population[idx]\n        i += 1\n    return sample\n
Run Code Online (Sandbox Code Playgroud)\n

例子

\n
np.random.seed(0)\npopulation = np.random.randint(100, size=1_000_000)\nweights = np.random.rand(len(population))\nk = 10\nwc = np.empty_like(weights)\n\n#Initial calculation \n%timeit numba_choice_opt(population, weights, k,wc,True,0,0)\n#1.41 ms \xc2\xb1 9.21 \xc2\xb5s per loop (mean \xc2\xb1 std. dev. of 7 runs, 1 loop each)\n\n#Increase weight[100] by 3 and calculate\n%timeit numba_choice_opt(population, weights, k,wc,False,100,3)\n#213 \xc2\xb5s \xc2\xb1 6.06 \xc2\xb5s per loop (mean \xc2\xb1 std. dev. of 7 runs, 1000 loops each)\n\n#For comparison\n#Please note that it is the memory allcocation of wc which makes\n#it so much slower than the initial calculation from above\n%timeit numba_choice(population, weights, k)\n#4.23 ms \xc2\xb1 64.9 \xc2\xb5s per loop (mean \xc2\xb1 std. dev. of 7 runs, 1 loop each)\n
Run Code Online (Sandbox Code Playgroud)\n