从 Python Numba CUDA 内核调用的加速 FFT

Question

从 Python Numba CUDA 内核调用的加速 FFT

我需要计算一个 256 个元素的 float64 信号的傅立叶变换。要求是我需要从 cuda.jitted 部分内部调用这些 FFT，并且必须在 25 秒内完成。唉，cuda.jit 编译的函数不允许调用外部库 => 我自己写的。唉，我的单核代码还是太慢了（在 Quadro P4000 上约为 250 微秒）。有没有更好的办法？

我创建了一个可以提供正确结果的单核 FFT 函数，但是速度太慢了 10 倍。我不明白如何充分利用多核。

---fft.py

from numba import cuda, boolean, void, int32, float32, float64, complex128
import math, sys, cmath

def _transform_radix2(vector, inverse, out):    
    n = len(vector) 
    levels = int32(math.log(float32(n))/math.log(float32(2)))

    assert 2**levels==n # error: Length is not a power of 2 

    #uncomment either Numba.Cuda or Numpy memory allocation, (intelligent conditional compileation??)               
    exptable = cuda.local.array(1024, dtype=complex128)   
    #exptable = np.zeros(1024, np.complex128)

    assert (n // 2) <= len(exptable)  # error: FFT length > MAXFFTSIZE

    coef = complex128((2j if inverse else -2j) * math.pi / n)   
    for i in range(n // 2):                       
        exptable[i] = cmath.exp(i * coef)       

    for i in range(n):
        x = i   
        y = 0
        for j in range(levels):
            y = (y << 1) | (x & 1)
            x >>= 1
        out[i] = vector[y]      

    size = 2
    while size <= n:
        halfsize = size // 2
        tablestep = n // size
        for i in range(0, n, size):
            k = 0
            for j in range(i, i + halfsize):
                temp = out[j + halfsize] * exptable[k]    
                out[j + halfsize] = out[j] - temp
                out[j] += temp
                k += tablestep
        size *= 2

    scale=float64(n if inverse else 1)
    for i in range(n):
        out[i]=out[i]/scale   # the inverse requires a scaling

# now create the Numba.cuda version to be called by a GPU
gtransform_radix2 = cuda.jit(device=True)(_transform_radix2)

---test.py

from numba import cuda, void, float64, complex128, boolean
import cupy as cp
import numpy as np
import timeit  
import fft

@cuda.jit(void(float64[:],boolean, complex128[:]))    
def fftbench(y, inverse, FT):
  Y  = cuda.local.array(256, dtype=complex128)

  for i in range(len(y)):
    Y[i]=complex128(y[i])    
  fft.gtransform_radix2(Y, False, FT)


str='\nbest [%2d/%2d] iterations, min:[%9.3f], max:[%9.3f], mean:[%9.3f], std:[%9.3f] usec'
a=[127.734375 ,130.87890625 ,132.1953125  ,129.62109375 ,118.6015625
 ,110.2890625  ,106.55078125 ,104.8203125  ,106.1875     ,109.328125
 ,113.5        ,118.6640625  ,125.71875    ,127.625      ,120.890625
 ,114.04296875 ,112.0078125  ,112.71484375 ,110.18359375 ,104.8828125
 ,104.47265625 ,106.65625    ,109.53515625 ,110.73828125 ,111.2421875
 ,112.28125    ,112.38671875 ,112.7734375  ,112.7421875  ,113.1328125
 ,113.24609375 ,113.15625    ,113.66015625 ,114.19921875 ,114.5
 ,114.5546875  ,115.09765625 ,115.2890625  ,115.7265625  ,115.41796875
 ,115.73828125 ,116.         ,116.55078125 ,116.5625     ,116.33984375
 ,116.63671875 ,117.015625   ,117.25       ,117.41015625 ,117.6640625
 ,117.859375   ,117.91015625 ,118.38671875 ,118.51171875 ,118.69921875
 ,118.80859375 ,118.67578125 ,118.78125    ,118.49609375 ,119.0078125
 ,119.09375    ,119.15234375 ,119.33984375 ,119.31640625 ,119.6640625
 ,119.890625   ,119.80078125 ,119.69140625 ,119.65625    ,119.83984375
 ,119.9609375  ,120.15625    ,120.2734375  ,120.47265625 ,120.671875
 ,120.796875   ,120.4609375  ,121.1171875  ,121.35546875 ,120.94921875
 ,120.984375   ,121.35546875 ,120.87109375 ,120.8359375  ,121.2265625
 ,121.2109375  ,120.859375   ,121.17578125 ,121.60546875 ,121.84375
 ,121.5859375  ,121.6796875  ,121.671875   ,121.78125    ,121.796875
 ,121.8828125  ,121.9921875  ,121.8984375  ,122.1640625  ,121.9375
 ,122.         ,122.3515625  ,122.359375   ,122.1875     ,122.01171875
 ,121.91015625 ,122.11328125 ,122.1171875  ,122.6484375  ,122.81640625
 ,122.33984375 ,122.265625   ,122.78125    ,122.44921875 ,122.34765625
 ,122.59765625 ,122.63671875 ,122.6796875  ,122.6171875  ,122.34375
 ,122.359375   ,122.7109375  ,122.83984375 ,122.546875   ,122.25390625
 ,122.06640625 ,122.578125   ,122.7109375  ,122.83203125 ,122.5390625
 ,122.2421875  ,122.06640625 ,122.265625   ,122.13671875 ,121.8046875
 ,121.87890625 ,121.88671875 ,122.2265625  ,121.63671875 ,121.14453125
 ,120.84375    ,120.390625   ,119.875      ,119.34765625 ,119.0390625
 ,118.4609375  ,117.828125   ,117.1953125  ,116.9921875  ,116.046875
 ,115.16015625 ,114.359375   ,113.1875     ,110.390625   ,108.41796875
 ,111.90234375 ,117.296875   ,127.0234375  ,147.58984375 ,158.625
 ,129.8515625  ,120.96484375 ,124.90234375 ,130.17578125 ,136.47265625
 ,143.9296875  ,150.24609375 ,141.         ,117.71484375 ,109.80859375
 ,115.24609375 ,118.44140625 ,120.640625   ,120.9921875  ,111.828125
 ,101.6953125  ,111.21484375 ,114.91015625 ,115.2265625  ,118.21875
 ,125.3359375  ,139.44140625 ,139.76953125 ,135.84765625 ,137.3671875
 ,141.67578125 ,139.53125    ,136.44921875 ,135.08203125 ,135.7890625
 ,137.58203125 ,138.7265625  ,154.33203125 ,172.01171875 ,152.24609375
 ,129.8046875  ,125.59375    ,125.234375   ,127.32421875 ,132.8984375
 ,147.98828125 ,152.328125   ,153.7734375  ,155.09765625 ,156.66796875
 ,159.0546875  ,151.83203125 ,138.91796875 ,138.0546875  ,140.671875
 ,143.48046875 ,143.99609375 ,146.875      ,146.7578125  ,141.15234375
 ,141.5        ,140.76953125 ,140.8828125  ,145.5625     ,150.78125
 ,148.89453125 ,150.02734375 ,150.70703125 ,152.24609375 ,148.47265625
 ,131.95703125 ,125.40625    ,123.265625   ,123.57421875 ,129.859375
 ,135.6484375  ,144.51171875 ,155.05078125 ,158.4453125  ,140.8125
 ,100.08984375 ,104.29296875 ,128.55078125 ,139.9921875  ,143.38671875
 ,143.69921875 ,137.734375   ,124.48046875 ,116.73828125 ,114.84765625
 ,113.85546875 ,117.45703125 ,122.859375   ,125.8515625  ,133.22265625
 ,139.484375   ,135.75       ,122.69921875 ,115.7734375  ,116.9375
 ,127.57421875] 
y1 =cp.zeros(len(a), cp.complex128) 
FT1=cp.zeros(len(a), cp.complex128)

for i in range(len(a)):
  y1[i]=a[i]  #convert to complex to feed the FFT

r=1000
series=sorted(timeit.repeat("fftbench(y1, False, FT1)",      number=1, repeat=r, globals=globals()))
series=series[0:r-5]
print(str % (len(series), r, 1e6*np.min(series), 1e6*np.max(series), 1e6*np.mean(series), 1e6*np.std(series)));



a faster implementation t<<25usec

归档时间：	6 年，5 月前
查看次数：	1726 次
最近记录：	6 年，5 月前

从 Python Numba CUDA 内核调用的加速 FFT

运行多个线程

记忆

为每个线程分配任务

同步