在 CUDA 内核中搜索有序数组

Fil*_*ffa 4 arrays cuda

我正在编写一个 CUDA 内核,每个线程必须完成以下任务:假设我有一个存储在共享内存中的无符号整数有序数组an第一个始终为 0),每个线程必须找到数组索引,i使得a[i]\ xe2\x89\xa4threadIdx.xa[i + 1]>threadIdx.x

\n\n

一个天真的解决方案可能是:

\n\n
for (i = 0; i < n - 1; i++)\n    if (a[i + 1] > threadIdx.x) break;\n
Run Code Online (Sandbox Code Playgroud)\n\n

但我认为这不是最好的方法......任何人都可以提出更好的建议吗?

\n

tal*_*ies 5

和 Robert 一样,我认为二分搜索必须比 na\xc3\xafve 循环更快——与 O(N) 相比,二分搜索的操作计数上限是 O(log(n))对于循环。

\n\n

我的实现非常简单:

\n\n
#include <iostream>\n#include <climits>\n#include <assert.h>\n\n__device__  __host__\nint midpoint(int a, int b)\n{\n    return a + (b-a)/2;\n}\n\n__device__ __host__\nint eval(int A[], int i, int val, int imin, int imax)\n{\n\n    int low = (A[i] <= val);\n    int high = (A[i+1] > val);\n\n    if (low && high) {\n        return 0;\n    } else if (low) {\n        return -1;\n    } else {\n        return 1;\n    }\n}\n\n__device__ __host__\nint binary_search(int A[], int val, int imin, int imax)\n{\n    while (imax >= imin) {\n        int imid = midpoint(imin, imax);\n        int e = eval(A, imid, val, imin, imax);\n        if(e == 0) {\n            return imid;\n        } else if (e < 0) {\n            imin = imid;\n        } else {         \n            imax = imid;\n        }\n    }\n\n    return -1;\n}\n\n\n__device__ __host__\nint linear_search(int A[], int val, int imin, int imax)\n{\n    int res = -1;\n    for(int i=imin; i<(imax-1); i++) {\n        if (A[i+1] > val) {\n            res = i;\n            break;\n        }\n    }\n\n    return res;\n}\n\ntemplate<int version>\n__global__\nvoid search(int * source, int * result, int Nin, int Nout)\n{\n    extern __shared__ int buff[];\n    int tid = threadIdx.x + blockIdx.x*blockDim.x;\n\n    int val = INT_MAX;\n    if (tid < Nin) val = source[threadIdx.x];\n    buff[threadIdx.x] = val;\n    __syncthreads();\n\n    int res;\n    switch(version) {\n\n        case 0:\n        res = binary_search(buff, threadIdx.x, 0, blockDim.x);\n        break;\n\n        case 1:\n        res = linear_search(buff, threadIdx.x, 0, blockDim.x);\n        break;\n    }\n\n    if (tid < Nout) result[tid] = res; \n}\n\nint main(void)\n{\n    const int inputLength = 128000;\n    const int isize = inputLength * sizeof(int);\n    const int outputLength = 256;\n    const int osize = outputLength * sizeof(int);\n\n    int * hostInput = new int[inputLength];\n    int * hostOutput = new int[outputLength];\n    int * deviceInput;\n    int * deviceOutput;\n\n    for(int i=0; i<inputLength; i++) {\n        hostInput[i] = -200 + 5*i;\n    }\n\n    cudaMalloc((void**)&deviceInput, isize);\n    cudaMalloc((void**)&deviceOutput, osize);\n\n    cudaMemcpy(deviceInput, hostInput, isize, cudaMemcpyHostToDevice);\n\n    dim3 DimBlock(256, 1, 1);\n    dim3 DimGrid(1, 1, 1);\n    DimGrid.x = (outputLength / DimBlock.x) + \n                ((outputLength % DimBlock.x > 0) ? 1 : 0); \n    size_t shmsz = DimBlock.x * sizeof(int);\n\n    for(int i=0; i<5; i++) {\n        search<1><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput, \n                inputLength, outputLength);\n    }\n\n    for(int i=0; i<5; i++) {\n        search<0><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput,\n                inputLength, outputLength);\n    }\n\n    cudaMemcpy(hostOutput, deviceOutput, osize, cudaMemcpyDeviceToHost);\n\n    for(int i=0; i<outputLength; i++) {\n        int idx = hostOutput[i];\n        int tidx = i % DimBlock.x;\n        assert( (hostInput[idx] <= tidx) && (tidx < hostInput[idx+1]) );\n    } \n    cudaDeviceReset();\n\n    return 0;\n}\n
Run Code Online (Sandbox Code Playgroud)\n\n

与循环相比,速度提高了大约五倍:

\n\n
>nvprof a.exe\n======== NVPROF is profiling a.exe...\n======== Command: a.exe\n======== Profiling result:\n Time(%)      Time   Calls       Avg       Min       Max  Name\n   60.11  157.85us       1  157.85us  157.85us  157.85us  [CUDA memcpy HtoD]\n   32.58   85.55us       5   17.11us   16.63us   19.04us  void search<int=1>(int*, int*, int, int)\n    6.52   17.13us       5    3.42us    3.35us    3.73us  void search<int=0>(int*, int*, int, int)\n    0.79    2.08us       1    2.08us    2.08us    2.08us  [CUDA memcpy DtoH]\n
Run Code Online (Sandbox Code Playgroud)\n\n

我确信聪明的人可以做得更好。但也许这至少给了你一些想法。

\n