我正在编写一个 CUDA 内核,每个线程必须完成以下任务:假设有一个存储在共享内存中的、包含 n 个无符号整数的有序数组 a(第一个元素始终为 0),每个线程必须找到数组索引 i,使得 a[i] ≤ threadIdx.x 且 a[i + 1] > threadIdx.x。
一个天真的解决方案可能是:

for (i = 0; i < n - 1; i++)
    if (a[i + 1] > threadIdx.x) break;

但我认为这不是最好的方法……有人能提出更好的建议吗?
和 Robert 一样,我认为二分搜索肯定比朴素的线性循环更快——二分搜索的操作次数上限是 O(log n),而线性循环是 O(N)。
\n\n我的实现非常简单:
#include <iostream>
#include <climits>
#include <assert.h>

// Abort with file/line context on any CUDA API error.  Kernel launches
// themselves return nothing, so launch errors are picked up via
// cudaGetLastError() right after each launch.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::cerr << "CUDA error " << __FILE__ << ":" << __LINE__       \
                      << ": " << cudaGetErrorString(err_) << std::endl;     \
            abort();                                                        \
        }                                                                   \
    } while (0)

// Overflow-safe midpoint of [a, b].
__device__ __host__
int midpoint(int a, int b)
{
    return a + (b - a) / 2;
}

// Classify probe index i against val:
//    0 -> A[i] <= val < A[i+1]   (i is the answer)
//   -1 -> A[i+1] <= val          (answer lies to the right of i)
//    1 -> A[i] > val             (answer lies to the left of i)
// Caller must guarantee that i+1 is a valid index into A.
// imin/imax are unused; kept for signature compatibility.
__device__ __host__
int eval(int A[], int i, int val, int imin, int imax)
{
    int low  = (A[i] <= val);
    int high = (A[i + 1] > val);

    if (low && high) {
        return 0;
    } else if (low) {
        return -1;
    } else {
        return 1;
    }
}

// Binary search over sorted A for i in [imin, imax] such that
// A[i] <= val < A[i+1].  A[imax + 1] must be a valid element because
// eval() reads one past the probe index.  Returns -1 if no bracketing
// index exists in the range.
__device__ __host__
int binary_search(int A[], int val, int imin, int imax)
{
    while (imax >= imin) {
        int imid = midpoint(imin, imax);
        int e = eval(A, imid, val, imin, imax);
        if (e == 0) {
            return imid;
        } else if (e < 0) {
            // e < 0 means A[imid+1] <= val, so every candidate answer
            // is strictly greater than imid.  The original "imin = imid"
            // makes no progress when imax == imin + 1 (imid == imin)
            // and can spin forever; imid + 1 is both safe and makes
            // the range strictly smaller every iteration.
            imin = imid + 1;
        } else {
            imax = imid;
        }
    }

    return -1;
}

// Linear-scan reference: first i in [imin, imax-2] with A[i+1] > val,
// or -1 if none.  It never tests A[i] <= val; correctness relies on A
// being sorted with A[imin] <= val.
__device__ __host__
int linear_search(int A[], int val, int imin, int imax)
{
    int res = -1;
    for (int i = imin; i < (imax - 1); i++) {
        if (A[i + 1] > val) {
            res = i;
            break;
        }
    }

    return res;
}

// Each block stages the first blockDim.x entries of source into shared
// memory, then every thread finds i with buff[i] <= threadIdx.x and
// buff[i+1] > threadIdx.x.  version 0 = binary search, 1 = linear scan.
// Launch with blockDim.x * sizeof(int) bytes of dynamic shared memory.
template <int version>
__global__
void search(int *source, int *result, int Nin, int Nout)
{
    extern __shared__ int buff[];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    // Pad the tail with INT_MAX so every searched value has an upper
    // bound.  Guard on threadIdx.x — that is the index actually used
    // for the load (the original guarded on tid while indexing with
    // threadIdx.x, which are different for blockIdx.x > 0).
    int val = INT_MAX;
    if (threadIdx.x < Nin) val = source[threadIdx.x];
    buff[threadIdx.x] = val;
    __syncthreads();

    int res;
    switch (version) {

    case 0:
        // imax = blockDim.x - 2: eval() reads A[i+1], so the largest
        // legal probe must keep i+1 <= blockDim.x - 1.  (The original
        // passed blockDim.x, allowing a read one past the end of the
        // shared buffer.)
        res = binary_search(buff, threadIdx.x, 0, blockDim.x - 2);
        break;

    case 1:
        res = linear_search(buff, threadIdx.x, 0, blockDim.x);
        break;
    }

    if (tid < Nout) result[tid] = res;
}

int main(void)
{
    const int inputLength = 128000;
    const int isize = inputLength * sizeof(int);
    const int outputLength = 256;
    const int osize = outputLength * sizeof(int);

    int *hostInput = new int[inputLength];
    int *hostOutput = new int[outputLength];
    int *deviceInput;
    int *deviceOutput;

    // Strictly increasing data, so every thread index is bracketed.
    for (int i = 0; i < inputLength; i++) {
        hostInput[i] = -200 + 5 * i;
    }

    CUDA_CHECK(cudaMalloc((void **)&deviceInput, isize));
    CUDA_CHECK(cudaMalloc((void **)&deviceOutput, osize));

    CUDA_CHECK(cudaMemcpy(deviceInput, hostInput, isize,
                          cudaMemcpyHostToDevice));

    dim3 DimBlock(256, 1, 1);
    dim3 DimGrid(1, 1, 1);
    DimGrid.x = (outputLength + DimBlock.x - 1) / DimBlock.x;  // ceil-div
    size_t shmsz = DimBlock.x * sizeof(int);

    // Run each variant several times so the profiler reports stable
    // per-launch averages.
    for (int i = 0; i < 5; i++) {
        search<1><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput,
                                                inputLength, outputLength);
        CUDA_CHECK(cudaGetLastError());
    }

    for (int i = 0; i < 5; i++) {
        search<0><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput,
                                                inputLength, outputLength);
        CUDA_CHECK(cudaGetLastError());
    }

    CUDA_CHECK(cudaMemcpy(hostOutput, deviceOutput, osize,
                          cudaMemcpyDeviceToHost));

    // Verify every thread found its bracketing interval.  Check for -1
    // first: the original indexed hostInput[idx] unconditionally, which
    // is undefined behavior when the search fails.
    for (int i = 0; i < outputLength; i++) {
        int idx = hostOutput[i];
        int tidx = i % DimBlock.x;
        assert(idx >= 0);
        assert((hostInput[idx] <= tidx) && (tidx < hostInput[idx + 1]));
    }

    delete[] hostInput;
    delete[] hostOutput;
    CUDA_CHECK(cudaFree(deviceInput));
    CUDA_CHECK(cudaFree(deviceOutput));
    cudaDeviceReset();

    return 0;
}
与线性循环相比,速度提高了大约五倍:

>nvprof a.exe
======== NVPROF is profiling a.exe...
======== Command: a.exe
======== Profiling result:
 Time(%)      Time   Calls       Avg       Min       Max  Name
   60.11  157.85us       1  157.85us  157.85us  157.85us  [CUDA memcpy HtoD]
   32.58   85.55us       5   17.11us   16.63us   19.04us  void search<int=1>(int*, int*, int, int)
    6.52   17.13us       5    3.42us    3.35us    3.73us  void search<int=0>(int*, int*, int, int)
    0.79    2.08us       1    2.08us    2.08us    2.08us  [CUDA memcpy DtoH]

我确信聪明的人还可以做得更好,但也许这至少能给你一些启发。
\n