我编写了一个CUDA4 Bayer demosaicing例程,但它比单线程CPU代码慢,运行在16核GTS250上.
Blocksize是(16,16),图像变暗是16的倍数 - 但改变它不会改善它.
我做了什么明显愚蠢的事吗?
--------------- calling routine ------------------
uchar4 *d_output;
size_t num_bytes;
cudaGraphicsMapResources(1, &cuda_pbo_resource, 0);
cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource);
// Do the conversion, leave the result in the PBO fordisplay
kernel_wrapper( imageWidth, imageHeight, blockSize, gridSize, d_output );
cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0);
--------------- cuda -------------------------------
texture<uchar, 2, cudaReadModeElementType> tex;
cudaArray *d_imageArray = 0;
__global__ void convertGRBG(uchar4 *d_output, uint width, uint height)
{
uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
uint i = …Run Code Online (Sandbox Code Playgroud)