我有同一个算法的两个版本。它最初是一个卷积,但我对它做了简化,以便找出瓶颈所在(注意每次循环迭代只访问一次全局内存):
__global__
void convolve (unsigned char * Md, float * Kd, unsigned char * Rd, int width, int height, int kernel_size, int tile_width, int channels){
int row = blockIdx.y*tile_width + threadIdx.y;
int col = blockIdx.x*tile_width + threadIdx.x;
int sum = 0;
int pixel;
int local_pixel;
int working_pixel;
int row_offset = (kernel_size/2)*(width+kernel_size-1);
int col_offset = kernel_size/2;
for(int color=0; color<channels; color++){
pixel = color*width*height + row*width + col;
local_pixel = color*(width+kernel_size-1)*(height+kernel_size-1) + row*(width+kernel_size-1) + col + row_offset + col_offset;
if(row < height …Run Code Online (Sandbox Code Playgroud)