CUDA：矩阵归约（最大值/最小值）函数的实现

Question

CUDA：矩阵归约（最大值/最小值）函数的实现

我正在研究矩阵上的归约函数：需要找出矩阵中的最大值。我已经实现了一个求数组最大值的函数，把它改成矩阵版本本应很简单，但我无法让它正常工作。我想知道我的做法是否正确。下面是两个版本的代码：

对于数组：

// Block-wise maximum reduction over a float array.
//
// Each thread loads d_in[blockDim.x * blockIdx.x + threadIdx.x] into dynamic
// shared memory, the block combines those values with fmaxf, and thread 0
// stores the block's maximum in d_out[blockIdx.x].
//
// Launch requirements:
//   - dynamic shared memory: blockDim.x * sizeof(float) bytes
//   - d_in must hold at least gridDim.x * blockDim.x floats; the kernel takes
//     no length parameter, so the load cannot be bounds-checked here
//   - d_out must hold at least gridDim.x floats
// Any blockDim.x >= 1 works, not only powers of two.
__global__
void reduce_kernal_shared_mem(float *d_in, float *d_out){
    const int tindx = threadIdx.x;
    const int indx = blockDim.x * blockIdx.x + tindx;

    extern __shared__ float sh_in[];

    // Every thread writes its own slot, so no separate initialisation is needed.
    sh_in[tindx] = d_in[indx];
    __syncthreads();

    // Fold the upper part of the live range onto the lower part. Rounding the
    // split point up keeps the middle slot alive when the live range has odd
    // length, so no element is dropped for non-power-of-two block sizes.
    for(int active = blockDim.x; active > 1; ){
        const int half = (active + 1) / 2;
        if(tindx + half < active){
            sh_in[tindx] = fmaxf(sh_in[tindx], sh_in[tindx + half]);
        }
        // The loop bounds depend only on blockDim.x, so every thread of the
        // block reaches this barrier the same number of times.
        __syncthreads();
        active = half;
    }

    if(tindx == 0){
        d_out[blockIdx.x] = sh_in[0];
    }
}

// Two-pass reduction driver.
//
// Pass 1 launches 1024 blocks of 1024 threads from d_in into d_int; pass 2
// launches a single block of 1024 threads from d_int into d_out.
// is_shared selects reduce_kernal_shared_mem (with 1024 floats of dynamic
// shared memory per block); otherwise reduce_kernal is used.
// ARRAY_SIZE is accepted but not used by this function.
void reduce(float *d_in, float *d_int, float *d_out, const int ARRAY_SIZE, bool is_shared){
    if(is_shared){
        reduce_kernal_shared_mem<<<1024, 1024, 1024 * sizeof(float)>>>(d_in, d_int);
        reduce_kernal_shared_mem<<<1, 1024, 1024 * sizeof(float)>>>(d_int, d_out);
        return;
    }

    reduce_kernal<<<1024, 1024>>>(d_in, d_int);
    reduce_kernal<<<1, 1024>>>(d_int, d_out);
}

Run Code Online (Sandbox Code Playgroud)

对于矩阵：

// Per-block maximum of a row-major numRows x numCols float matrix.
//
// Each thread loads one element into dynamic shared memory, the block reduces
// those values with fmax, and thread (0,0) writes the block result to
// d_out[blockIdx.y * gridDim.x + blockIdx.x].
// The launch must provide blockDim.x * blockDim.y floats of dynamic shared memory.
__global__
void get_max(const float* d_logLuminance, float *d_out, int numRows, int numCols){
   // Global matrix coordinates of this thread.
   int col = blockIdx.x * blockDim.x + threadIdx.x;
   int row = blockIdx.y * blockDim.y + threadIdx.y;
   // Coordinates of this thread inside its block.
   int c_t = threadIdx.x;
   int r_t = threadIdx.y;
   // Flattened indices: into the matrix, and into the block's shared array.
   int pos_1D = row * numCols + col;
   int pos_1D_t = r_t * blockDim.x + c_t;

   extern __shared__ float sh_mem[];

   // Placeholder value; it only survives for threads that return just below.
   sh_mem[pos_1D_t] = -999999.0f;

   // NOTE(review): threads that return here never reach the __syncthreads()
   // calls below, while the remaining threads of the same block still do.
   // NOTE(review): the comparison is '>' so pos_1D == numCols * numRows passes
   // and the load below reads index numCols * numRows.
   if(pos_1D > numCols * numRows)
      return;

   sh_mem[pos_1D_t] = d_logLuminance[pos_1D];
   __syncthreads();

   // Tree reduction: each step folds the upper half of the live range onto the
   // lower half. The halving visits every slot only when
   // blockDim.x * blockDim.y is a power of two.
   for(int s = (blockDim.x * blockDim.y) / 2; s > 0; s >>= 1){
      if(pos_1D_t < s)
         sh_mem[pos_1D_t] = fmax(sh_mem[pos_1D_t], sh_mem[pos_1D_t + s]);
      __syncthreads();
   }

   // One result per block, stored at the block's flattened grid index.
   if(r_t == 0 && c_t == 0)
      d_out[blockIdx.y * gridDim.x + blockIdx.x] = sh_mem[0];
}

// Host driver: launches get_max twice, copies the first float of d_out into
// max_logLum and prints it.
//
// d_logLuminance is passed straight to the kernel, so it must be a device
// pointer. None of the CUDA return codes are checked here.
void max(const float *d_logLuminance, int numRows, int numCols, float &max_logLum){

   // 32 x 32 threads per block; the grid is the ceiling of each matrix
   // dimension divided by 32.
   int THREADS_PER_BLOCK = 32;
   dim3 blockSize(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
   dim3 gridSize((THREADS_PER_BLOCK + numCols - 1) / THREADS_PER_BLOCK, 
                  (THREADS_PER_BLOCK + numRows - 1) / THREADS_PER_BLOCK);

   // Both scratch buffers are sized for the full matrix.
   float *d_out, *d_int;
   cudaMalloc(&d_out, sizeof(float) * numRows * numCols);
   cudaMalloc(&d_int, sizeof(float) * numRows * numCols);


   // Pass 1: one partial maximum per block is written to d_int.
   get_max<<<gridSize, blockSize, THREADS_PER_BLOCK * THREADS_PER_BLOCK * sizeof(float)>>>(d_logLuminance, d_int, numRows, numCols);
   // Pass 2: a single block reads d_int.
   // NOTE(review): numRows/numCols of the original matrix are passed again,
   // although pass 1 wrote only gridSize.x * gridSize.y values into d_int.
   get_max<<<1, blockSize, THREADS_PER_BLOCK * THREADS_PER_BLOCK * sizeof(float)>>>(d_int, d_out, numRows, numCols);

   cudaDeviceSynchronize();

   cudaMemcpy(&max_logLum, d_out, sizeof(float), cudaMemcpyDeviceToHost);

   printf("max : %f\n", max_logLum);

   cudaFree(d_out);
   cudaFree(d_int);

}

Run Code Online (Sandbox Code Playgroud)

串行算法计算的预期结果是 2.18911 ，而并行 reduce 函数输出 1.319142 。

Answer 1

Rob*_*lla 5

您显示的代码都没有针对性能进行良好优化。在 GPU 上编写快速并行归约通常具有许多显着特征：

它会使用 grid-stride 循环（并配合经过优化选择的线程数）遍历整个数据集，让每个线程累积自己的部分结果。
它不会使用共享内存逐级（sweep 式）归约，而是使用两阶段的 warp shuffle 方法，把每个线程块内各线程的结果合并为该线程块的部分结果。
它会省去通常用两次内核启动来合并各线程块结果的做法。取而代之的是，要么由每个线程块在最后执行一次原子操作来更新全局变量（前提是存在与所需归约操作对应的合适原子操作），要么使用线程块排空（block-draining）方法（即“threadfence 归约”），让本次内核启动中“最后一个”完成的线程块把各部分结果最终归约为单个值。

因此，要解决内核中的技术问题，让我们记住这是一个学习练习。我并不是建议您的方法（任一方法）是最好的方法。

在 2D 内核方法中，您需要考虑非法行为和设计问题：

这个构造：
```
if(pos_1D > numCols * numRows)
  return;                        // this return statement creates a hazard

sh_mem[pos_1D_t] = d_logLuminance[pos_1D];
__syncthreads();                 // ... at this call
```
Run Code Online (Sandbox Code Playgroud)
可能导致未定义行为。CUDA 要求线程块中的所有线程都必须执行到 __syncthreads()。但是，该 return 语句可能让某些线程块中的部分线程提前退出，这意味着它们不会参与对 __syncthreads() 的调用。这是一种非法的设计模式。
您的 2D 设计把可处理的数据集大小限制为最多 32x32 个线程块，即 1024x1024 的数据集。原因在于：第二次内核启动最多只能有 1024 个线程，而其中每个线程对应前一次启动中的一个线程块，因此前一次启动最多只能有 32x32 = 1024 个线程块。可以重写代码来消除这一限制，但我在这里想说的是：如果您想编写快速、健壮的并行归约，设计的几乎每个方面都需要重写，因此我建议从一开始就按照我上面概述的特征来设计。
您的第二个内核启动构建不正确：
```
get_max<<<1, blockSize, THREADS_PER_BLOCK * THREADS_PER_BLOCK * sizeof(float)>>>(d_int, d_out, numRows, numCols);
```
Run Code Online (Sandbox Code Playgroud)
此时数据集的大小不再是numRows* numCols。它已减小到 32x32 或更小的某个值。您的代码中还有其他几个类似的大小调整问题。

这是您的代码的修改版本，其中解决了上述问题。我没有详细说明我对您的代码所做的每一个更改，因此除了上面列出的 3 项之外，请研究差异：

$ cat t1490.cu
#include <math.h>
#include <stdio.h>

// Per-block maximum of a row-major numRows x numCols float matrix.
//
// Every thread walks the matrix with a 2D grid-stride loop and keeps a private
// running maximum; the block then reduces those values in dynamic shared
// memory and thread (0,0) writes the block result to
// d_out[blockIdx.y * gridDim.x + blockIdx.x].
//
// Launch requirements:
//   - dynamic shared memory: blockDim.x * blockDim.y * sizeof(float) bytes
//   - d_out must hold gridDim.x * gridDim.y floats
// Any grid/block shape is valid: a grid smaller than the matrix (even a single
// block) still visits every element, and the block size need not be a power
// of two. A thread that owns no element contributes -INFINITY, the identity
// of max, so inputs of any magnitude reduce correctly.
__global__
void get_max(const float* d_logLuminance, float *d_out, int numRows, int numCols){
   const int tid = threadIdx.y * blockDim.x + threadIdx.x;
   const int threadsInBlock = blockDim.x * blockDim.y;

   extern __shared__ float sh_mem[];

   // Row and column are bounds-checked separately so that a column index past
   // numCols never aliases an element of the following row.
   float best = -INFINITY;
   for(int row = blockIdx.y * blockDim.y + threadIdx.y; row < numRows; row += gridDim.y * blockDim.y){
      for(int col = blockIdx.x * blockDim.x + threadIdx.x; col < numCols; col += gridDim.x * blockDim.x){
         best = fmaxf(best, d_logLuminance[(size_t)row * numCols + col]);
      }
   }

   // No thread exits early, so every thread of the block reaches each barrier.
   sh_mem[tid] = best;
   __syncthreads();

   // Fold the upper part of the live range onto the lower part; rounding the
   // split point up keeps the middle slot alive for odd lengths.
   for(int active = threadsInBlock; active > 1; ){
      const int half = (active + 1) / 2;
      if(tid + half < active)
         sh_mem[tid] = fmaxf(sh_mem[tid], sh_mem[tid + half]);
      __syncthreads();
      active = half;
   }

   if(tid == 0)
      d_out[blockIdx.y * gridDim.x + blockIdx.x] = sh_mem[0];
}

// Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess.
static bool maxCudaOk(cudaError_t err, const char *what){
   if(err == cudaSuccess)
      return true;
   fprintf(stderr, "max: %s failed: %s\n", what, cudaGetErrorString(err));
   return false;
}

// Number of blocks of perBlock items needed to cover n items (n >= 1).
static int maxBlocksFor(int n, int perBlock){
   return (n - 1) / perBlock + 1;
}

// Finds the maximum of a row-major numRows x numCols float matrix that lives
// in device memory, stores it in max_logLum and prints it as "max : <value>".
//
// The matrix is reduced by repeated get_max launches with 32 x 32 thread
// blocks: each pass turns a rows x cols matrix into one partial maximum per
// block, and that grid of partials is the input matrix of the next pass. The
// loop stops when a pass produces a single value, so the matrix size is not
// limited to what one block can consume in a second pass.
//
// On invalid input (NULL pointer, non-positive dimension) or on any CUDA
// error a message is written to stderr and max_logLum is left untouched.
void max(const float *d_logLuminance, int numRows, int numCols, float &max_logLum){

   if(d_logLuminance == NULL || numRows <= 0 || numCols <= 0){
      fprintf(stderr, "max: invalid input (%d x %d matrix)\n", numRows, numCols);
      return;
   }

   const int THREADS_PER_BLOCK = 32;
   const dim3 blockSize(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
   const size_t shmemBytes = THREADS_PER_BLOCK * THREADS_PER_BLOCK * sizeof(float);

   // Two ping-pong buffers. The first pass writes the most partials and the
   // second pass the second most; every later pass writes fewer, so sizing the
   // buffers for passes one and two is enough.
   const int gridX1 = maxBlocksFor(numCols, THREADS_PER_BLOCK);
   const int gridY1 = maxBlocksFor(numRows, THREADS_PER_BLOCK);
   const int gridX2 = maxBlocksFor(gridX1, THREADS_PER_BLOCK);
   const int gridY2 = maxBlocksFor(gridY1, THREADS_PER_BLOCK);

   float *d_buf[2] = {NULL, NULL};
   bool ok = maxCudaOk(cudaMalloc(&d_buf[0], sizeof(float) * gridX1 * gridY1), "cudaMalloc")
          && maxCudaOk(cudaMalloc(&d_buf[1], sizeof(float) * gridX2 * gridY2), "cudaMalloc");

   const float *d_src = d_logLuminance;
   int rows = numRows;
   int cols = numCols;
   int out = 0;
   while(ok){
      const dim3 gridSize(maxBlocksFor(cols, THREADS_PER_BLOCK),
                          maxBlocksFor(rows, THREADS_PER_BLOCK));

      get_max<<<gridSize, blockSize, shmemBytes>>>(d_src, d_buf[out], rows, cols);
      // Launch-configuration errors are only visible through cudaGetLastError().
      ok = maxCudaOk(cudaGetLastError(), "get_max launch");

      // The partials just written form the gridSize.y x gridSize.x input of the next pass.
      d_src = d_buf[out];
      rows = gridSize.y;
      cols = gridSize.x;
      out ^= 1;

      if(rows == 1 && cols == 1)
         break;
   }

   // The blocking copy also waits for the kernels and surfaces any execution error.
   float result = 0.0f;
   ok = ok && maxCudaOk(cudaMemcpy(&result, d_src, sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy");

   if(ok){
      max_logLum = result;
      printf("max : %f\n", max_logLum);
   }

   // cudaFree accepts NULL, so this is safe after a failed allocation.
   cudaFree(d_buf[0]);
   cudaFree(d_buf[1]);

}

// Self-test driver: builds a 1024 x 1024 device matrix of zeros whose first
// element is 2.18911, then calls max() on it, which prints the maximum found.
// Returns 0 on success and 1 when setting up the device data fails.
int main(){

   const int sx = 1024;
   const int sy = 1024;
   const size_t bytes = (size_t)sx * sy * sizeof(float);
   const float expected = 2.18911f;

   float *d_data = NULL;
   cudaError_t err = cudaMalloc(&d_data, bytes);
   // All-zero bytes are 0.0f, so a byte-wise memset is a valid float fill here.
   if(err == cudaSuccess)
      err = cudaMemset(d_data, 0, bytes);
   if(err == cudaSuccess)
      err = cudaMemcpy(d_data, &expected, sizeof(float), cudaMemcpyHostToDevice);

   if(err == cudaSuccess){
      float result = 0;
      max(d_data, sy, sx, result);
   }else{
      fprintf(stderr, "setup failed: %s\n", cudaGetErrorString(err));
   }

   // Release the matrix on every path; cudaFree accepts NULL.
   cudaFree(d_data);
   return err == cudaSuccess ? 0 : 1;
}
$ nvcc -o t1490 t1490.cu
$ cuda-memcheck ./t1490
========= CUDA-MEMCHECK
max : 2.189110
========= ERROR SUMMARY: 0 errors
$

Run Code Online (Sandbox Code Playgroud)

请注意，您尚未提供完整的代码。在未显示的代码中也有可能存在错误。在我的回答中，我提供了一个完整代码的示例。

归档时间：	6 年，6 月前
查看次数：	252 次
最近记录：	6 年，6 月前