gpugems3 中的前缀扫描 CUDA 示例代码是否正确？

Question

gpugems3 中的前缀扫描 CUDA 示例代码是否正确？

我写了一段代码，用来调用《GPU Gems 3》第 39 章 "Parallel Prefix Sum (Scan) with CUDA" 中给出的内核。

然而，我得到的结果是一堆负数而不是前缀扫描。

我的内核调用是错误的还是 GPU Gems 3 书中的代码有问题？

这是我的代码：

#include <stdio.h>
#include <sys/time.h>
#include <cuda.h>

// Scan kernel: each thread copies two elements of g_idata into shared memory,
// runs an up-sweep and then a down-sweep over temp[0..n-1], and finally
// writes two elements to g_odata.
//   g_odata - output array; elements 2*thid and 2*thid+1 are written
//   g_idata - input array; elements 2*thid and 2*thid+1 are read
//   n       - number of elements covered by the two sweep loops
//   dim     - not referenced anywhere in the body
// Indexing uses threadIdx.x only (no blockIdx), and temp is sized by the
// dynamic shared-memory argument of the launch.
__global__ void kernel(int *g_odata, int  *g_idata, int n, int dim)
{
    extern __shared__ int temp[];// allocated on invocation
    int thid = threadIdx.x;
    int offset = 1;

    temp[2*thid] = g_idata[2*thid]; // load input into shared memory
    temp[2*thid+1] = g_idata[2*thid+1];
    for (int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree
    {
    // Barrier is outside the `thid < d` branch, so every thread reaches it.
    __syncthreads();
    if (thid < d)
    {
    int ai = offset*(2*thid+1)-1;
    int bi = offset*(2*thid+2)-1;
    // NOTE(review): the addend is read from g_idata (the global input), whereas
    // the down-sweep below reads temp[ai] - confirm this is intended.
    temp[bi] += g_idata[ai];
    }
    offset *= 2;
    }
    if (thid == 0) { temp[n - 1] = 0; } // clear the last element
    for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
    {
    offset >>= 1;
    __syncthreads();
    if (thid < d)
    {
    int ai = offset*(2*thid+1)-1;
    int bi = offset*(2*thid+2)-1;
    // Swap-and-add: ai receives the old bi value, bi receives old bi + old ai.
    int t = temp[ai];
    temp[ai] = temp[bi];
    temp[bi] += t;
    }
    }
    // Wait for the last down-sweep level before copying shared memory out.
    __syncthreads();
    g_odata[2*thid] = temp[2*thid]; // write results to device memory
    g_odata[2*thid+1] = temp[2*thid+1];
}

// Fills h_in[0 .. num_items-1] with the values 0 .. num_items-1, then prints
// the " input: " label followed by two newlines. The element values
// themselves are not printed.
void Initialize(int  *h_in,int num_items)
{
    for (int idx = 0; idx < num_items; ++idx) {
        h_in[idx] = idx;
    }

    // Both printf calls are outside the loop, so each runs exactly once.
    printf(" input: ");
    printf("\n\n");
}

// Host driver: builds a 512-element input on the host, copies it to the
// device, launches `kernel` with one block of 256 threads, copies the result
// back and prints the first num_items output values.
int main(int argc, char** argv)
{
    int num_items = 512;
    const size_t in_bytes  = sizeof(int) * num_items;
    const size_t out_bytes = sizeof(int) * (num_items + 1);

    // Host input, filled by Initialize().
    int* h_in = new int[num_items];
    Initialize(h_in, num_items);

    // Device input buffer and host-to-device upload.
    int* d_in = NULL;
    cudaMalloc((void**)&d_in, in_bytes);
    if (cudaMemcpy(d_in, h_in, in_bytes, cudaMemcpyHostToDevice) != cudaSuccess) {
        fprintf(stderr,"could not copy to gpu");
    }

    // Device output buffer (one element larger than the input).
    int* d_out = NULL;
    cudaMalloc((void**)&d_out, out_bytes);

    // One block, 256 threads, in_bytes of dynamic shared memory.
    kernel<<<1, 256, in_bytes>>>(d_out, d_in, num_items, 2);

    // Blocking copy of the device result back to the host.
    int* h_out = new int[num_items + 1];
    if (cudaMemcpy(h_out, d_out, out_bytes, cudaMemcpyDeviceToHost) != cudaSuccess) {
        fprintf(stderr,"could not copy back");
    }

    printf(" \n");
    for (int k = 0; k < num_items; ++k) {
        printf(" ,%d ", h_out[k]);
    }

    // Cleanup (delete[] on a null pointer is a no-op, so no guard is needed).
    delete[] h_in;
    delete[] h_out;
    if (d_in) { cudaFree(d_in); }
    if (d_out) { cudaFree(d_out); }

    printf("\n\n");

    return 0;
}

Run Code Online (Sandbox Code Playgroud)

Answer 1

Rob*_*lla 6

在将 GPU Gems 3章节中的代码转录到内核中时，您似乎至少犯了 1 个错误。这一行是不正确的：

temp[bi] += g_idata[ai];

Run Code Online (Sandbox Code Playgroud)

它应该是：

temp[bi] += temp[ai];

Run Code Online (Sandbox Code Playgroud)

当我对您现在发布的代码进行这一更改时，它似乎为我打印出正确的（独占扫描）前缀和。还有一些其他的事情我想提一下：

即使没有这种变化，我也会得到一些接近正确的结果。因此，如果您得到截然不同的东西（例如负数），则您的机器设置或 CUDA 安装可能有问题。我建议使用比现在更严格的cuda 错误检查（尽管您的检查中应该已经指出了机器设置问题。）
按目前的写法，这个例程有一些限制。它只能在单个线程块中使用，访问共享内存时会发生存储体冲突（bank conflict），并且数据集大小受限于单个线程块所能处理的规模（此例程每个线程产生两个输出元素，因此数据集大小应等于线程数的两倍）。如前所述，动态共享内存的分配大小必须与数据集一样大（即元素个数为线程数的两倍）。
这段代码可能对学习有帮助，但如果您想要一个健壮、快速的前缀扫描，建议您使用 Thrust 或 CUB 库提供的例程，而不是自己编写的代码，即使它源自这篇（较旧的）文章。

以下代码与您的代码类似，但它修复了上述问题，并且我已将内核模板化以用于各种数据类型：

#include <stdio.h>
#include <stdlib.h>
// Number of elements to scan; main() sizes its buffers from it, launches
// DSIZE/2 threads and passes DSIZE to the kernel as n.
#define DSIZE 512
// Reads (and clears) the CUDA runtime's last-error state via
// cudaGetLastError(). If an error is pending, prints `msg`, the runtime's
// error string and the source file/line to stderr, then terminates the
// process with exit status 1. Expands to a single statement (do/while(0)),
// so it is safe inside an unbraced if/else.
// The local is named without a double underscore because such identifiers
// are reserved for the implementation in C++.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cuda_check_err_ = cudaGetLastError(); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(cuda_check_err_), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


// Element type of the host/device buffers in main(), and therefore the type
// the prescan template is instantiated with.
typedef int mytype;

// Exclusive prefix sum of n elements computed by one thread block in shared
// memory: an up-sweep builds partial sums in place, the last element is
// cleared, and a down-sweep turns the tree into the scan.
//
// Launch requirements:
//   - a single block; thread t handles elements 2*t and 2*t+1, so
//     blockDim.x must be at least n/2
//   - n must be a power of two (the sweep loops halve/double d exactly)
//   - dynamic shared memory of at least n * sizeof(T) bytes
//   - T must not require more than 16-byte alignment
//
// Parameters:
//   g_odata - device output, n elements; g_odata[0] = 0 and
//             g_odata[i] = g_idata[0] + ... + g_idata[i-1]
//   g_idata - device input, n elements
//   n       - element count; n <= 0 makes the kernel a no-op
template <typename T>
__global__ void prescan(T *g_odata, T *g_idata, int n)
{
  // A typed `extern __shared__ T temp[]` would be re-declared with a different
  // element type by every instantiation of this template, which the compiler
  // rejects as soon as two types are used. Declare the dynamic shared memory
  // once as raw bytes and view it as T instead.
  extern __shared__ __align__(16) unsigned char prescan_smem[];
  T *temp = reinterpret_cast<T *>(prescan_smem);

  // Uniform across the block, so returning here cannot strand a barrier.
  if (n <= 0) { return; }

  int thid = threadIdx.x;
  int offset = 1;
  const int lo = 2*thid;
  const int hi = 2*thid+1;

  // Load input into shared memory; the guards keep surplus threads
  // (blockDim.x > n/2) from touching memory past the end of the arrays.
  if (lo < n) { temp[lo] = g_idata[lo]; }
  if (hi < n) { temp[hi] = g_idata[hi]; }

  for (int d = n>>1; d > 0; d >>= 1)                    // build sum in place up the tree
  {
    // Barrier sits outside the `thid < d` branch so every thread reaches it.
    __syncthreads();
    if (thid < d)
    {
      int ai = offset*(2*thid+1)-1;
      int bi = offset*(2*thid+2)-1;
      temp[bi] += temp[ai];
    }
    offset *= 2;
  }

  // Thread 0 wrote temp[n-1] in the final up-sweep step, so it can clear it
  // without an extra barrier.
  if (thid == 0) { temp[n - 1] = 0; } // clear the last element

  for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
  {
    offset >>= 1;
    __syncthreads();
    if (thid < d)
    {
      int ai = offset*(2*thid+1)-1;
      int bi = offset*(2*thid+2)-1;
      // Swap-and-add: left child takes the parent's value, right child takes
      // parent + old left child.
      T t = temp[ai];
      temp[ai] = temp[bi];
      temp[bi] += t;
    }
  }

  // Wait for the last down-sweep level before copying shared memory out.
  __syncthreads();
  if (lo < n) { g_odata[lo] = temp[lo]; } // write results to device memory
  if (hi < n) { g_odata[hi] = temp[hi]; }
}

// Host driver: scans the DSIZE values 0 .. DSIZE-1 on the GPU with prescan
// and checks every output element against a running sum computed on the CPU.
// Returns 0 when all elements match, 1 on host allocation failure or on the
// first mismatch. CUDA API failures terminate the process via cudaCheckErrors.
int main(){

  mytype *h_i, *d_i, *h_o, *d_o;
  int dszp = (DSIZE)*sizeof(mytype);

  h_i = (mytype *)malloc(dszp);
  h_o = (mytype *)malloc(dszp);
  if ((h_i == NULL) || (h_o == NULL)) {
    printf("malloc fail\n");
    // free(NULL) is a no-op, so whichever allocation succeeded is released.
    free(h_i);
    free(h_o);
    return 1;
  }
  cudaMalloc(&d_i, dszp);
  cudaMalloc(&d_o, dszp);
  cudaCheckErrors("cudaMalloc fail");
  for (int i = 0 ; i < DSIZE; i++){
    h_i[i] = i;
    h_o[i] = 0;}
  cudaMemset(d_o, 0, dszp);
  cudaCheckErrors("cudaMemset fail");
  cudaMemcpy(d_i, h_i, dszp, cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy 1 fail");
  // One block of DSIZE/2 threads (two elements per thread) with DSIZE
  // elements of dynamic shared memory.
  prescan<<<1,DSIZE/2, dszp>>>(d_o, d_i, DSIZE);
  // Synchronize so that faults raised while the kernel runs are visible to
  // the check that follows.
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");
  cudaMemcpy(h_o, d_o, dszp, cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy 2 fail");

  // Exclusive scan: h_o[i] must equal the sum of h_i[0 .. i-1], so h_o[0]
  // must be 0. Compare before accumulating so that index 0 is verified too.
  int status = 0;
  mytype psum = 0;
  for (int i = 0; i < DSIZE; i++){
    if (psum != h_o[i]) {
      printf("mismatch at %d, was: %d, should be: %d\n", i, h_o[i], psum);
      status = 1;
      break;
    }
    psum += h_i[i];
  }

  // Release device and host buffers on both the success and mismatch paths.
  cudaFree(d_i);
  cudaFree(d_o);
  free(h_i);
  free(h_o);
  return status;
}

Run Code Online (Sandbox Code Playgroud)

归档时间：	10 年，8 月前
查看次数：	1753 次
最近记录：	6 年，3 月前