CUDA输出始终为0

Question

CUDA输出始终为0

执行内核函数后，打印输出始终为0。经过一番测试，cudaMemcpy 仍然正确。但内核似乎无法正常工作，无法从 d_inputs 获取正确的数据。有人可以帮忙解释一下吗？谢谢！

#include <cuda_runtime.h>
#include <cuda.h>
#include <stdio.h>
#include <sys/time.h>
#include <math.h>

#define N 32

// Kernel: each thread computes its global index tid and, when tid < N, reads
// d_inputs[tid], converts it to double, adds 1.0 and stores the result in
// d_outputs[tid].
// niters is referenced only by the commented-out loop below, so it currently
// has no effect on the result.
__global__ void Kernel_double(int niters, int* d_inputs,double* d_outputs)
{
    // Flat 1D global thread index.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds guard: threads with tid >= N do nothing.
    if (tid<N) {
    double val =(double) d_inputs[tid];
    /*for (int iter=0; iter < niters; iter++){
    val = (sqrt(pow(val,2.0)) + 5.0) - 101.0;
    val = (val / 3.0) + 102.0;
    val = (val + 1.07) - 103.0;
    val = (val / 1.037) + 104.0;
    val = (val + 3.00) - 105.0;
    val = (val / 0.22) + 106.0;
    }*/
    val = val + 1.0;
    //printf("This is %f\n",val);
    d_outputs[tid] = val;
}
}

// Host driver from the question: fills inputs with 1..N, attempts to copy them
// to the device, launches Kernel_double and prints the first 10 results.
// The code is kept exactly as posted; the NOTE(review) comments mark the
// problems that the accompanying answer discusses.
int main(int argc, char **argv)
{

    int niters = 10;
    // NOTE(review): 'cpu' is not declared anywhere in this listing, so this
    // line does not compile as posted.
    printf("Iterate %d times with GPU 0 or CPU 1: %d\n", niters, cpu);

    // Host input data: inputs[i] = i + 1.
    int inputs[N];
    for (int i = 0; i<N; i++){
    inputs[i] = i+1;
    }

    // NOTE(review): d_inputs and d_outputs are declared as host arrays, not as
    // pointers. The cudaMalloc calls below are handed the addresses of these
    // arrays; the answer reports that the later cudaMemcpy calls then fail
    // with "invalid argument". They should be 'int *' / 'double *'.
    int d_inputs[N];
    double d_outputs[N];
    // Host buffer that receives the kernel results.
    double outputs[N];

    // NOTE(review): none of the CUDA runtime return codes below are checked.
    cudaMalloc( (void**)&d_inputs, N*sizeof(int));
    cudaMalloc( (void**)&d_outputs, N*sizeof(double));
    printf("test %d \n", inputs[3]);
    cudaMemcpy(d_inputs, inputs, N*sizeof(int), cudaMemcpyHostToDevice);
    // NOTE(review): reads d_inputs from host code; once d_inputs is a real
    // device pointer this dereference is no longer legal.
    printf("test %d \n", d_inputs[1]);
    // 16 blocks of 2 threads each = 32 threads, which equals N.
    Kernel_double<<<16,2>>>(niters, d_inputs,d_outputs);
    //cudaDeviceSynchronize();
    cudaMemcpy(outputs, d_outputs, N*sizeof(double), cudaMemcpyDeviceToHost);
    // Print the first 10 results.
    // NOTE(review): d_outputs[j] is read on the host here as well, which has
    // the same problem as the d_inputs[1] read above.
    for(int j =0;j<10; j++){
        printf("Outputs[%d] is:  %f and %f\n",j, d_outputs[j], outputs[j]);
        }
    cudaFree(d_inputs);
    cudaFree(d_outputs);

    return EXIT_SUCCESS;
}

Run Code Online (Sandbox Code Playgroud)

Answer 1

Rob*_*lla 5

1. 每当您的 CUDA 代码出现问题时，您都应该先使用适当的 CUDA 错误检查，并使用 cuda-memcheck 运行您的代码，然后再向其他人寻求帮助。即使您看不懂错误输出，它对于试图帮助您的人来说也是有用的。如果您在此处使用了适当的 CUDA 错误检查，就会得知您的 cudaMemcpy 操作报告了无效参数（invalid argument），其原因见下面的第 3 项。
2. 您的代码无法编译：cpu 没有在任何地方定义。
3. 我们不会像这样分配或创建设备指针：
```
int d_inputs[N];
double d_outputs[N];
```
Run Code Online (Sandbox Code Playgroud)
这些语句创建的是栈变量（数组），编译器可以将它们当作常量指针来处理。相反，你应该这样做：
```
int *d_inputs;
double *d_outputs;
```
Run Code Online (Sandbox Code Playgroud)
这样编译器就知道它们是可修改的指针（稍后您将使用 cudaMalloc 对其进行修改）。
4. 一旦解决了第 3 项中的问题，下面这行代码将不再合法：
```
printf("test %d \n", d_inputs[1]);
```
Run Code Online (Sandbox Code Playgroud)
因为这需要在主机代码中解引用设备指针（d_inputs），这在 CUDA 中是非法的，至少按您这里的写法是如此。您的代码后面的 printf 语句中也有类似的问题（使用了 d_outputs）。

以下代码在某种程度上解决了上述问题，并且似乎对我来说可以正确运行：

$ cat t44.cu
#include <cuda_runtime.h>
#include <cuda.h>
#include <stdio.h>
#include <sys/time.h>
#include <math.h>

#define N 32

// Adds 1.0 to each of the first N input integers and stores the sum as a double.
//
// Each thread handles the single element at its global index
// (blockIdx.x * blockDim.x + threadIdx.x); threads whose index is >= N return
// without touching memory.
//
//   niters    - not used at present; only the disabled loop kept below refers to it
//   d_inputs  - input array, read at indices [0, N)
//   d_outputs - output array, written at indices [0, N)
__global__ void Kernel_double(int niters, int* d_inputs,double* d_outputs)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard clause: the launch may provide more threads than elements.
    if (idx >= N) {
        return;
    }

    double value = static_cast<double>(d_inputs[idx]);

    /* Disabled iterative workload, kept for reference:
    for (int iter=0; iter < niters; iter++){
        value = (sqrt(pow(value,2.0)) + 5.0) - 101.0;
        value = (value / 3.0) + 102.0;
        value = (value + 1.07) - 103.0;
        value = (value / 1.037) + 104.0;
        value = (value + 3.00) - 105.0;
        value = (value / 0.22) + 106.0;
    }*/

    value += 1.0;
    // Debug aid (disabled): printf("This is %f\n",value);
    d_outputs[idx] = value;
}

// Evaluates a CUDA runtime call; on failure prints the error string together
// with the file and line to stderr and makes the enclosing function return
// EXIT_FAILURE. Only usable inside functions that return int (such as main).
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(err_));                   \
            return EXIT_FAILURE;                                           \
        }                                                                  \
    } while (0)

// Host driver: fills inputs with 1..N, copies them to the device, runs
// Kernel_double over N elements, copies the results back and prints the
// first 10 of them.
//
// Returns EXIT_SUCCESS when every CUDA call succeeds, EXIT_FAILURE otherwise.
// On the failure path device buffers are not freed explicitly; the process
// exits immediately afterwards.
int main(int argc, char **argv)
{
    (void)argc;  // command-line arguments are not used
    (void)argv;

    int niters = 10;
    int cpu = 0;
    printf("Iterate %d times with GPU 0 or CPU 1: %d\n", niters, cpu);

    // Host input data: inputs[i] = i + 1.
    int inputs[N];
    for (int i = 0; i < N; i++) {
        inputs[i] = i + 1;
    }

    // Device buffers must be pointers (not host arrays) so cudaMalloc can
    // store the device addresses in them.
    int *d_inputs = NULL;
    double *d_outputs = NULL;
    double outputs[N];

    CUDA_CHECK(cudaMalloc((void**)&d_inputs, N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_outputs, N * sizeof(double)));
    printf("test %d \n", inputs[3]);
    CUDA_CHECK(cudaMemcpy(d_inputs, inputs, N * sizeof(int),
                          cudaMemcpyHostToDevice));
    // d_inputs is a device pointer and must not be dereferenced on the host:
    //    printf("test %d \n", d_inputs[1]);

    // Same configuration as before (16 blocks of 2 threads for N = 32), but
    // the block count is derived with a ceiling division so all N elements
    // stay covered if N changes.
    const int threadsPerBlock = 2;
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    Kernel_double<<<blocks, threadsPerBlock>>>(niters, d_inputs, d_outputs);
    // A kernel launch returns no status; launch errors are picked up here.
    CUDA_CHECK(cudaGetLastError());

    // No explicit cudaDeviceSynchronize() is needed: this blocking cudaMemcpy
    // waits for the kernel to finish and reports any execution error.
    CUDA_CHECK(cudaMemcpy(outputs, d_outputs, N * sizeof(double),
                          cudaMemcpyDeviceToHost));

    for (int j = 0; j < 10; j++) {
        printf("Outputs[%d] is: %f\n",j, outputs[j]);
    }

    CUDA_CHECK(cudaFree(d_inputs));
    CUDA_CHECK(cudaFree(d_outputs));

    return EXIT_SUCCESS;
}
$ nvcc -lineinfo -arch=sm_61 -o t44 t44.cu
$ cuda-memcheck ./t44
========= CUDA-MEMCHECK
Iterate 10 times with GPU 0 or CPU 1: 0
test 4
Outputs[0] is: 2.000000
Outputs[1] is: 3.000000
Outputs[2] is: 4.000000
Outputs[3] is: 5.000000
Outputs[4] is: 6.000000
Outputs[5] is: 7.000000
Outputs[6] is: 8.000000
Outputs[7] is: 9.000000
Outputs[8] is: 10.000000
Outputs[9] is: 11.000000
========= ERROR SUMMARY: 0 errors
$

Run Code Online (Sandbox Code Playgroud)

归档时间：	9 年，1 月前
查看次数：	1620 次
最近记录：	9 年，1 月前