I am trying to use the GPU to speed up my program, which computes the L2 distance between two float arrays. To check the computation accuracy, I wrote both a CUDA program and a CPU program. However, I found that the total error is more than 200, which I don't understand. I use the float type in both cases, so I believe I should get the same result. My code is below.
#include <cuda_runtime.h>
#include <stdio.h>
#include <sys/time.h>
#include <math.h>
// #include <helper_functions.h>
#define VECTORDIM 3

double cpuSecond()
{
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return ((double) tp.tv_sec + (double)tp.tv_usec*1e-6);
}

void DistanceCPU(float* array1, float* array2, int narray1, int narray2, float* output)
{
    float temp;
    for (int i = 0; i < narray1; i++)
    {
        for (int j = 0; j < narray2; j++)
        {
            temp = 0;
            for (int l = 0; l < VECTORDIM; l++)
            {
                temp += powf(array1[i + l * narray1] - array2[j + l * narray2], 2);
            }
            output[i * narray2 + j] = temp;
        }
    }
}

__global__ void DistGPU(float* array1, float* array2, int narray1, int narray2, float* output)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    float temp;

    if (i < narray1)
    {
        for (int j = 0; j < narray2; j++)
        {
            temp = 0;
            temp += powf(array1[i] - array2[j], 2);
            temp += powf(array1[i + narray1] - array2[j + narray2], 2);
            temp += powf(array1[i + 2 * narray1] - array2[j + 2 * narray2], 2);
            output[i * narray2 + j] = temp;
        }
    }
}

int main()
{
    int narray1 = 7000;
    int narray2 = 60000;

    float* array1 = new float[narray1 * VECTORDIM];
    float* array2 = new float[narray2 * VECTORDIM];
    float* outputGPU = new float[narray1 * narray2];
    float* outputCPU = new float[narray1 * narray2];
    float* outputCPUTest = new float[narray1 * narray2];

    float* d_array1;
    float* d_array2;
    float* d_output;

    for (int i = 0; i < narray1 * VECTORDIM; i++)
    {
        array1[i] = static_cast<float> (rand() / (static_cast<float> (RAND_MAX / 10)));
        // std::cout << "Element " << i << " " << array1[i] << std::endl;
    }

    for (int i = 0; i < narray2 * VECTORDIM; i++)
    {
        array2[i] = static_cast<float> (rand() / (static_cast<float> (RAND_MAX / 10)));
    }

    cudaError_t err;

    err = cudaMalloc((void**)&d_array1, narray1 * VECTORDIM * sizeof(float));
    err = cudaMalloc((void**)&d_array2, narray2 * VECTORDIM * sizeof(float));
    err = cudaMalloc((void**)&d_output, narray1 * narray2 * sizeof(float));

    err = cudaMemcpy(d_array1, array1, narray1 * VECTORDIM * sizeof(float), cudaMemcpyHostToDevice);
    err = cudaMemcpy(d_array2, array2, narray2 * VECTORDIM * sizeof(float), cudaMemcpyHostToDevice);

    int threadsPerBlock = 512;
    int blocksPerGrid = (narray1 + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    double iStart = cpuSecond();
    DistGPU<<<blocksPerGrid, threadsPerBlock>>>(d_array1, d_array2, narray1, narray2, d_output);
    double iElaps = cpuSecond() - iStart;

    err = cudaMemcpy(outputGPU, d_output, narray1 * narray2 * sizeof(float), cudaMemcpyDeviceToHost);

    printf("Total computation time is %lf \n" , iElaps);

    DistanceCPU(array1, array2, narray1, narray2, outputCPU);

    float error = 0;
    for (long i = 0; i < narray1 * narray2; i++)
    {
        error += abs(outputCPU[i] - outputGPU[i]);
    }
    error /= (narray2 * narray1);

    for (int i = 0; i < 20; i++)
    {
        printf("CPU result %f \n", outputCPU[i]);
        printf("GPU result %f \n", outputGPU[i]);
    }

    printf("Error is %f \n", error);

    delete [] array1;
    delete [] array2;
    delete [] outputCPU;
    delete [] outputGPU;

    return 0;
}
I tried printing some of the computed results from the CPU and the GPU, and I get the following output.
CPU result 84.315201
GPU result 84.315193
CPU result 48.804039
GPU result 48.804039
CPU result 26.388403
GPU result 26.388403
CPU result 150.009735
GPU result 150.009750
I believe float precision should be sufficient here, and I don't know what the real problem is.
I would say the main contributor here is the use of the powf function. A particular math function definition on the GPU is not guaranteed to have the same accuracy as the same math function in CPU code. I can't say whether that is a sufficient, or even an applicable, description here, because we would probably also have to discuss which CPU compiler you are using along with its compile switches/settings. The error possibilities for the GPU math functions are documented in the CUDA programming guide.

However, if performance is what you are interested in, there really isn't much point, in my opinion, in using pow or powf just to square things, and I assume that since you are asking about GPUs, performance is what you care about. As far as I can tell, if we replace the use of the powf function with an ordinary squaring operation, the GPU results become much closer to the CPU results.
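To make the change concrete before the full listing below, this is the essence of the substitution in each distance loop, sketched for the CPU inner loop (the diff temporary is only for readability here; the complete modified program further down simply repeats the subtraction expression and keeps the powf path behind a USE_POW macro):

// before: squaring via the math-library power function
//   temp += powf(array1[i + l * narray1] - array2[j + l * narray2], 2);
// after: compute the difference once and square it by multiplication
float diff = array1[i + l * narray1] - array2[j + l * narray2];
temp += diff * diff;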
Results of running your code as-is on CUDA 10.0, Tesla P100, CentOS 7, gcc 4.8.5:
$ ./t415
CUDA kernel launch with 14 blocks of 512 threads
Total computation time is 0.000038
CPU result 28.795628
GPU result 28.795628
CPU result 50.995567
GPU result 50.995567
CPU result 46.970348
GPU result 46.970345
CPU result 29.031254
GPU result 29.031254
CPU result 111.297745
GPU result 111.297745
CPU result 19.145151
GPU result 19.145151
CPU result 20.508183
GPU result 20.508183
CPU result 133.916077
GPU result 133.916077
CPU result 84.315201
GPU result 84.315193
CPU result 48.804039
GPU result 48.804039
CPU result 26.388403
GPU result 26.388403
CPU result 150.009735
GPU result 150.009750
CPU result 108.421936
GPU result 108.421936
CPU result 73.092339
GPU result 73.092339
CPU result 79.486023
GPU result 79.486023
CPU result 89.990150
GPU result 89.990150
CPU result 20.142567
GPU result 20.142567
CPU result 43.482445
GPU result 43.482445
CPU result 29.460800
GPU result 29.460800
CPU result 86.545860
GPU result 86.545860
Error is 0.000001
Modified code, with powf replaced by an ordinary squaring operation:
$ cat t415.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <sys/time.h>
#include <math.h>
// #include <helper_functions.h>
#define VECTORDIM 3
typedef float mt;

double cpuSecond()
{
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return ((double) tp.tv_sec + (double)tp.tv_usec*1e-6);
}

void DistanceCPU(mt* array1, mt* array2, int narray1, int narray2, mt* output)
{
    mt temp;
    for (int i = 0; i < narray1; i++)
    {
        for (int j = 0; j < narray2; j++)
        {
            temp = 0;
            for (int l = 0; l < VECTORDIM; l++)
            {
#ifndef USE_POW
                temp += (array1[i + l * narray1] - array2[j + l * narray2])*(array1[i + l * narray1] - array2[j + l * narray2]);
#else
                temp += powf(array1[i + l * narray1] - array2[j + l * narray2], 2);
#endif
            }
            output[i * narray2 + j] = temp;
        }
    }
}

__global__ void DistGPU(mt* array1, mt* array2, int narray1, int narray2, mt* output)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    mt temp;

    if (i < narray1)
    {
        for (int j = 0; j < narray2; j++)
        {
            temp = 0;
#ifndef USE_POW
            temp += (array1[i] - array2[j])*(array1[i] - array2[j]);
            temp += (array1[i + narray1] - array2[j + narray2])*(array1[i + narray1] - array2[j + narray2]);
            temp += (array1[i + 2 * narray1] - array2[j + 2 * narray2])*(array1[i + 2 * narray1] - array2[j + 2 * narray2]);
#else
            temp += powf(array1[i] - array2[j], 2);
            temp += powf(array1[i + narray1] - array2[j + narray2], 2);
            temp += powf(array1[i + 2 * narray1] - array2[j + 2 * narray2], 2);
#endif
            output[i * narray2 + j] = temp;
        }
    }
}

int main()
{
    int narray1 = 7000;
    int narray2 = 60000;

    mt* array1 = new mt[narray1 * VECTORDIM];
    mt* array2 = new mt[narray2 * VECTORDIM];
    mt* outputGPU = new mt[narray1 * narray2];
    mt* outputCPU = new mt[narray1 * narray2];
    mt* outputCPUTest = new mt[narray1 * narray2];

    mt* d_array1;
    mt* d_array2;
    mt* d_output;

    for (int i = 0; i < narray1 * VECTORDIM; i++)
    {
        array1[i] = static_cast<mt> (rand() / (static_cast<mt> (RAND_MAX / 10)));
        // std::cout << "Element " << i << " " << array1[i] << std::endl;
    }

    for (int i = 0; i < narray2 * VECTORDIM; i++)
    {
        array2[i] = static_cast<mt> (rand() / (static_cast<mt> (RAND_MAX / 10)));
    }

    cudaError_t err;

    err = cudaMalloc((void**)&d_array1, narray1 * VECTORDIM * sizeof(mt));
    err = cudaMalloc((void**)&d_array2, narray2 * VECTORDIM * sizeof(mt));
    err = cudaMalloc((void**)&d_output, narray1 * narray2 * sizeof(mt));

    err = cudaMemcpy(d_array1, array1, narray1 * VECTORDIM * sizeof(mt), cudaMemcpyHostToDevice);
    err = cudaMemcpy(d_array2, array2, narray2 * VECTORDIM * sizeof(mt), cudaMemcpyHostToDevice);

    int threadsPerBlock = 512;
    int blocksPerGrid = (narray1 + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    double iStart = cpuSecond();
    DistGPU<<<blocksPerGrid, threadsPerBlock>>>(d_array1, d_array2, narray1, narray2, d_output);
    double iElaps = cpuSecond() - iStart;

    err = cudaMemcpy(outputGPU, d_output, narray1 * narray2 * sizeof(mt), cudaMemcpyDeviceToHost);

    printf("Total computation time is %lf \n" , iElaps);

    DistanceCPU(array1, array2, narray1, narray2, outputCPU);

    mt error = 0;
    for (long i = 0; i < narray1 * narray2; i++)
    {
        error += abs(outputCPU[i] - outputGPU[i]);
    }
    error /= (narray2 * narray1);

    for (int i = 0; i < 20; i++)
    {
        printf("CPU result %f \n", outputCPU[i]);
        printf("GPU result %f \n", outputGPU[i]);
    }

    printf("Error is %f \n", error);

    delete [] array1;
    delete [] array2;
    delete [] outputCPU;
    delete [] outputGPU;

    return 0;
}
$ nvcc -o t415 t415.cu
t415.cu(87): warning: variable "err" was set but never used
$ ./t415
CUDA kernel launch with 14 blocks of 512 threads
Total computation time is 0.000042
CPU result 28.795628
GPU result 28.795628
CPU result 50.995567
GPU result 50.995567
CPU result 46.970348
GPU result 46.970348
CPU result 29.031254
GPU result 29.031254
CPU result 111.297745
GPU result 111.297745
CPU result 19.145151
GPU result 19.145149
CPU result 20.508183
GPU result 20.508183
CPU result 133.916077
GPU result 133.916077
CPU result 84.315201
GPU result 84.315201
CPU result 48.804039
GPU result 48.804039
CPU result 26.388403
GPU result 26.388403
CPU result 150.009735
GPU result 150.009735
CPU result 108.421936
GPU result 108.421936
CPU result 73.092339
GPU result 73.092331
CPU result 79.486023
GPU result 79.486023
CPU result 89.990150
GPU result 89.990150
CPU result 20.142567
GPU result 20.142567
CPU result 43.482445
GPU result 43.482445
CPU result 29.460800
GPU result 29.460800
CPU result 86.545860
GPU result 86.545860
Error is 0.000000
A few other notes:

- Compare the float computation with a double computation, to establish a baseline for which numbers are closer to the correct result. In some cases the GPU produces a number that is closer to the correct result than the corresponding CPU code does, so simply assuming that the CPU code is correct and then asking for an explanation of why the GPU code is different is not always the right approach. Here is an example of this kind of mistake.
- The kernel timing as written (with either the powf( ,2) or the plain-squaring version) mostly measures launch overhead, because the kernel launch is asynchronous. To time the kernel properly, add a cudaDeviceSynchronize(); call inside the timing region, immediately after the kernel call (see the sketch after this list).
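As a minimal sketch of both notes applied to the modified code above (this is a fragment, not a standalone program; it reuses cpuSecond, DistGPU, blocksPerGrid, threadsPerBlock, and the device pointers from that listing):

typedef double mt;   // note 1: rerun both versions with the typedef switched to double to get a reference result

double iStart = cpuSecond();
DistGPU<<<blocksPerGrid, threadsPerBlock>>>(d_array1, d_array2, narray1, narray2, d_output);
cudaDeviceSynchronize();              // note 2: wait for the asynchronous kernel to finish before stopping the clock
double iElaps = cpuSecond() - iStart;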
EDIT: Courtesy of @njuffa, who reminded me that it is easy to check the FMA-contraction hypothesis: if we recompile the previously modified code with -fmad=false, then we observe (at least as far as the printout goes) identical results between the GPU and the CPU. So this means that FMA contraction (on the GPU side) is probably the final contributor to the few remaining differences in the previous section. As mentioned in njuffa's comments, FMA contraction is likely to produce higher-accuracy results, so a possible explanation here is that the GPU results (with FMA contraction, as shown previously) are probably more accurate than the CPU results. Again, switching to double precision would help to confirm this; the code is already set up to make that easy with a change to the typedef. In any event, here is the output of the previous code (float, with ordinary squaring) compiled with -fmad=false:
$ nvcc -o t415 t415.cu -fmad=false
t415.cu(87): warning: variable "err" was set but never used
$ ./t415
CUDA kernel launch with 14 blocks of 512 threads
Total computation time is 0.000039
CPU result 28.795628
GPU result 28.795628
CPU result 50.995567
GPU result 50.995567
CPU result 46.970348
GPU result 46.970348
CPU result 29.031254
GPU result 29.031254
CPU result 111.297745
GPU result 111.297745
CPU result 19.145151
GPU result 19.145151
CPU result 20.508183
GPU result 20.508183
CPU result 133.916077
GPU result 133.916077
CPU result 84.315201
GPU result 84.315201
CPU result 48.804039
GPU result 48.804039
CPU result 26.388403
GPU result 26.388403
CPU result 150.009735
GPU result 150.009735
CPU result 108.421936
GPU result 108.421936
CPU result 73.092339
GPU result 73.092339
CPU result 79.486023
GPU result 79.486023
CPU result 89.990150
GPU result 89.990150
CPU result 20.142567
GPU result 20.142567
CPU result 43.482445
GPU result 43.482445
CPU result 29.460800
GPU result 29.460800
CPU result 86.545860
GPU result 86.545860
Error is 0.000000