当设备在此过程中处于活动状态时,无法设置 CUDA 固定内存实现错误

A23*_*577 0 c++ cuda

我想在我的代码中实现 GPU 的固定内存功能。为此,我编写代码如下:

bool addVectorGPU(float* M, float* N, float* P, int size)
{
// Error return value
cudaError_t status;
// BUG(review): cudaSetDeviceFlags() only succeeds before the CUDA context is
// created. On the second call of this function the device is already active,
// so this returns cudaErrorSetOnActiveProcess ("cannot set when device is
// active in this process"). It belongs once at the start of main().
cudaSetDeviceFlags(cudaDeviceMapHost);
// Number of bytes in the matrix.
int bytes = size * sizeof(float);
// Pointers to the device arrays
float *Md, *Nd, *Pd;
// Allocate memory on the device to store each matrix

// NOTE(review): cudaHostAlloc creates NEW pinned buffers and overwrites the
// local copies of the caller's pointers M/N/P, so the caller's input data is
// never read and its result buffer is never written. Pinning the caller's
// existing allocations would require cudaHostRegister instead.
cudaHostAlloc((void**)&M, bytes, cudaHostAllocMapped);
cudaHostAlloc((void**)&N, bytes, cudaHostAllocMapped);
cudaHostAlloc((void**)&P, bytes, cudaHostAllocMapped);
// Copy the host input data to the device

// Obtain device-side aliases of the mapped host buffers (zero-copy access).
cudaHostGetDevicePointer((void**)&Md, M, 0);
cudaHostGetDevicePointer((void**)&Nd, N, 0);
cudaHostGetDevicePointer((void**)&Pd, P, 0);

// Specify the size of the grid and the size of the block
dim3 dimBlock(TILE_SIZE); // Matrix is contained in a block
dim3 dimGrid((int)ceil((float)size / (float)TILE_SIZE)); 
// Launch the kernel on a size-by-size block of threads
addVectorKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, size);
// Wait for completion
// NOTE(review): cudaThreadSynchronize() is deprecated, and calling it right
// before cudaDeviceSynchronize() is redundant; one synchronization suffices.
cudaThreadSynchronize();
cudaDeviceSynchronize();
// Check for errors
// NOTE(review): only the last error is inspected; the return values of every
// allocation/mapping call above are silently discarded.
status = cudaGetLastError();
if (status != cudaSuccess) {
std::cout << "Kernel failed: " << cudaGetErrorString(status) <<
std::endl;
cudaFreeHost(M);
cudaFreeHost(N);
cudaFreeHost(P);

return false;
}
// Retrieve the result matrix
//cudaHostGetDevicePointer((void**)&Pd, P, 0);
// Free device memory
cudaFreeHost(M);
cudaFreeHost(N);
cudaFreeHost(P);
// NOTE(review): Md/Nd/Pd are mapped aliases of the host buffers obtained via
// cudaHostGetDevicePointer, not separate device allocations; passing them to
// cudaFree is invalid.
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
// Success
return true;
}
Run Code Online (Sandbox Code Playgroud)

现在,为了评估我的设备上的性能,我调用此函数 1000 次,然后计算运行所需的平均时间:

int main(){
// Timing data
float tcpuadd, tcpusub, tcpuscale, tgpuadd, tgpusub, tgpuscale, sum, delta, L2norm;
clock_t start, end;
bool success;

//Allocate the four vectors of SIZE floats
float* M = new float[SIZE];
float* N = new float[SIZE];
float* Pcpu = new float[SIZE];
float* Pgpu = new float[SIZE];
//Initialize M and N to random integers
for (int i = 0; i < SIZE; i ++){
M[i] = (float) rand()/(RAND_MAX);
N[i] = (float) rand()/(RAND_MAX);
}
printf("Operating on a vector of length %d\n", SIZE);
//Add two vectors and compute timing in CPU
start = clock();
for (int i = 0; i < ITERS; i++) {
addVectorCPU(M, N, Pcpu, SIZE);
}

end = clock();
tcpuadd = (float)(end - start) * 1000 / (float)CLOCKS_PER_SEC / ITERS;
printf( "CPU Addition took %f ms\n", tcpuadd);
//Add two vectors and compute timing in GPU
success = addVectorGPU(M, N ,Pgpu , SIZE);
if(!success)
{
    printf("Device Error!\n");
    return 1;
}
//compute GPU timing
start = clock();
for (int i = 0; i < ITERS; i++) {
addVectorGPU(M, N, Pgpu, SIZE);
}
end = clock();
tgpuadd = (float)(end - start) * 1000 / (float)CLOCKS_PER_SEC / ITERS;
printf("GPU Addition took %f ms\n", tgpuadd);
Run Code Online (Sandbox Code Playgroud)

问题是,这个函数第一次运行时没有任何错误。但是当我第二次调用这个函数时,我得到了错误:

cannot set when device is active in this process
Run Code Online (Sandbox Code Playgroud)

那么有人知道这到底是怎么回事吗?

Rob*_*lla 5

如果您通过检查每个运行时 API 调用的返回值来更好地进行 cuda 错误检查,您会发现第二次调用时会返回此错误:

cudaSetDeviceFlags(cudaDeviceMapHost);
Run Code Online (Sandbox Code Playgroud)

请注意此运行时 API 调用的描述:

如果当前设备已设置并且该设备已初始化,则此调用将失败并出现错误 cudaErrorSetOnActiveProcess。

解决方案是仅在应用程序开始时调用该函数一次,而不是每次调用 addVectorGPU 函数时都调用。将该调用从 addVectorGPU 函数中取出,放入您的 main 例程中,置于第一次调用 addVectorGPU 之前。

根据以下问题,代码还存在各种其他问题:

  1. 我建议对所有内核调用和所有 CUDA API 调用实施适当的 cuda 错误检查,而不是在例程结束时进行一次检查。

  2. cudaHostAlloc 的用法不正确。该程序的目的似乎是将指向主机驻留数据的主机指针传递给 GPU 例程,然后使用零复制技术对这些数据求和。这在技术上是可行的(虽然会很慢),但正确的方法是使用 cudaHostRegister 而不是 cudaHostAlloc:cudaHostAlloc 会创建一个新的分配,因此传递给该函数的现有数据不会以这种方式被使用或引用。

这是一个基于您所展示内容的有效示例。请注意,我个人不会以这种方式进行基准测试,但我提供这一点是为了表明该过程可以以无错误的方式工作:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>

#define TILE_SIZE 512
#define SIZE 1048576
#define ITERS 10

// CPU reference implementation: element-wise sum P[i] = M[i] + N[i]
// for i in [0, size). Always reports success.
bool addVectorCPU(float *M, float *N, float *P, int size){

  int idx = 0;
  while (idx < size) {
    P[idx] = M[idx] + N[idx];
    ++idx;
  }
  return true;
}
// Element-wise vector add: one thread per element, P[idx] = M[idx] + N[idx].
// Launch with any 1-D grid/block covering 'size'; excess tail threads exit.
__global__ void addVectorKernel(float *M, float *N, float *P,int  size){

  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= size)
    return;                    // guard the grid tail
  P[idx] = M[idx] + N[idx];
}

// Prints a labeled CUDA error message; returns true iff an error occurred.
static bool reportCudaError(cudaError_t status, const char *what)
{
  if (status == cudaSuccess) return false;
  std::cout << what << " failed: " << cudaGetErrorString(status) << std::endl;
  return true;
}

// Adds two host-resident vectors on the GPU via zero-copy (mapped pinned
// memory): P[i] = M[i] + N[i] for i in [0, size).
// Preconditions: cudaSetDeviceFlags(cudaDeviceMapHost) was called once,
// before the CUDA context was created; M, N, P each hold >= 'size' floats.
// Returns true on success, false if any CUDA call or the kernel fails.
// Every CUDA API return value is checked, per the advice in the answer text.
bool addVectorGPU(float* M, float* N, float* P, int size)
{
// Number of bytes in each vector; size_t guards against int overflow.
  size_t bytes = (size_t)size * sizeof(float);
// Device-side aliases of the mapped host arrays
  float *Md, *Nd, *Pd;

// Pin the caller's existing allocations in place and map them into the
// device address space. cudaHostRegister (unlike cudaHostAlloc) reuses the
// caller's buffers instead of allocating new ones.
  if (reportCudaError(cudaHostRegister(M, bytes, cudaHostRegisterMapped),
                      "cudaHostRegister(M)"))
    return false;
  if (reportCudaError(cudaHostRegister(N, bytes, cudaHostRegisterMapped),
                      "cudaHostRegister(N)")) {
    cudaHostUnregister(M);
    return false;
  }
  if (reportCudaError(cudaHostRegister(P, bytes, cudaHostRegisterMapped),
                      "cudaHostRegister(P)")) {
    cudaHostUnregister(N);
    cudaHostUnregister(M);
    return false;
  }

// Fetch the device pointers that alias the pinned host buffers.
  bool ok =
      !reportCudaError(cudaHostGetDevicePointer((void**)&Md, M, 0),
                       "cudaHostGetDevicePointer(M)") &&
      !reportCudaError(cudaHostGetDevicePointer((void**)&Nd, N, 0),
                       "cudaHostGetDevicePointer(N)") &&
      !reportCudaError(cudaHostGetDevicePointer((void**)&Pd, P, 0),
                       "cudaHostGetDevicePointer(P)");

  if (ok) {
// One thread per element; integer ceil-division covers the grid tail
// without the float-precision risk of ceil((float)size / TILE_SIZE).
    dim3 dimBlock(TILE_SIZE);
    dim3 dimGrid((size + TILE_SIZE - 1) / TILE_SIZE);
    addVectorKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, size);
// Launch-configuration errors surface via cudaGetLastError();
// asynchronous execution errors surface at cudaDeviceSynchronize().
    ok = !reportCudaError(cudaGetLastError(), "Kernel launch") &&
         !reportCudaError(cudaDeviceSynchronize(), "Kernel");
  }

// Unpin the host buffers on every path, success or failure.
  cudaHostUnregister(P);
  cudaHostUnregister(N);
  cudaHostUnregister(M);
  return ok;
}

// Benchmarks element-wise vector addition: CPU loop vs GPU zero-copy,
// each averaged over ITERS runs of a SIZE-element vector.
int main(){
// Timing bookkeeping
  clock_t t0, t1;

// Allocate the four vectors of SIZE floats
  float* M    = new float[SIZE];
  float* N    = new float[SIZE];
  float* Pcpu = new float[SIZE];
  float* Pgpu = new float[SIZE];
// Fill the inputs with uniform random values in [0, 1]
  for (int i = 0; i < SIZE; ++i) {
    M[i] = rand() / (float)RAND_MAX;
    N[i] = rand() / (float)RAND_MAX;
  }
  printf("Operating on a vector of length %d\n", SIZE);

// Time the CPU version
  t0 = clock();
  for (int iter = 0; iter < ITERS; ++iter)
    addVectorCPU(M, N, Pcpu, SIZE);
  t1 = clock();
  float tcpuadd = (float)(t1 - t0) * 1000 / (float)CLOCKS_PER_SEC / ITERS;
  printf( "CPU Addition took %f ms\n", tcpuadd);

// The map-host flag must be set exactly once, before the CUDA context
// exists — NOT inside addVectorGPU (see the answer above the code).
  cudaSetDeviceFlags(cudaDeviceMapHost);
  bool success = addVectorGPU(M, N, Pgpu, SIZE);
  if (!success) {
    printf("Device Error!\n");
    return 1;
  }
// Time the GPU version
  t0 = clock();
  for (int iter = 0; iter < ITERS; ++iter)
    addVectorGPU(M, N, Pgpu, SIZE);
  t1 = clock();
  float tgpuadd = (float)(t1 - t0) * 1000 / (float)CLOCKS_PER_SEC / ITERS;
  printf("GPU Addition took %f ms\n", tgpuadd);
}
Run Code Online (Sandbox Code Playgroud)

请注意,我还做了一些其他更改。例如,cudaThreadSynchronize() 已弃用,并且没有必要同时使用 cudaThreadSynchronize() 和 cudaDeviceSynchronize();它们是多余的。