定义了如何处理错误:
// Prints a human-readable description of a CUDA runtime error and aborts.
//   err:  status code returned by a CUDA runtime API call
//   file: source file of the call site (pass __FILE__)
//   line: line number of the call site (pass __LINE__)
// Diagnostics go to stderr so they are not lost or interleaved with
// program output when stdout is redirected.
static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        fprintf( stderr, "%s in %s at line %d\n", cudaGetErrorString( err ),
                 file, line );
        exit( EXIT_FAILURE );
    }
}
// Wraps a CUDA runtime call so failures report the exact call site.
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
Run Code Online (Sandbox Code Playgroud)
通常，若要把 N 个 double 类型的结果存入设备端数组 d_results（假设该数组可以一次性在 GPU 内存中分配成功），我们可以按如下方式把数据从设备传回主机:
// Device-side output buffer: room for N doubles in GPU global memory.
double *d_results;
HANDLE_ERROR(cudaMalloc(&d_results,N*sizeof(double)));
// Launch our kernel to do some computations and store the results in d_results.
.....
// Blocking copy of all N results from device memory into a host vector.
// NOTE(review): this cudaMemcpy return value is not checked — consider
// wrapping it in HANDLE_ERROR like the allocation above.
vector<double> results(N);
cudaMemcpy(results.data(),d_results,N*sizeof(double),cudaMemcpyDeviceToHost);
Run Code Online (Sandbox Code Playgroud)
如果第二行因为设备内存不足、无法一次性容纳所有结果而失败，该如何完成计算并把结果正确传输到主机？是否必须分批进行计算？我宁愿避免手动批处理。在 CUDA 中处理这种情况的标准方法是什么？
批处理是最好的方法。如果您执行以下操作,则可以自动化大部分批处理过程:
#include <assert.h>
#include <iostream>
// Demonstrates automatic batching: grab the largest device buffer that will
// fit, then stream a host array through it chunk by chunk.
// cudaMemset(0xff) stands in for a real kernel; the final assert loop checks
// every element made the round trip.
int main()
{
    // Allocate a 4 GB source array on the host (2^30 ints * 4 bytes).
    const size_t N = 1 << 30;
    int * data = new int[N];

    // Query free device memory, then back off in 1 MB steps until an
    // allocation succeeds (reported free memory is not always allocatable
    // as one contiguous block).
    size_t total_mem, free_mem;
    cudaError_t status = cudaMemGetInfo(&free_mem, &total_mem);
    if (status != cudaSuccess) {
        std::cerr << "cudaMemGetInfo failed: "
                  << cudaGetErrorString(status) << std::endl;
        delete [] data;
        return 1;
    }
    const size_t MB = 1 << 20;
    int *buffer = 0;
    size_t buffer_size = free_mem;
    // Pre-set to a failure code: if free_mem <= MB the loop body never runs
    // and status would otherwise be read uninitialized.
    status = cudaErrorMemoryAllocation;
    for(; buffer_size > MB; buffer_size -= MB) {
        status = cudaMalloc((void **)&buffer, buffer_size);
        if (status == cudaSuccess)
            break;
    }
    if (status != cudaSuccess) {
        // Could not obtain even ~1 MB of device memory; nothing useful to do.
        std::cerr << "Failed to allocate a device buffer: "
                  << cudaGetErrorString(status) << std::endl;
        delete [] data;
        return 1;
    }
    std::cout << "Allocated " << buffer_size << " bytes on GPU" << std::endl;

    // Loop through host source data in batches sized to the device buffer.
    std::cout << N << " items require processing" << std::endl;
    size_t batchN = buffer_size / sizeof(int);
    size_t remainN = N;
    int * dp = data;
    std::cout << "Using batch size " << batchN << std::endl;
    for(; remainN > 0; remainN -= batchN) {
        // Clamp the final (possibly partial) batch.
        batchN = (remainN < batchN) ? remainN : batchN;
        size_t worksize = batchN * sizeof(int);
        std::cout << "Processing batch of size " << batchN;
        std::cout << "," << remainN << " items remaining" << std::endl;
        status = cudaMemcpy(buffer, dp, worksize, cudaMemcpyHostToDevice);
        if (status != cudaSuccess) break;
        // Proxy for a real kernel launch operating on the batch in-place.
        status = cudaMemset(buffer, 0xff, worksize);
        if (status != cudaSuccess) break;
        status = cudaMemcpy(dp, buffer, worksize, cudaMemcpyDeviceToHost);
        if (status != cudaSuccess) break;
        dp += batchN;
    }
    if (status != cudaSuccess) {
        std::cerr << "Batch processing failed: "
                  << cudaGetErrorString(status) << std::endl;
    } else {
        // Every byte was set to 0xff on the device, so each int reads back
        // as 0xffffffff.
        for(size_t i = 0; i < N; i++) {
            assert(data[i] == 0xffffffff);
        }
    }

    // Release resources explicitly rather than relying on process teardown.
    cudaFree(buffer);
    delete [] data;
    cudaDeviceReset();
    return (status == cudaSuccess) ? 0 : 1;
}
Run Code Online (Sandbox Code Playgroud)
这基本上是一段演示该思路的示例代码。
在上面的代码中,我使用 cudaMemset 作为真实内核的代理,但它让您了解需要什么。如果您想要更高级,您可以使用两个缓冲区和流(带有注册/固定的主机内存)并异步复制以获得计算/复制重叠,这将提高非平凡情况下的整体性能。