cudaMallocManaged and cudaDeviceSynchronize()

Wil*_*iam 2 cuda

I have the following two nearly identical sample codes. code1.cu uses cudaMalloc and cudaMemcpy to handle the device/host value exchange.


code2.cu uses cudaMallocManaged, so cudaMemcpy is not needed. When cudaMallocManaged is used, I have to include cudaDeviceSynchronize() to get the correct result, whereas for the cudaMalloc case this is not needed. I would appreciate some hints on why this happens.


code2.cu

#include <iostream>
#include <math.h>
#include <vector>

using namespace std;

// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y*blockDim.y + threadIdx.y;
    if(i < max_x && j < max_y) {
        atomicAdd(tot, x[i] + y[j]);
    }
}

int main(void)
{
    int Nx = 1<<15;
    int Ny = 1<<15;
    float *d_x = NULL, *d_y = NULL;
    float *d_tot = NULL;
    cudaMalloc((void **)&d_x, sizeof(float)*Nx);
    cudaMalloc((void **)&d_y, sizeof(float)*Ny);
    cudaMallocManaged((void **)&d_tot, sizeof(float)); // Unified Memory - accessible from CPU or GPU

    vector<float> vx;
    vector<float> vy;

    // initialize x and y arrays on the host
    for (int i = 0; i < Nx; i++)
        vx.push_back(i);

    for (int i = 0; i < Ny; i++)
        vy.push_back(i*10);

    // reference result on the CPU
    float tot = 0;
    for(int i = 0; i < vx.size(); i++)
        for(int j = 0; j < vy.size(); j++)
            tot += vx[i] + vy[j];

    cout<<"CPU: tot: "<<tot<<endl;

    cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);

    int blockSize;   // The launch configurator returned block size
    int minGridSize; // The minimum grid size needed to achieve the maximum occupancy
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, add, 0, Nx+Ny);

    //.. bx*by can not go beyond the blockSize, or hardware limit, which is 1024;
    //.. bx*by = blockSize && bx/by = Nx/Ny, solve the equation
    int bx = sqrt(blockSize*Nx/(float)Ny);
    int by = bx*Ny/(float)Nx;
    dim3 blockSize_3D(bx, by);
    dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);

    cout<<"blockSize: "<<blockSize<<endl;
    cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;

    // calculate theoretical occupancy
    int maxActiveBlocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, add, blockSize, 0);

    int device;
    cudaDeviceProp props;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&props, device);

    float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
        (float)(props.maxThreadsPerMultiProcessor /
                props.warpSize);

    printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
            blockSize, occupancy);

    // Run kernel on the GPU
    tot = 0;
    add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);

    // Wait for GPU to finish before accessing on host
    //cudaDeviceSynchronize();

    tot = *d_tot;

    cout<<" GPU: tot: "<<tot<<endl;

    // Free memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_tot);

    return 0;
}
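As a quick sanity check on the launch geometry above (my own arithmetic, not part of the original post), with Nx = Ny = 1<<15 = 32768 and the blockSize of 1024 that the occupancy API reports for this kernel:

    bx = sqrt(1024 * 32768 / 32768.0) = sqrt(1024) = 32
    by = 32 * 32768 / 32768.0 = 32
    gx = (32768 + 32 - 1) / 32 = 1024   (integer division)
    gy = (32768 + 32 + 1) / 32 = 1025   (the +1 over-allocates one extra row of blocks,
                                         which the in-kernel bounds check renders harmless)

These values match the "bx: 32 by: 32 gx: 1024 gy: 1025" line in the output shown further below.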

code1.cu

#include <iostream>
#include <math.h>
#include <vector>

using namespace std;

// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y*blockDim.y + threadIdx.y;
    if(i < max_x && j < max_y) {
        atomicAdd(tot, x[i] + y[j]);
    }
}

int main(void)
{
    int Nx = 1<<15;
    int Ny = 1<<15;
    float *d_x = NULL, *d_y = NULL;
    float *d_tot = NULL;
    cudaMalloc((void **)&d_x, sizeof(float)*Nx);
    cudaMalloc((void **)&d_y, sizeof(float)*Ny);
    cudaMalloc((void **)&d_tot, sizeof(float));

    vector<float> vx;
    vector<float> vy;

    // initialize x and y arrays on the host
    for (int i = 0; i < Nx; i++)
        vx.push_back(i);

    for (int i = 0; i < Ny; i++)
        vy.push_back(i*10);

    // reference result on the CPU
    float tot = 0;
    for(int i = 0; i < vx.size(); i++)
        for(int j = 0; j < vy.size(); j++)
            tot += vx[i] + vy[j];

    cout<<"CPU: tot: "<<tot<<endl;

    cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);

    int blockSize;   // The launch configurator returned block size
    int minGridSize; // The minimum grid size needed to achieve the maximum occupancy
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, add, 0, Nx+Ny);

    //.. bx*by can not go beyond the blockSize, or hardware limit, which is 1024;
    //.. bx*by = blockSize && bx/by = Nx/Ny, solve the equation
    int bx = sqrt(blockSize*Nx/(float)Ny);
    int by = bx*Ny/(float)Nx;
    dim3 blockSize_3D(bx, by);
    dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);

    cout<<"blockSize: "<<blockSize<<endl;
    cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;

    // calculate theoretical occupancy
    int maxActiveBlocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, add, blockSize, 0);

    int device;
    cudaDeviceProp props;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&props, device);

    float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
        (float)(props.maxThreadsPerMultiProcessor /
                props.warpSize);

    printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
            blockSize, occupancy);

    // Run kernel on the GPU
    tot = 0;
    add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);

    // Wait for GPU to finish before accessing on host
    //cudaDeviceSynchronize();

    cudaMemcpy(&tot, d_tot, sizeof(float), cudaMemcpyDeviceToHost);

    cout<<" GPU: tot: "<<tot<<endl;

    // Free memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_tot);

    return 0;
}

code2.cu produces the following output:

CPU: tot: 8.79609e+12
blockSize: 1024
bx: 32 by: 32 gx: 1024 gy: 1025
Launched blocks of size 1024. Theoretical occupancy: 1.000000
GPU: tot: 0

After uncommenting the cudaDeviceSynchronize() call, the output becomes:


GPU: tot: 8.79609e+12


Rob*_*lla 5

CUDA kernel launches are asynchronous. That means they execute independently of the CPU thread that launched them.

Because of this asynchronous launch, by the time your CPU thread code begins testing the result, there is no guarantee that the CUDA kernel has completed (or even started).
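To see this, here is a minimal sketch of my own (not from the original post; it reuses the launch from the code above). Querying the default stream right after the launch will typically report that the work is still pending:

// The launch only enqueues the kernel; control returns to the CPU immediately.
add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);

// Querying the default stream here usually yields cudaErrorNotReady, i.e. the
// kernel has not completed yet (and may not even have started).
if (cudaStreamQuery(0) == cudaErrorNotReady) {
    // a host read of *d_tot at this point would race with the GPU
}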

Therefore it is necessary to wait until the GPU kernel is complete, and cudaDeviceSynchronize() accomplishes that. cudaMemcpy also has a synchronizing effect, so when you removed the cudaMemcpy operation you lost that synchronization, but cudaDeviceSynchronize() restores it.
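Condensed into a sketch (taken from the two programs above, error checking omitted), the two read-back patterns differ only in where the wait happens:

add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);  // asynchronous launch

// code1.cu pattern: cudaMemcpy blocks the host thread until the preceding
// kernel in the same stream has finished, so the copy delivers the final sum.
cudaMemcpy(&tot, d_tot, sizeof(float), cudaMemcpyDeviceToHost);

// code2.cu pattern: a plain host read of managed memory performs no implicit
// synchronization, so the kernel must be waited on explicitly first.
cudaDeviceSynchronize();
tot = *d_tot;

Without that cudaDeviceSynchronize(), the host read in code2.cu races with the still-running kernel and picks up whatever value *d_tot holds at that moment, which is why the unsynchronized run prints GPU: tot: 0.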