I have the following two almost identical example codes. code1.cu uses cudaMalloc and cudaMemcpy to handle the exchange of variable values between device and host.

code2.cu uses cudaMallocManaged instead, so cudaMemcpy is not needed. However, when using cudaMallocManaged I must include cudaDeviceSynchronize() to get the correct result, whereas in the cudaMalloc case this is not necessary. I would appreciate some hints on why this happens.
code2.cu:
```cpp
#include <iostream>
#include <math.h>
#include <vector>

using namespace std;

// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y*blockDim.y + threadIdx.y;
    if(i < max_x && j < max_y) {
        atomicAdd(tot, x[i] + y[j]);
    }
}

int main(void)
{
    int Nx = 1<<15;
    int Ny = 1<<15;
    float *d_x = NULL, *d_y = NULL;
    float *d_tot = NULL;
    cudaMalloc((void **)&d_x, sizeof(float)*Nx);
    cudaMalloc((void **)&d_y, sizeof(float)*Ny);
    cudaMallocManaged((void **)&d_tot, sizeof(float));

    // Allocate Unified Memory - accessible from CPU or GPU
    vector<float> vx;
    vector<float> vy;

    // initialize x and y arrays on the host
    for (int i = 0; i < Nx; i++)
        vx.push_back(i);

    for (int i = 0; i < Ny; i++)
        vy.push_back(i*10);

    // reference result on the CPU
    float tot = 0;
    for(int i = 0; i < vx.size(); i++)
        for(int j = 0; j < vy.size(); j++)
            tot += vx[i] + vy[j];

    cout << "CPU: tot: " << tot << endl;

    cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);

    int blockSize;   // The launch configurator returned block size
    int minGridSize; // The minimum grid size needed to achieve maximum occupancy
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, add, 0, Nx+Ny);

    //.. bx*by can not go beyond blockSize, or the hardware limit, which is 1024;
    //.. bx*by = blockSize && bx/by = Nx/Ny, solve the equations
    int bx = sqrt(blockSize*Nx/(float)Ny);
    int by = bx*Ny/(float)Nx;
    dim3 blockSize_3D(bx, by);
    dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by); // note: (Ny+by+1) is likely a typo for (Ny+by-1)

    cout << "blockSize: " << blockSize << endl;
    cout << "bx: " << bx << " by: " << by
         << " gx: " << gridSize_3D.x << " gy: " << gridSize_3D.y << endl;

    // calculate theoretical occupancy
    int maxActiveBlocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, add, blockSize, 0);

    int device;
    cudaDeviceProp props;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&props, device);

    float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
                      (float)(props.maxThreadsPerMultiProcessor /
                              props.warpSize);

    printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
           blockSize, occupancy);

    // Run kernel on the GPU
    tot = 0;
    add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);

    // Wait for GPU to finish before accessing on host
    //cudaDeviceSynchronize();

    tot = *d_tot;

    cout << " GPU: tot: " << tot << endl;

    // Free memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_tot);

    return 0;
}
```

code1.cu:
```cpp
#include <iostream>
#include <math.h>
#include <vector>

using namespace std;

// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y*blockDim.y + threadIdx.y;
    if(i < max_x && j < max_y) {
        atomicAdd(tot, x[i] + y[j]);
    }
}

int main(void)
{
    int Nx = 1<<15;
    int Ny = 1<<15;
    float *d_x = NULL, *d_y = NULL;
    float *d_tot = NULL;
    cudaMalloc((void **)&d_x, sizeof(float)*Nx);
    cudaMalloc((void **)&d_y, sizeof(float)*Ny);
    cudaMalloc((void **)&d_tot, sizeof(float));

    vector<float> vx;
    vector<float> vy;

    // initialize x and y arrays on the host
    for (int i = 0; i < Nx; i++)
        vx.push_back(i);

    for (int i = 0; i < Ny; i++)
        vy.push_back(i*10);

    // reference result on the CPU
    float tot = 0;
    for(int i = 0; i < vx.size(); i++)
        for(int j = 0; j < vy.size(); j++)
            tot += vx[i] + vy[j];

    cout << "CPU: tot: " << tot << endl;

    cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);

    int blockSize;   // The launch configurator returned block size
    int minGridSize; // The minimum grid size needed to achieve maximum occupancy
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, add, 0, Nx+Ny);

    //.. bx*by can not go beyond blockSize, or the hardware limit, which is 1024;
    //.. bx*by = blockSize && bx/by = Nx/Ny, solve the equations
    int bx = sqrt(blockSize*Nx/(float)Ny);
    int by = bx*Ny/(float)Nx;
    dim3 blockSize_3D(bx, by);
    dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by); // note: (Ny+by+1) is likely a typo for (Ny+by-1)

    cout << "blockSize: " << blockSize << endl;
    cout << "bx: " << bx << " by: " << by
         << " gx: " << gridSize_3D.x << " gy: " << gridSize_3D.y << endl;

    // calculate theoretical occupancy
    int maxActiveBlocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, add, blockSize, 0);

    int device;
    cudaDeviceProp props;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&props, device);

    float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
                      (float)(props.maxThreadsPerMultiProcessor /
                              props.warpSize);

    printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
           blockSize, occupancy);

    // Run kernel on the GPU
    tot = 0;
    add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);

    // Wait for GPU to finish before accessing on host
    //cudaDeviceSynchronize();

    cudaMemcpy(&tot, d_tot, sizeof(float), cudaMemcpyDeviceToHost);

    cout << " GPU: tot: " << tot << endl;

    // Free memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_tot);

    return 0;
}
```

code2.cu produces the following output:

```
CPU: tot: 8.79609e+12
blockSize: 1024
bx: 32 by: 32 gx: 1024 gy: 1025
Launched blocks of size 1024. Theoretical occupancy: 1.000000
GPU: tot: 0
```

After uncommenting cudaDeviceSynchronize(), the output becomes:
```
GPU: tot: 8.79609e+12
```
CUDA kernel launches are asynchronous. This means that they execute independently of the CPU thread that launched them.

Because of this asynchronous launch, there is no guarantee that the CUDA kernel has completed (or even started) by the time your CPU thread code begins testing the result.

It is therefore necessary to wait until the GPU kernel has finished, and cudaDeviceSynchronize() accomplishes exactly that. cudaMemcpy also has a synchronizing effect, so when you removed the cudaMemcpy operation you lost that synchronization, and cudaDeviceSynchronize() restores it.
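To make the difference concrete, here is a minimal sketch contrasting the two patterns. It is not taken from the question; the write42 kernel and the variable names are invented for illustration:

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical single-thread kernel, used only to illustrate ordering.
__global__ void write42(float *out) { *out = 42.0f; }

int main(void)
{
    // Pattern 1: managed memory. The host can dereference the pointer
    // directly, so nothing forces it to wait for the kernel unless we
    // synchronize explicitly.
    float *m = NULL;
    cudaMallocManaged((void **)&m, sizeof(float));
    *m = 0.0f;
    write42<<<1, 1>>>(m);
    cudaDeviceSynchronize();                // without this, reading *m races with the kernel
    printf("managed + sync   : %f\n", *m);  // 42.000000

    // Pattern 2: device memory + cudaMemcpy. The copy is issued into the
    // same (default) stream as the kernel, so it cannot start until the
    // kernel finishes, and it blocks the host until the data has arrived.
    float *d = NULL, h = 0.0f;
    cudaMalloc((void **)&d, sizeof(float));
    write42<<<1, 1>>>(d);
    cudaMemcpy(&h, d, sizeof(float), cudaMemcpyDeviceToHost);  // implicit sync
    printf("memcpy (implicit): %f\n", h);   // 42.000000

    cudaFree(m);
    cudaFree(d);
    return 0;
}
```

The cudaMemcpy in code1.cu plays the same role as the explicit cudaDeviceSynchronize() in code2.cu: it cannot begin until the kernel in the same stream has finished and it blocks the host until the copy completes, whereas the plain host read `tot = *d_tot;` in code2.cu imposes no such ordering.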