为什么转换CUDA网格(但不是它的线程块)仍然会减慢计算速度？

Question

为什么转换CUDA网格(但不是它的线程块)仍然会减慢计算速度？

Cri*_*rro 13 c++ performance cuda block slowdown

编辑:似乎,至少在这种情况下,转置网格对L2缓存带宽有负面影响.这是从视觉分析器获得的.原因尚不清楚.

我已经到了需要转置CUDA网格的GPU计算情况.因此,如果block_ {x,y}最初作用于数据区域d_ {x,y},则它现在作用于数据区域d_ {y,x},因此block_ {y,x}将作用于数据区域d_ {x, y}.下图显示了一个示例.

值得一提的是,线程不会在每个块内部进行转置,也就是说,一旦定位了块,threadIdx.x和threadIdx.y值分别以正常方式用于它们的x和y偏移.

据我所知,理论上这个设计应该不会对性能产生任何影响,因为内存合并模式仍然保留,即块内的线程没有转置,只是网格重新排列其块.但是我发现在转置网格时,内核运行大约.比正常情况慢2倍.我做了一个玩具示例来说明情况.

?  transpose-grid ./prog 10000 10000 100 0
init data.....................done: zero matrix of 10000 x 10000
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(313, 313, 1)
normal_kernel (100 rep).......done: 0.935132 ms
verifying correctness.........ok
?  transpose-grid ./prog 10000 10000 100 1
init data.....................done: zero matrix of 10000 x 10000
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(313, 313, 1)
transp_kernel (100 rep).......done: 1.980445 ms
verifying correctness.........ok

Run Code Online (Sandbox Code Playgroud)

我真的很感激这个问题的任何解释.以下是重现行为的源代码.

 // -----------------------------------
 // can compile as nvcc main.cu -o prog
 // -----------------------------------

 #include <cuda.h>
 #include <cstdio>

 #define BSIZE2D 32

 __global__ void normal_kernel(int *dmat, const int m, const int n){
     const int i = blockIdx.y*blockDim.y + threadIdx.y;
     const int j = blockIdx.x*blockDim.x + threadIdx.x;
     if(i < m && j < n){
         dmat[i*n + j] = 1;
     }
 }

 __global__ void transp_kernel(int *dmat, const int m, const int n){
     const int i = blockIdx.x*blockDim.x + threadIdx.y;
     const int j = blockIdx.y*blockDim.y + threadIdx.x;
     if(i < m && j < n){
         dmat[i*n + j] = 1;
     }
 }



 int verify(int *hmat, const int m, const int n){
     printf("verifying correctness........."); fflush(stdout);
     for(int i=0; i<m*n; ++i){
         if(hmat[i] != 1){
             fprintf(stderr, "Incorrect value at m[%i,%i] = %i\n", i/n, i%n);
             return 0;
         }
     }
     printf("ok\n"); fflush(stdout);
     return 1;
 }
 int main(int argc, char **argv){
     if(argc != 5){
         printf("\nrun as ./prog m n r t\n\nr = number of repeats\nt = transpose (1 or 0)\n");
         exit(EXIT_FAILURE);
     }
     const int m = atoi(argv[1]);
     const int n = atoi(argv[2]);
     const int r = atoi(argv[3]);
     const int t = atoi(argv[4]);
     const unsigned int size = m*n;
     cudaEvent_t start, stop;
     cudaEventCreate(&start);
     cudaEventCreate(&stop);
     float time;
     int *hmat, *dmat;


     printf("init data....................."); fflush(stdout);
     hmat = (int*)malloc(sizeof(int)*(size));
     for(int i=0; i<size; ++i){
         hmat[i] = 0;
     }
     printf("done: zero matrix of %i rows x %i cols\n", m, n);




     printf("copy data to GPU.............."); fflush(stdout);
     cudaMalloc(&dmat, sizeof(int)*(size));
     cudaMemcpy(dmat, hmat, sizeof(int)*(size), cudaMemcpyHostToDevice);
     printf("done\n");



     printf("preparing grid................"); fflush(stdout);
     dim3 block(BSIZE2D, BSIZE2D, 1);
     dim3 grid;
     // if transpose or not
     if(t){
         grid = dim3((m + BSIZE2D - 1)/BSIZE2D, (n + BSIZE2D - 1)/BSIZE2D, 1);
     }
     else{
         grid = dim3((n + BSIZE2D - 1)/BSIZE2D, (m + BSIZE2D - 1)/BSIZE2D, 1);
     }
     printf("done: block(%i, %i, %i), grid(%i, %i, %i)\n", block.x, block.y, block.z, grid.x, grid.y, grid.z);


     if(t){
         printf("transp_kernel (%3i rep).......", r); fflush(stdout);
         cudaEventRecord(start, 0);
         for(int i=0; i<r; ++i){
             transp_kernel<<<grid, block>>>(dmat, m, n);
             cudaDeviceSynchronize();
         }
         cudaEventRecord(stop,0);
         cudaEventSynchronize(stop);
         cudaEventElapsedTime(&time, start, stop); // that's our time!
         printf("done: %f ms\n", time/(float)r);
     }
     else{
         printf("normal_kernel (%3i rep).......", r); fflush(stdout);
         cudaEventRecord(start, 0);
         for(int i=0; i<r; ++i){
             normal_kernel<<<grid, block>>>(dmat, m, n);
             cudaDeviceSynchronize();
         }
         cudaEventRecord(stop,0);
         cudaEventSynchronize(stop);
         cudaEventElapsedTime(&time, start, stop); // that's our time!
         printf("done: %f ms\n", time/(float)r);
     }


     cudaMemcpy(hmat, dmat, sizeof(int)*size, cudaMemcpyDeviceToHost);
     verify(hmat, m, n);
     exit(EXIT_SUCCESS);
 }

Run Code Online (Sandbox Code Playgroud)

Answer 1

Krz*_*tof 1

由于我找不到任何关于该主题的文献，所以这里是我的猜测解释，而不是基于经验（我的记忆阅读速度的老问题）。

正如您所写，您的示例保留了内存合并模式，但仅在扭曲级别（连续 32 个线程）上完成。但是为了实现全速，需要在扭曲间级别上进行合并 - 这里的原因尚不清楚，这种合并是否确实完成了，或者缓存和内存在这种情况下工作得更好（可能正如这里所描述的，我们可以更好地利用内存突发模式）。

因此，在您的normal_kernel执行中，不仅合并单个扭曲，而且还合并来自下一个块的扭曲。

为了在您的示例中进行检查，我修改了您的代码以使用不同的块大小，以下是我在 1080Ti 上的结果：

块大小 (32, 32) 与您的相同：

~$ ./prog 10240 10240 100 0
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(320, 320, 1)
normal_kernel (100 rep).......done: 1.020545 ms
verifying correctness.........ok

~$ ./prog 10240 10240 100 1
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(320, 320, 1)
transp_kernel (100 rep).......done: 1.564084 ms
verifying correctness.........ok

Run Code Online (Sandbox Code Playgroud)

不幸的是，块大小 (64, 16) 我们无法创建 64,64，因为 #threads 限制在一个块中：

~$ ./prog 10240 10240 100 0
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(64, 16, 1), grid(160, 640, 1)
normal_kernel (100 rep).......done: 1.020420 ms
verifying correctness.........ok

~$ ./prog 10240 10240 100 1
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(64, 16, 1), grid(160, 640, 1)
transp_kernel (100 rep).......done: 1.205506 ms
verifying correctness.........ok

Run Code Online (Sandbox Code Playgroud)

块大小（128、8）：

~$ ./prog 10240 10240 100 0
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(128, 8, 1), grid(80, 1280, 1)
normal_kernel (100 rep).......done: 1.019547 ms
verifying correctness.........ok

~$ ./prog 10240 10240 100 1
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(128, 8, 1), grid(80, 1280, 1)
transp_kernel (100 rep).......done: 1.058236 ms
verifying correctness.........ok

Run Code Online (Sandbox Code Playgroud)

我不确定这是否有助于解决您的特定问题，但至少我们有更多数据可以讨论。

归档时间：	8 年前
查看次数：	304 次
最近记录：	6 年，11 月前