Cri*_*rro 13 c++ performance cuda block slowdown
编辑:似乎,至少在这种情况下,转置网格对L2缓存带宽有负面影响.这是从视觉分析器获得的.原因尚不清楚.
我已经到了需要转置CUDA网格的GPU计算情况.因此,如果block_ {x,y}最初作用于数据区域d_ {x,y},则它现在作用于数据区域d_ {y,x},因此block_ {y,x}将作用于数据区域d_ {x, y}.下图显示了一个示例.

值得一提的是,线程不会在每个块内部进行转置,也就是说,一旦定位了块,threadIdx.x和threadIdx.y值分别以正常方式用于它们的x和y偏移.
据我所知,理论上这个设计应该不会对性能产生任何影响,因为内存合并模式仍然保留,即块内的线程没有转置,只是网格重新排列其块.但是我发现在转置网格时,内核运行大约.比正常情况慢2倍.我做了一个玩具示例来说明情况.
? transpose-grid ./prog 10000 10000 100 0
init data.....................done: zero matrix of 10000 x 10000
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(313, 313, 1)
normal_kernel (100 rep).......done: 0.935132 ms
verifying correctness.........ok
? transpose-grid ./prog 10000 10000 100 1
init data.....................done: zero matrix of 10000 x 10000
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(313, 313, 1)
transp_kernel (100 rep).......done: 1.980445 ms
verifying correctness.........ok
Run Code Online (Sandbox Code Playgroud)
我真的很感激这个问题的任何解释.以下是重现行为的源代码.
// -----------------------------------
// can compile as nvcc main.cu -o prog
// -----------------------------------
#include <cuda.h>
#include <cstdio>
#define BSIZE2D 32
__global__ void normal_kernel(int *dmat, const int m, const int n){
const int i = blockIdx.y*blockDim.y + threadIdx.y;
const int j = blockIdx.x*blockDim.x + threadIdx.x;
if(i < m && j < n){
dmat[i*n + j] = 1;
}
}
__global__ void transp_kernel(int *dmat, const int m, const int n){
const int i = blockIdx.x*blockDim.x + threadIdx.y;
const int j = blockIdx.y*blockDim.y + threadIdx.x;
if(i < m && j < n){
dmat[i*n + j] = 1;
}
}
int verify(int *hmat, const int m, const int n){
printf("verifying correctness........."); fflush(stdout);
for(int i=0; i<m*n; ++i){
if(hmat[i] != 1){
fprintf(stderr, "Incorrect value at m[%i,%i] = %i\n", i/n, i%n);
return 0;
}
}
printf("ok\n"); fflush(stdout);
return 1;
}
int main(int argc, char **argv){
if(argc != 5){
printf("\nrun as ./prog m n r t\n\nr = number of repeats\nt = transpose (1 or 0)\n");
exit(EXIT_FAILURE);
}
const int m = atoi(argv[1]);
const int n = atoi(argv[2]);
const int r = atoi(argv[3]);
const int t = atoi(argv[4]);
const unsigned int size = m*n;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
float time;
int *hmat, *dmat;
printf("init data....................."); fflush(stdout);
hmat = (int*)malloc(sizeof(int)*(size));
for(int i=0; i<size; ++i){
hmat[i] = 0;
}
printf("done: zero matrix of %i rows x %i cols\n", m, n);
printf("copy data to GPU.............."); fflush(stdout);
cudaMalloc(&dmat, sizeof(int)*(size));
cudaMemcpy(dmat, hmat, sizeof(int)*(size), cudaMemcpyHostToDevice);
printf("done\n");
printf("preparing grid................"); fflush(stdout);
dim3 block(BSIZE2D, BSIZE2D, 1);
dim3 grid;
// if transpose or not
if(t){
grid = dim3((m + BSIZE2D - 1)/BSIZE2D, (n + BSIZE2D - 1)/BSIZE2D, 1);
}
else{
grid = dim3((n + BSIZE2D - 1)/BSIZE2D, (m + BSIZE2D - 1)/BSIZE2D, 1);
}
printf("done: block(%i, %i, %i), grid(%i, %i, %i)\n", block.x, block.y, block.z, grid.x, grid.y, grid.z);
if(t){
printf("transp_kernel (%3i rep).......", r); fflush(stdout);
cudaEventRecord(start, 0);
for(int i=0; i<r; ++i){
transp_kernel<<<grid, block>>>(dmat, m, n);
cudaDeviceSynchronize();
}
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop); // that's our time!
printf("done: %f ms\n", time/(float)r);
}
else{
printf("normal_kernel (%3i rep).......", r); fflush(stdout);
cudaEventRecord(start, 0);
for(int i=0; i<r; ++i){
normal_kernel<<<grid, block>>>(dmat, m, n);
cudaDeviceSynchronize();
}
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop); // that's our time!
printf("done: %f ms\n", time/(float)r);
}
cudaMemcpy(hmat, dmat, sizeof(int)*size, cudaMemcpyDeviceToHost);
verify(hmat, m, n);
exit(EXIT_SUCCESS);
}
Run Code Online (Sandbox Code Playgroud)
由于我找不到任何关于该主题的文献,所以这里是我的猜测解释,而不是基于经验(我的记忆阅读速度的老问题)。
正如您所写,您的示例保留了内存合并模式,但仅在扭曲级别(连续 32 个线程)上完成。但是为了实现全速,需要在扭曲间级别上进行合并 - 这里的原因尚不清楚,这种合并是否确实完成了,或者缓存和内存在这种情况下工作得更好(可能正如这里所描述的,我们可以更好地利用内存突发模式)。
因此,在您的normal_kernel执行中,不仅合并单个扭曲,而且还合并来自下一个块的扭曲。
为了在您的示例中进行检查,我修改了您的代码以使用不同的块大小,以下是我在 1080Ti 上的结果:
块大小 (32, 32) 与您的相同:
~$ ./prog 10240 10240 100 0
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(320, 320, 1)
normal_kernel (100 rep).......done: 1.020545 ms
verifying correctness.........ok
~$ ./prog 10240 10240 100 1
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(320, 320, 1)
transp_kernel (100 rep).......done: 1.564084 ms
verifying correctness.........ok
Run Code Online (Sandbox Code Playgroud)
不幸的是,块大小 (64, 16) 我们无法创建 64,64,因为 #threads 限制在一个块中:
~$ ./prog 10240 10240 100 0
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(64, 16, 1), grid(160, 640, 1)
normal_kernel (100 rep).......done: 1.020420 ms
verifying correctness.........ok
~$ ./prog 10240 10240 100 1
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(64, 16, 1), grid(160, 640, 1)
transp_kernel (100 rep).......done: 1.205506 ms
verifying correctness.........ok
Run Code Online (Sandbox Code Playgroud)
块大小(128、8):
~$ ./prog 10240 10240 100 0
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(128, 8, 1), grid(80, 1280, 1)
normal_kernel (100 rep).......done: 1.019547 ms
verifying correctness.........ok
~$ ./prog 10240 10240 100 1
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(128, 8, 1), grid(80, 1280, 1)
transp_kernel (100 rep).......done: 1.058236 ms
verifying correctness.........ok
Run Code Online (Sandbox Code Playgroud)
我不确定这是否有助于解决您的特定问题,但至少我们有更多数据可以讨论。