NVIDIA CUDA编译器自动循环展开

Dre*_*dok 4 cuda

我不知道NVCC是否足够聪明,可以在这样的循环中自动公开指令级并行性(ILP):

for (int i = 0; i < 8; i++) {
   if (somethingHappens) {
       someVar = someVar & 1 << i;
   }
}
Run Code Online (Sandbox Code Playgroud)

还是应该将其重写为像这样显式公开ILP:

char somevar[8];
for (int i = 0; i < 8; i++) {
       if (somethingHappens) {
           someVar[i] = 1 << i;
       }
    }
//reduce somevar using vaddus4 and 3 logical-ands
Run Code Online (Sandbox Code Playgroud)

其他问题:

  • 开普勒的算术流水线有多深?
  • 我如何才能有效地采取措施,知道这种优化是否值得?在块之前和之后读取时钟寄存器是否足够?

Jac*_*ern 5

为了回答您的问题,我正在考虑四个不同的内核,其中每个线程forn_loop迭代中执行循环。四个内核实现了四种可能的情况:

  1. 迭代次数n_loop在编译时是已知的。
  2. 迭代次数n_loop在编译时是已知的,并且求和是有条件的。
  3. 迭代次数n_loop在运行时已知。
  4. n_loop在运行时已知迭代次数,并执行手动循环展开。

完整的代码如下:

#include <stdio.h>
#include <time.h>

#define BLOCKSIZE 512

#define epsilon 0.5
#define n_loop  8

/**********/
/* iDivUp */
/**********/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/****************************************************/
/* KERNEL #1: NUMBER OF LOOPS KNOWN AT COMPILE-TIME */
/****************************************************/
__global__ void testKernel1(float* input, float* output, int N) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {

        float accum = 0.f;

        for (int i = 0; i < n_loop; i++) { 
            accum = accum + input[n_loop*tid+i];
        }

        output[tid] = accum;

    }

}

/****************************************************/
/* KERNEL #2: NUMBER OF LOOPS KNOWN AT COMPILE-TIME */
/****************************************************/
__global__ void testKernel2(float* input, float* output, int N) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {

        float accum = 0.f;

        for (int i = 0; i < n_loop; i++) if (input[n_loop*tid+i] < epsilon) accum = accum + input[n_loop*tid+i];

        output[tid] = accum;

    }

}

/************************************************/
/* KERNEL #3: NUMBER OF LOOPS KNOWN AT RUN-TIME */
/************************************************/
__global__ void testKernel3(float* input, float* output, int N_loop, int N) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {

        float accum = 0.f;

        for (int i = 0; i < N_loop; i++) accum = accum + input[N_loop*tid+i];

        output[tid] = accum;

    }

}

/*******************************************************************/
/* KERNEL #4: NUMBER OF LOOPS KNOWN AT RUN-TIME - LOOP UNROLL OF 4 */
/*******************************************************************/
__global__ void testKernel4(float* input, float* output, int N_loop, int N) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {

        float accum1 = 0.f;
        float accum2 = 0.f;
        float accum3 = 0.f;
        float accum4 = 0.f;

        for (int i = 0; i < N_loop/4; i++) {
            accum1 = accum1 + input[N_loop*tid+i];
            accum2 = accum2 + input[N_loop*tid+i+N_loop/4];
            accum3 = accum3 + input[N_loop*tid+i+2*N_loop/4];
            accum4 = accum4 + input[N_loop*tid+i+3*N_loop/4];
        }

        output[tid] = accum1 + accum2 + accum3 + accum4;

    }

}

int main() {

    const int N = 512*512*32;

    float* input    = (float*) malloc(n_loop*N*sizeof(float));
    float* output   = (float*) malloc(N*sizeof(float));
    float* output2  = (float*) malloc(N*sizeof(float));
    float* outputif = (float*) malloc(N*sizeof(float));

    float* d_input;     gpuErrchk(cudaMalloc((void**)&d_input, n_loop*N*sizeof(float)));
    float* d_output;    gpuErrchk(cudaMalloc((void**)&d_output, N*sizeof(float)));

    srand(time(NULL));
    for (int i=0; i<n_loop*N; i++) input[i] = rand() / (float)RAND_MAX; 

    gpuErrchk(cudaMemcpy(d_input, input, n_loop*N*sizeof(float), cudaMemcpyHostToDevice));

    // --- Host-side computations
    for (int k = 0; k < N; k++) {
        float accum1 = 0.f;
        float accum2 = 0.f;
        for (int i = 0; i < n_loop; i++) {
            accum1 = accum1 + input[n_loop*k+i];
            if (input[n_loop*k+i] < epsilon) accum2 = accum2 + input[n_loop*k+i];
        }
        output[k] = accum1;
        outputif[k] = accum2;
    }

    // --- Device-side computation - kernel1
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    testKernel1<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Kernel1 elapsed time:  %3.4f ms \n", time);

    gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Check CPU and GPU results
    for (int i=0; i<N; i++)
        if (output[i] != output2[i]) {
            printf("Mismatch at i = %d, Host= %f, Device = %f\n", i, output[i], output2[i]);
            return 1;
        }
    printf("kernel1: results match!\n");

    // --- Device-side computation - kernel2
    cudaEventRecord(start, 0);

    testKernel2<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Kernel1 elapsed time:  %3.4f ms \n", time);

    gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Check CPU and GPU results
    for (int i=0; i<N; i++)
        if (outputif[i] != output2[i]) {
            printf("Mismatch at i = %d, Host= %f, Device = %f\n", i, outputif[i], output2[i]);
            return 1;
        }
    printf("kernel2: results match!\n");

    // --- Device-side computation - kernel3
    cudaEventRecord(start, 0);

    testKernel3<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, n_loop, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Kernel3 elapsed time:  %3.4f ms \n", time);

    gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Check CPU and GPU results
    for (int i=0; i<N; i++)
        if (output[i] != output2[i]) {
            printf("Mismatch at i = %d, Host= %f, Device = %f\n", i, output[i], output2[i]);
            return 1;
        }
    printf("kernel3: results match!\n");

    // --- Device-side computation - kernel4
    cudaEventRecord(start, 0);

    testKernel4<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, n_loop, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Kernel4 elapsed time:  %3.4f ms \n", time);

    gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Check CPU and GPU results
    for (int i=0; i<N; i++)
        if (abs(output[i] - output2[i]) > 0.0001) {
            printf("Mismatch at i = %d, Host= %f, Device = %f, difference = %f\n", i, output[i], output2[i], output2[i] - output[i]);
            return 1;
            }
    printf("kernel4: results match!\n");

    return 0;

}
Run Code Online (Sandbox Code Playgroud)

现在,让我们分析四种情况下的反汇编代码(与CUDA 6.0一起编译)。我正在考虑针对Fermi架构进行编译。

内核1

     MOV R1, c[0x1][0x100];
     S2R R0, SR_CTAID.X;
     IMUL R2, R0, c[0x0][0x8];
     S2R R3, SR_TID.X;
     IADD R0, R2, R3;
     ISETP.GE.AND P0, PT, R0, c[0x0][0x28], PT;
 @P0 BRA.U 0xd8;
@!P0 IADD R2, R3, R2;
@!P0 ISCADD R2, R2, c[0x0][0x20], 0x5; 
@!P0 ISCADD R0, R0, c[0x0][0x24], 0x2;
@!P0 LD R9, [R2];
@!P0 LD R8, [R2+0x4];
@!P0 LD R7, [R2+0x8];
@!P0 LD R6, [R2+0xc];
@!P0 LD R5, [R2+0x10];
@!P0 LD R4, [R2+0x14];
@!P0 LD R3, [R2+0x18];
@!P0 LD R2, [R2+0x1c];
@!P0 F2F.F32.F32 R9, R9;
@!P0 FADD R8, R9, R8;
@!P0 FADD R7, R8, R7;
@!P0 FADD R6, R7, R6;
@!P0 FADD R5, R6, R5;
@!P0 FADD R4, R5, R4;
@!P0 FADD R3, R4, R3;
@!P0 FADD R2, R3, R2;
@!P0 ST [R0], R2;
     EXIT;
Run Code Online (Sandbox Code Playgroud)

在这种情况下,编译器将完全展开循环。您将看到8不同的load(LD)指令和7不同的add(FADD)指令。

内核2

    MOV R1, c[0x1][0x100];
    S2R R0, SR_CTAID.X;
    IMUL R0, R0, c[0x0][0x8];
    S2R R2, SR_TID.X;
    IADD R3, R0, R2;
    ISETP.GE.AND P0, PT, R3, c[0x0][0x28], PT;
@P0 EXIT;
    IADD R0, R2, R0;
    ISCADD R9, R0, c[0x0][0x20], 0x5;
    LD R0, [R9];
    LD R2, [R9+0x4];
    LD R4, [R9+0x8];
    LD R5, [R9+0xc];
    LD R6, [R9+0x10];
    LD R7, [R9+0x14];
    LD R8, [R9+0x18];
    LD R9, [R9+0x1c];
    FSETP.LT.AND P0, PT, R0, 0.5, PT;
    FSETP.LT.AND P1, PT, R4, 0.5, PT;
    F2F.F32.F32 R0, R0;
    SEL R0, R0, RZ, P0;
    FSETP.LT.AND P0, PT, R2, 0.5, PT;
@P0 FADD R0, R0, R2;
    FSETP.LT.AND P0, PT, R5, 0.5, PT;
@P1 FADD R0, R0, R4;
@P0 FADD R0, R0, R5;
    FSETP.LT.AND P1, PT, R8, 0.5, PT;
    FSETP.LT.AND P0, PT, R6, 0.5, PT;
    FADD R2, R0, R6;
    SEL R2, R2, R0, P0;
    FSETP.LT.AND P0, PT, R7, 0.5, PT;
    ISCADD R0, R3, c[0x0][0x24], 0x2;
@P0 FADD R2, R2, R7;
    FSETP.LT.AND P0, PT, R9, 0.5, PT;
@P1 FADD R2, R2, R8;
@P0 FADD R2, R2, R9;
    ST [R0], R2;
    EXIT;
Run Code Online (Sandbox Code Playgroud)

同样在这种情况下,编译器将完全展开循环。您将再次看到8不同的load(LD)指令和7不同的add(FADD)指令。

内核3

c[0x0][0x30]    = N
c[0x1][0x100]   = BLOCKSIZE
c[0x0][0x8]     = blockDim.x
c[0x0][0x30]    = N_loop
c[0x0][0x20]    = input

/*0000*/         MOV R1, c[0x1][0x100];                           R1 = BLOCKSIZE = 256
/*0008*/         S2R R0, SR_CTAID.X;                              R0 = blockIdx.x
/*0010*/         S2R R2, SR_TID.X;                                R2 = threadIdx.x
/*0018*/         IMAD R0, R0, c[0x0][0x8], R2;                    R0 = tid = blockIDx.x * blockDim.x + threadIdx.x
/*0020*/         ISETP.GE.AND P0, PT, R0, c[0x0][0x34], PT;       P0 = (tid >= N) then EXIT
/*0028*/     @P0 EXIT;
/*0030*/         ISETP.LT.AND P0, PT, RZ, c[0x0][0x30], PT;       P0 = (0 < N_loop)
/*0038*/     @P0 BRA 0x60;
/*0040*/         MOV R4, RZ;
/*0048*/         BRA 0x170;
/*0050*/         NOP;
/*0058*/         NOP;
/*0060*/         MOV R2, c[0x0][0x30];                            R2 = N_loop
/*0068*/         IMUL R3, R0, c[0x0][0x30];                       R3 = tid * N_loop
/*0070*/         MOV32I R6, 0x4;                                  R6 = sizeof(float) = 4
/*0078*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;                P0 = (R2 >= 3)
/*0080*/         IMAD R2.CC, R3, R6, c[0x0][0x20];                R2 = R3 * R6 + input = tid * N_loop * 4 + input
/*0088*/         MOV R4, RZ;                                      R4 = 0
/*0090*/         MOV R5, RZ;                                      R5 = 0
/*0098*/         IMAD.HI.X R3, R3, R6, c[0x0][0x24];              
/*00a0*/    @!P0 BRA 0x128;                               
/*00a8*/         MOV R6, c[0x0][0x30];                            R6 = N_loop
/*00b0*/         IADD R10, R6, -0x3;                              R10 = N_loop - 3
/*00b8*/         NOP;
/*00c0*/         IADD R5, R5, 0x4;                                R5 = R5 + 4 = 4                              
/*00c8*/         LD.E R6, [R2];                                   R6 = input[tid * N_loop]
/*00d0*/         ISETP.LT.AND P0, PT, R5, R10, PT;                P0 = (4 < (N_loop - 3))
/*00d8*/         LD.E R7, [R2+0x4];                               R7 = input[tid * N_loop + 1]
/*00e0*/         LD.E R8, [R2+0x8];                               R8 = input[tid * N_loop + 2]
/*00e8*/         LD.E R9, [R2+0xc];                               R9 = input[tid * N_loop + 3]
/*00f0*/         IADD R2.CC, R2, 0x10;                            R2 = R2 + 16 = R2 + 4 * sizeof(float)
/*00f8*/         IADD.X R3, R3, RZ;                               
/*0100*/         FADD R6, R4, R6;                                 R6 = 0 + input[tid * N_loop]
/*0108*/         FADD R4, R6, R7;                                 R4 = input[tid * N_loop] + input[tid * N_loop + 1]
/*0110*/         FADD R8, R4, R8;                                 R8 = input[tid * N_loop] + input[tid * N_loop + 1] + input[tid * N_loop + 2]
/*0118*/         FADD R4, R8, R9;                                 R4 = input[tid * N_loop] + input[tid * N_loop + 1] + input[tid * N_loop + 2] + input[tid * N_loop + 3]
/*0120*/     @P0 BRA 0xc0;                                        ...
/*0128*/         ISETP.LT.AND P0, PT, R5, c[0x0][0x30], PT;
/*0130*/    @!P0 BRA 0x170;
/*0138*/         IADD R5, R5, 0x1;
/*0140*/         LD.E R6, [R2];
/*0148*/         ISETP.LT.AND P0, PT, R5, c[0x0][0x30], PT;
/*0150*/         IADD R2.CC, R2, 0x4;
/*0158*/         IADD.X R3, R3, RZ;
/*0160*/         FADD R4, R4, R6;
/*0168*/     @P0 BRA 0x138;
/*0170*/         MOV32I R3, 0x4;
/*0178*/         IMAD R2.CC, R0, R3, c[0x0][0x28];
/*0180*/         IMAD.HI.X R3, R0, R3, c[0x0][0x2c];
/*0188*/         ST.E [R2], R4;
/*0190*/         EXIT;
Run Code Online (Sandbox Code Playgroud)

可以看出,编译器自动执行的循环展开4,如我所见,4加载操作(LD)和3不同的adds(FADD

内核4

/*0000*/         MOV R1, c[0x1][0x100];
/*0008*/         S2R R0, SR_CTAID.X;
/*0010*/         S2R R2, SR_TID.X;
/*0018*/         IMAD R13, R0, c[0x0][0x8], R2;
/*0020*/         ISETP.GE.AND P0, PT, R13, c[0x0][0x34], PT;
/*0028*/     @P0 EXIT;
/*0030*/         MOV R2, c[0x0][0x30];
/*0038*/         SHR R0, R2, 0x1f;
/*0040*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;
/*0048*/         IMAD.U32.U32.HI R0, R0, 0x4, R2;
/*0050*/         SHR R0, R0, 0x2;
/*0058*/     @P0 BRA 0x98;
/*0060*/         MOV R18, RZ;
/*0068*/         MOV R19, RZ;
/*0070*/         MOV R10, RZ;
/*0078*/         MOV R11, RZ;
/*0080*/         BRA 0x308;
/*0088*/         NOP;
/*0090*/         NOP;
/*0098*/         MOV R3, c[0x0][0x30];
/*00a0*/         IMUL R4, R13, c[0x0][0x30];
/*00a8*/         MOV32I R5, 0x4;
/*00b0*/         IMUL R2, R3, 0x3;
/*00b8*/         SHL R6, R3, 0x1;
/*00c0*/         IADD R10, R0, R4;
/*00c8*/         SHR R3, R2, 0x1f;
/*00d0*/         IMAD R8.CC, R4, R5, c[0x0][0x20];
/*00d8*/         SHR R7, R6, 0x1f;
/*00e0*/         IMAD.U32.U32.HI R2, R3, 0x4, R2;
/*00e8*/         IMAD.HI.X R9, R4, R5, c[0x0][0x24];
/*00f0*/         IMAD.U32.U32.HI R7, R7, 0x4, R6;
/*00f8*/         IMAD.HI R3, R2, c[0x10][0x0], R4;
/*0100*/         IMAD R6.CC, R10, R5, c[0x0][0x20];
/*0108*/         ISETP.GT.AND P0, PT, R0, 0x1, PT;
/*0110*/         IMAD.HI R14, R7, c[0x10][0x0], R4;
/*0118*/         MOV R18, RZ;
/*0120*/         IMAD.HI.X R7, R10, R5, c[0x0][0x24];
/*0128*/         MOV R19, RZ;
/*0130*/         IMAD R2.CC, R3, R5, c[0x0][0x20];
/*0138*/         MOV R10, RZ;
/*0140*/         IMAD.HI.X R3, R3, R5, c[0x0][0x24];
/*0148*/         MOV R11, RZ;
/*0150*/         IMAD R4.CC, R14, R5, c[0x0][0x20];
/*0158*/         MOV R12, RZ;
/*0160*/         IMAD.HI.X R5, R14, R5, c[0x0][0x24];
/*0168*/    @!P0 BRA 0x260;
/*0170*/         IADD R16, R0, -0x1;
/*0178*/         NOP;
/*0180*/         IADD R12, R12, 0x2;
/*0188*/         LD.E R15, [R8];
/*0190*/         ISETP.LT.AND P0, PT, R12, R16, PT;
/*0198*/         LD.E R20, [R6];
/*01a0*/         FADD R17, R18, R15;
/*01a8*/         LD.E R14, [R4];
/*01b0*/         FADD R19, R19, R20;
/*01b8*/         LD.E R15, [R2];
/*01c0*/         LD.E R18, [R8+0x4];
/*01c8*/         LD.E R20, [R6+0x4];
/*01d0*/         IADD R6.CC, R6, 0x8;
/*01d8*/         NOP;
/*01e0*/         FADD R14, R10, R14;
/*01e8*/         FADD R15, R11, R15;
/*01f0*/         IADD.X R7, R7, RZ;
/*01f8*/         LD.E R10, [R4+0x4];
/*0200*/         IADD R4.CC, R4, 0x8;
/*0208*/         LD.E R11, [R2+0x4];
/*0210*/         IADD.X R5, R5, RZ;
/*0218*/         FADD R18, R17, R18;
/*0220*/         IADD R2.CC, R2, 0x8;
/*0228*/         FADD R19, R19, R20;
/*0230*/         IADD.X R3, R3, RZ;
/*0238*/         IADD R8.CC, R8, 0x8;
/*0240*/         IADD.X R9, R9, RZ;
/*0248*/         FADD R10, R14, R10;
/*0250*/         FADD R11, R15, R11;
/*0258*/     @P0 BRA 0x180;
/*0260*/         ISETP.LT.AND P0, PT, R12, R0, PT;
/*0268*/    @!P0 BRA 0x308;
/*0270*/         IADD R12, R12, 0x1;
/*0278*/         LD.E R17, [R8];
/*0280*/         ISETP.LT.AND P0, PT, R12, R0, PT;
/*0288*/         LD.E R16, [R6];
/*0290*/         IADD R6.CC, R6, 0x4;
/*0298*/         LD.E R15, [R4];
/*02a0*/         IADD.X R7, R7, RZ;
/*02a8*/         LD.E R14, [R2];
/*02b0*/         IADD R4.CC, R4, 0x4;
/*02b8*/         IADD.X R5, R5, RZ;
/*02c0*/         IADD R2.CC, R2, 0x4;
/*02c8*/         IADD.X R3, R3, RZ;
/*02d0*/         IADD R8.CC, R8, 0x4;
/*02d8*/         IADD.X R9, R9, RZ;
/*02e0*/         FADD R18, R18, R17;
/*02e8*/         FADD R19, R19, R16;
/*02f0*/         FADD R10, R10, R15;
/*02f8*/         FADD R11, R11, R14;
/*0300*/     @P0 BRA 0x270;
/*0308*/         FADD R0, R18, R19;
/*0310*/         MOV32I R3, 0x4;
/*0318*/         FADD R0, R0, R10;
/*0320*/         IMAD R2.CC, R13, R3, c[0x0][0x28];
/*0328*/         FADD R0, R0, R11;
/*0330*/         IMAD.HI.X R3, R13, R3, c[0x0][0x2c];
/*0338*/         ST.E [R2], R0;
/*0340*/         EXIT;
Run Code Online (Sandbox Code Playgroud)

In this case, the compiler automatically performs a loop unroll of 4, which superimposes to the manual loop unroll of 4, as I see 8 load operations (LD) and 7 different adds (FADD).

Although the disassembled codes are different from those for the Fermi architecture, the compiler behavior is similar also for the Kepler architecture.

Due to the automatic loop unrolling capabilities, there is not much difference in performance between the different kernels:

GT 210 (c.c. 1.2)

Kernel 1 = 111ms
Kernel 2 = 108ms
Kernel 3 = 107ms
Kernel 4 = 110ms

Kepler K20c (c.c. 3.5)

Kernel 1 = 1.8ms
Kernel 2 = 1.8ms
Kernel 3 = 1.8ms
Kernel 4 = 1.8ms
Run Code Online (Sandbox Code Playgroud)

I'm not explictly providing results for the Fermi architecture, but the timing is approximately the same for the four considered kernels.