NVCC 不会展开小的嵌套循环

dar*_*ari 4 cuda

我想知道,为什么 NVCC 无法为小矩阵 (N=4) 展开以下 Cholesky 分解内核。

template<typename T, int N>
__device__ inline
void choleskyKernel2(T* C){
#pragma unroll
    for (int i = 0; i < N; i++){
#pragma unroll
        for (int j = 0; j <= i; j++) {
            double s = 0;
#pragma unroll
            for (int k = 0; k < j; k++){
                s += C[i*N+k] * C[j*N+k];
            }
            s = C[i*N+j] - s;
            C[i*N+j] = (i == j) ?
                        sqrt(s) :
                        (1.0 / C[j*N+j] * (s));
        }
    }
}
Run Code Online (Sandbox Code Playgroud)

生成的 PTX 代码如下所示:

sqrt.rn.f64     %fd12, %fd29;
    st.local.f64    [%rd1], %fd12;
    rcp.rn.f64  %fd34, %fd12;
    mul.f64     %fd13, %fd30, %fd34;
    st.local.f64    [%rd1+32], %fd13;
    fma.rn.f64  %fd35, %fd13, %fd13, 0d0000000000000000;
    sub.f64     %fd36, %fd31, %fd35;
    sqrt.rn.f64     %fd14, %fd36;
    st.local.f64    [%rd1+40], %fd14;
    mul.f64     %fd15, %fd32, %fd34;
    st.local.f64    [%rd1+64], %fd15;
    ld.local.f64    %fd37, [%rd1+32];
    fma.rn.f64  %fd38, %fd15, %fd37, 0d0000000000000000;
    sub.f64     %fd39, %fd33, %fd38;
    rcp.rn.f64  %fd40, %fd14;
    mul.f64     %fd16, %fd39, %fd40;
    st.local.f64    [%rd1+72], %fd16;
    mov.f64     %fd58, 0d0000000000000000;
    mov.u32     %r58, -2;
    mov.u64     %rd40, -8;

BB1_5:
    shl.b64     %rd23, %rd40, 3;
    sub.s64     %rd24, %rd1, %rd23;
    ld.local.f64    %fd41, [%rd24];
    fma.rn.f64  %fd58, %fd41, %fd41, %fd58;
    add.s64     %rd40, %rd40, -1;
    add.s32     %r58, %r58, 1;
    setp.ne.s32 %p3, %r58, 0;
    @%p3 bra    BB1_5;

    sub.f64     %fd43, %fd6, %fd58;
    sqrt.rn.f64     %fd19, %fd43;
    st.local.f64    [%rd1+80], %fd19;
    mul.f64     %fd20, %fd8, %fd34;
    st.local.f64    [%rd1+96], %fd20;
    ld.local.f64    %fd45, [%rd1+32];
    fma.rn.f64  %fd46, %fd20, %fd45, 0d0000000000000000;
    sub.f64     %fd47, %fd9, %fd46;
    mul.f64     %fd21, %fd47, %fd40;
    st.local.f64    [%rd1+104], %fd21;
    mov.f64     %fd59, 0d0000000000000000;
    mov.u32     %r59, -2;
    mov.u64     %rd41, %rd1;

BB1_7:
    mov.u64     %rd5, %rd41;
    ld.local.f64    %fd49, [%rd5+64];
    ld.local.f64    %fd50, [%rd5+96];
    fma.rn.f64  %fd59, %fd50, %fd49, %fd59;
    add.s64     %rd6, %rd5, 8;
    add.s32     %r59, %r59, 1;
    setp.ne.s32 %p4, %r59, 0;
    mov.u64     %rd41, %rd6;
    @%p4 bra    BB1_7;

    sub.f64     %fd52, %fd10, %fd59;
    rcp.rn.f64  %fd53, %fd19;
    mul.f64     %fd24, %fd52, %fd53;
    st.local.f64    [%rd1+112], %fd24;
    mov.f64     %fd60, 0d0000000000000000;
    mov.u32     %r60, -3;
    mov.u64     %rd42, -12;

BB1_9:
    shl.b64     %rd26, %rd42, 3;
    sub.s64     %rd27, %rd1, %rd26;
    ld.local.f64    %fd54, [%rd27];
    fma.rn.f64  %fd60, %fd54, %fd54, %fd60;
    add.s64     %rd42, %rd42, -1;
    add.s32     %r60, %r60, 1;
    setp.ne.s32 %p5, %r60, 0;
    @%p5 bra    BB1_9;
Run Code Online (Sandbox Code Playgroud)

显然循环没有展开并且使用了昂贵的本地内存(我的目标是只在寄存器中做所有事情)。

我像这样调用函数:

    T l[N*N];
    for(int i = 0; i < N*N; ++i){
        l[i] = buffer[offset+i];
    }

    choleskyKernel2<T,N>(l);

    for(int i = 0; i < N*N; ++i){
        buffer[offset+i] =  l[i];
    }
Run Code Online (Sandbox Code Playgroud)

有没有办法正确展开这个循环,以便一切都可以在寄存器中完成?

编辑:

完整代码:

#include <thrust/device_vector.h>

template<typename T, int N>
__device__ inline
void choleskyKernel2(T* C){
#pragma unroll
    for (int i = 0; i < N; i++){
#pragma unroll
        for (int j = 0; j <= i; j++) {
            double s = 0;
#pragma unroll
            for (int k = 0; k < j; k++){
                s += C[i*N+k] * C[j*N+k];
            }
            s = C[i*N+j] - s;
            C[i*N+j] = (i == j) ?
                        sqrt(s) :
                        (1.0 / C[j*N+j] * (s));
        }
    }
}

template<typename T, int N>
__global__ static
void test3(T* buffer){
    const int matrixElements = N * N;

    T l[matrixElements];
    for(int i = 0; i < matrixElements; ++i){
        l[i] = buffer[i];
    }

    choleskyKernel2<T,N>(l);

    for(int i = 0; i < matrixElements; ++i){
        buffer[i] =  l[i];
    }
}

int main(){
     thrust::device_vector<double> d_data(16);
     test3<double,4> <<< 1,1 >>>(thrust::raw_pointer_cast(d_data.data()));
}
Run Code Online (Sandbox Code Playgroud)

ter*_*era 6

While I can't tell you why nvcc (or indeed cicc which performs the device code compilation on behalf of nvcc) doesn't unroll your loops, I can show you how to change the code so that it does.

Turn

#pragma unroll
    for (int i = 0; i < N; i++){
#pragma unroll
        for (int j = 0; j <= i; j++) {
Run Code Online (Sandbox Code Playgroud)

into

#pragma unroll
    for (int i = 0; i < N; i++) {
#pragma unroll
        for (int j = 0; j < N; j++)
            if (j <= i) {
Run Code Online (Sandbox Code Playgroud)

and you will find all loops unrolled with no use of any local memory.

This is even though you didn't ask for the load and store loops to be unrolled. In fact, with my change above you don't need any #pragma unroll directives at all.