为什么CUDA共享内存初始化为零?

lik*_*ern 2 c cuda gpu gpgpu nvidia

正如"共享内存数组的默认值"这个问题中所述,共享内存不会被初始化,也就是说它可以包含任意值。

#include <stdio.h>

#define BLOCK_SIZE 512

// Debug kernel: a single thread (thread 0 of block 0) prints every element of
// a statically sized __shared__ int array that this kernel never writes to.
// The values printed are therefore whatever the array holds on entry.
//
// input, output, len: unused by this kernel.
__global__ void scan(float *input, float *output, int len) {
    __shared__ int data[BLOCK_SIZE];

    // Only one thread in the whole grid reports; everyone else exits at once.
    const bool isReporter = (threadIdx.x == 0) && (blockIdx.x == 0);
    if (!isReporter)
        return;

    printf("Block Number: %d\n", blockIdx.x);

    // Dump the array in index order without modifying it.
    int idx = 0;
    while (idx < BLOCK_SIZE) {
        printf("DATA[%d] = %d\n", idx, data[idx]);
        ++idx;
    }
}

// Launches the `scan` debug kernel on a grid of 10 blocks with BLOCK_SIZE
// threads each and waits for it to finish so the device-side printf output is
// flushed.
//
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main(int argc, char ** argv) {
    dim3 block(BLOCK_SIZE, 1, 1);
    dim3 grid(10, 1, 1);

    // The kernel ignores its arguments. `len` is an int, so pass 0 instead of
    // the pointer constant NULL.
    scan<<<grid, block>>>(NULL, NULL, 0);

    // A kernel launch does not return a status; configuration errors are
    // reported through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "scan launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Errors raised while the kernel runs surface at the next synchronizing
    // call; this also flushes the device printf buffer.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "scan execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    return 0;
}
Run Code Online (Sandbox Code Playgroud)

但为什么在这段代码中情况并非如此,我读到的共享内存总是全为零?

DATA[0] = 0
DATA[1] = 0
DATA[2] = 0
DATA[3] = 0
DATA[4] = 0
DATA[5] = 0
DATA[6] = 0
...
Run Code Online (Sandbox Code Playgroud)

我分别在发布和调试模式下进行了测试:"-O3 -arch=sm_20"、"-O3 -arch=sm_30" 和 "-arch=sm_30"。结果总是一样的。

Jac*_*ern 9

tl;dr:共享内存不会被初始化为 0

我认为您关于"共享内存被初始化为 0"的猜想值得怀疑。请尝试以下代码,它是对您的代码稍作修改后的版本。在这里,我调用内核两次,并修改 data 数组的值。内核第一次启动时,data 中"未初始化"的值全部为 0;内核第二次启动时,data 中"未初始化"的值将不再全是 0。

我认为这是因为共享内存是 SRAM,而 SRAM 具有数据残留(data remanence)特性。

#include <stdio.h>

#define BLOCK_SIZE 32

// Debug kernel: thread 0 of block 0 walks a statically sized __shared__ int
// array, printing each element as found on entry and then overwriting it with
// its own index. All other threads do nothing.
//
// input, output, len: unused by this kernel.
__global__ void scan(float *input, float *output, int len) {

    __shared__ int data[BLOCK_SIZE];

    // Every thread except the single reporter leaves immediately.
    if (threadIdx.x != 0 || blockIdx.x != 0)
        return;

    int slot = 0;
    while (slot < BLOCK_SIZE) {
        // Print first, so the value shown is the one present before this
        // launch touched the element...
        printf("DATA[%d] = %d\n", slot, data[slot]);
        // ...then store the index into the element.
        data[slot] = slot;
        ++slot;
    }
}

// Launches the `scan` debug kernel twice in a row on a grid of 10 blocks with
// BLOCK_SIZE threads each, waits for both launches to finish, then blocks on
// getchar() so the console output stays visible.
//
// Returns 0 on success, 1 if a launch or the kernel execution failed.
int main(int argc, char ** argv) {
    dim3 block(BLOCK_SIZE, 1, 1);
    dim3 grid(10, 1, 1);

    // Two back-to-back launches: the first prints and overwrites the shared
    // array, the second prints what it finds on entry.
    for (int launch = 0; launch < 2; ++launch) {
        // The kernel ignores its arguments. `len` is an int, so pass 0
        // instead of the pointer constant NULL.
        scan<<<grid, block>>>(NULL, NULL, 0);

        // A kernel launch does not return a status; configuration errors are
        // reported through cudaGetLastError().
        cudaError_t launchErr = cudaGetLastError();
        if (launchErr != cudaSuccess) {
            fprintf(stderr, "scan launch %d failed: %s\n", launch,
                    cudaGetErrorString(launchErr));
            return 1;
        }
    }

    // Errors raised while the kernels run surface at the next synchronizing
    // call; this also flushes the device printf buffer.
    cudaError_t syncErr = cudaDeviceSynchronize();
    if (syncErr != cudaSuccess) {
        fprintf(stderr, "scan execution failed: %s\n",
                cudaGetErrorString(syncErr));
        return 1;
    }

    // Keep the console window open until the user presses a key.
    getchar();
    return 0;
}
Run Code Online (Sandbox Code Playgroud)