The correct way to synchronize multiple CUDA blocks

Asked by Tia*_*ian (tags: synchronization, cuda, block)

I would like to synchronize multiple CUDA blocks with each other, not just the threads within a single block, which __syncthreads() handles easily.

I have seen existing discussions on this topic, e.g. CUDA block synchronization, and I like the simple solution proposed by @johan (https://stackoverflow.com/a/67252761/3188690), which essentially uses a 64-bit counter to track how many blocks have reached the barrier.
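For reference, here is a minimal sketch of how I understand that counter idea (my own paraphrase, not @johan's exact code; barrier_state and barrier_for_grid are made-up names). The high 32 bits hold a generation number and the low 32 bits count arrivals, so re-entering the barrier cannot be confused with the previous round:

__device__ unsigned long long barrier_state = 0;

__device__ void barrier_for_grid(unsigned int numBlocks){
    __syncthreads();
    __threadfence(); // make this block's prior writes visible to other blocks

    if (threadIdx.x == 0){
        unsigned long long old = atomicAdd(&barrier_state, 1ULL);
        unsigned long long gen = old >> 32;

        if ((old & 0xffffffffULL) == numBlocks - 1){
            // last arrival: clear the count and bump the generation,
            // which releases all spinning blocks
            atomicExch(&barrier_state, (gen + 1) << 32);
        } else {
            // spin until the generation changes; atomicAdd(..., 0) forces
            // a coherent read instead of a possibly stale cached one
            while ((atomicAdd(&barrier_state, 0) >> 32) == gen){}
        }
        __threadfence();
    }
    __syncthreads();
}

Like any software barrier of this kind, it deadlocks unless all participating blocks are resident on the GPU at the same time (see the scheduling caveat in the answer below).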

However, I wrote the following code to attempt something similar and ran into problems. Here I use the term environment, such that the wkBlocksPerEnv blocks belonging to one environment synchronize with each other. Each environment has a counter. I use atomicAdd() to count how many blocks have synchronized themselves; once the number of synchronized blocks == wkBlocksPerEnv, I know all blocks of that environment have finished synchronizing and are free to continue. However, it behaves strangely and I am not sure why.

The problem is with the while loop. Since the first thread of every block performs the atomicAdd, a while loop then spins until the condition is met. But I found that some blocks get stuck in an infinite loop, and I am not sure why the condition can never be satisfied. If I print some message at *** I can print here 1 or *** I can print here 2, there is no infinite loop and everything works perfectly. I do not see anything obvious.

const int wkBlocksPerEnv = 2;

__device__ int env_sync_block_count[wkNumberEnvs];

__device__ void syncthreads_for_env(){
    // sync all threads within this block so they have finished the previous tasks
    __syncthreads();

    // sync the wkBlocksPerEnv blocks belonging to this environment
    if(wkBlocksPerEnv > 1){
        const int kThisEnvId = get_env_scope_block_id(blockIdx.x);

        if (threadIdx.x == 0){
            // increment env_sync_block_count by 1
            atomicAdd(&env_sync_block_count[kThisEnvId], 1);
            // *** I can print here 1
            while(env_sync_block_count[kThisEnvId] != wkBlocksPerEnv){
                // *** I can print here 2
            }
        }
    }

    // Do the next job ...
}

Answer by 小智 (score: 5)

Your code has two potential issues: caching and block scheduling.

Caching can prevent you from observing updated values during the while loop.
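To illustrate with a hedged sketch (counter is a hypothetical flag that other blocks update via atomicAdd; this is not the asker's exact code):

__device__ int counter;

// May hang: the compiler can hoist the plain load out of the loop and
// spin forever on a stale register or cached value.
__device__ void spin_plain(int target){
    while (counter != target){}
}

// Pre-Volta workaround: a volatile load is re-issued to memory on every
// iteration, so updates made by other blocks are eventually observed.
__device__ void spin_volatile(int target){
    while (*(volatile int*)&counter != target){}
}

The load_arrived/store_arrived helpers in the code below generalize this: volatile accesses on pre-Volta devices, acquire/release PTX instructions on Volta and later.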

Block scheduling can cause a deadlock if blocks wait for updates from blocks that have not yet been scheduled. Since CUDA does not guarantee any particular scheduling order for blocks, the only way to prevent this deadlock is to limit the number of blocks in the grid so that all blocks can be resident on the GPU simultaneously. For example, if the device can execute at most 40 of these blocks concurrently but the grid launches 80, the resident blocks may spin forever waiting for blocks that will never be scheduled.

The following code shows how to synchronize multiple blocks while avoiding the problems above. I adapted it from the multi-grid synchronization code in the CUDA sample conjugateGradientMultiDeviceCG: https://github.com/NVIDIA/cuda-samples/blob/master/Samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu#L186

On pre-Volta devices it uses volatile memory accesses; on Volta and later it uses acquire/release semantics. The grid size is limited by querying the device properties.


#include <cassert>
#include <cstdio>

constexpr int wkBlocksPerEnv = 13;

__device__
int getEnv(int blockId){
    return blockId / wkBlocksPerEnv;
}

__device__
int getRankInEnv(int blockId){
    return blockId % wkBlocksPerEnv;
}

__device__ 
unsigned char load_arrived(unsigned char *arrived) {
#if __CUDA_ARCH__ < 700
    return *(volatile unsigned char *)arrived;
#else
    unsigned int result;
    asm volatile("ld.acquire.gpu.global.u8 %0, [%1];"
                 : "=r"(result)
                 : "l"(arrived)
                 : "memory");
    return result;
#endif
  }

__device__ 
void store_arrived(unsigned char *arrived,
                                unsigned char val) {
#if __CUDA_ARCH__ < 700
    *(volatile unsigned char *)arrived = val;
#else
    unsigned int reg_val = val;
    asm volatile(
        "st.release.gpu.global.u8 [%1], %0;" ::"r"(reg_val) "l"(arrived)
        : "memory");

    // Avoids compiler warnings from unused variable val.
    (void)(reg_val = reg_val);
#endif
  }

#if 0
//wrong implementation that does not synchronize; used to check that the kernel assert does trigger without proper synchronization
__device__ 
void syncthreads_for_env(unsigned char* temp){

}
#else
//temp must have at least size sizeof(unsigned char) * total_number_of_blocks in grid
__device__ 
void syncthreads_for_env(unsigned char* temp){
    __syncthreads();
    const int env = getEnv(blockIdx.x);
    const int blockInEnv = getRankInEnv(blockIdx.x);
    unsigned char* const mytemp = temp + env * wkBlocksPerEnv;

    if(threadIdx.x == 0){
        if(blockInEnv == 0){
            // Leader block waits for the others to join, then releases them.
            // Other blocks in the env can arrive in any order, so the leader
            // has to wait for all of them.
            for (int i = 0; i < wkBlocksPerEnv - 1; i++) {
                while (load_arrived(&mytemp[i]) == 0)
                    ;
            }
            for (int i = 0; i < wkBlocksPerEnv - 1; i++) {
                store_arrived(&mytemp[i], 0);
            }
            __threadfence();
        }else{
            // Other blocks in env note their arrival and wait to be released.
            store_arrived(&mytemp[blockInEnv - 1], 1);
            while (load_arrived(&mytemp[blockInEnv - 1]) == 1)
                ;
        }
    }

    __syncthreads();
}
#endif

__global__
void kernel(unsigned char* synctemp, int* array){
    const int env = getEnv(blockIdx.x);
    const int blockInEnv = getRankInEnv(blockIdx.x);

    if(threadIdx.x == 0){
        array[blockIdx.x] = 1;
    }

    syncthreads_for_env(synctemp);
    
    if(threadIdx.x == 0){
        int sum = 0;
        for(int i = 0; i < wkBlocksPerEnv; i++){
            sum += array[env * wkBlocksPerEnv + i];
        }
        assert(sum == wkBlocksPerEnv);
    }
}


int main(){
    
    const int smem = 0;
    const int blocksize = 128;

    int deviceId = 0;
    int numSMs = 0;
    int maxBlocksPerSM = 0;

    cudaGetDevice(&deviceId);
    cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, deviceId);
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &maxBlocksPerSM,
        kernel,
        blocksize, 
        smem
    );

    int maxBlocks = maxBlocksPerSM * numSMs;
    maxBlocks -= maxBlocks % wkBlocksPerEnv; //round down to nearest multiple of wkBlocksPerEnv
    printf("wkBlocksPerEnv %d, maxBlocks: %d\n", wkBlocksPerEnv, maxBlocks);

    int* d_array;
    unsigned char* d_synctemp;
    cudaMalloc(&d_array, sizeof(int) * maxBlocks);

    cudaMalloc(&d_synctemp, sizeof(unsigned char) * maxBlocks);
    cudaMemset(d_synctemp, 0, sizeof(unsigned char) * maxBlocks);

    kernel<<<maxBlocks, blocksize>>>(d_synctemp, d_array);
    // wait for the kernel so that a failing device-side assert is reported
    cudaDeviceSynchronize();

    cudaFree(d_synctemp);
    cudaFree(d_array);

    return 0;
}
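As an aside, if the goal is simply to synchronize all blocks of one grid (rather than per-environment groups of blocks as above), CUDA's cooperative groups API provides a built-in grid-wide barrier. A minimal sketch, assuming a device that supports cooperative launch and compiling with nvcc -rdc=true for an architecture of sm_60 or newer:

#include <cooperative_groups.h>

namespace cg = cooperative_groups;

__global__ void coop_kernel(int* data){
    cg::grid_group grid = cg::this_grid();

    // phase 1: each block writes its slot
    if (threadIdx.x == 0){
        data[blockIdx.x] = 1;
    }

    grid.sync(); // barrier across ALL blocks of the grid

    // phase 2: it is now safe to read what other blocks wrote
}

int main(){
    const int numBlocks = 13;
    int* d_data;
    cudaMalloc(&d_data, sizeof(int) * numBlocks);

    void* args[] = { &d_data };
    // a cooperative launch fails (rather than deadlocking) if the blocks
    // cannot all be resident on the device at the same time
    cudaLaunchCooperativeKernel((void*)coop_kernel, dim3(numBlocks),
                                dim3(128), args, 0, 0);
    cudaDeviceSynchronize();

    cudaFree(d_data);
    return 0;
}

Note that grid.sync() synchronizes the whole grid, so it over-synchronizes compared with the per-environment barrier above, but it avoids hand-rolled volatile/acquire-release code.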