CUDA 9 shfl vs. shfl_sync

dar*_*ari 5 cuda

自CUDA 9起,shfl指令已弃用,应由shfl_sync替换.

但是,既然它们的行为并不相同,我该如何进行替换呢?

代码示例:

// Demo kernel from the question: compares the legacy __shfl with __shfl_sync.
// Each thread builds value = tid + 0.1f, views its bits as an int, and asks
// for lane 5's copy through both intrinsics; thread 0 prints the two results.
__global__
static void shflTest(){
    int tid = threadIdx.x;
    float value = tid + 0.1f;
    // View the float's bits as an int so the int form of the shuffle is used.
    int* ivalue = reinterpret_cast<int*>(&value);

    // Use the integer shfl: request lane 5's bits with a width of 32.
    int ix = __shfl(ivalue[0],5,32);
    // NOTE(review): __shfl_sync takes (mask, var, srcLane, width). No mask is
    // passed here, so ivalue[0] lands in the mask slot, 5 becomes the value
    // being shuffled and 32 the source lane. This is the defect under discussion.
    int iy = __shfl_sync(ivalue[0],5,32);

    // Reinterpret the shuffled bits back as float.
    float x = reinterpret_cast<float*>(&ix)[0];
    float y = reinterpret_cast<float*>(&iy)[0];

    // Only thread 0 reports, first as raw int bits and then as floats.
    if(tid == 0){
        printf("shfl tmp %d %d\n",ix,iy);
        printf("shfl final %f %f\n",x,y);
    }
}

// Launches shflTest on one block of 32 threads and waits for it to finish.
int main()
{
    shflTest<<<1,32>>>();
    // Wait for the kernel to complete before exiting; the returned status is
    // not inspected here.
    cudaDeviceSynchronize();
    return 0;
}
Run Code Online (Sandbox Code Playgroud)

输出:

shfl tmp 1084437299 5
shfl final 5.100000 0.000000
Run Code Online (Sandbox Code Playgroud)

Rob*_*lla 11

如果您阅读随CUDA 9RC一起安装的CUDA 9RC编程指南(B.15节),就会看到新的__shfl_sync()函数多了一个mask参数,而您的代码没有考虑到这个参数:

CUDA 8:

int __shfl(int var, int srcLane, int width=warpSize);
Run Code Online (Sandbox Code Playgroud)

CUDA 9:

T __shfl_sync(unsigned mask, T var, int srcLane, int width=warpSize);
              ^^^^^^^^^^^^^
Run Code Online (Sandbox Code Playgroud)

指南中还说明了对该掩码参数的要求:

新的*_sync shfl内在函数接受一个掩码,用于指明参与调用的线程.每个参与线程都必须在掩码中置位与其通道ID(lane ID)对应的那一位,以确保这些线程在硬件执行该内在函数之前已正确收敛.掩码中指明的所有未退出线程必须使用相同的掩码执行同一个内在函数,否则结果是未定义的.

因此,如果我们修改您的代码以符合这一点,我们会得到预期的结果:

$ cat t419.cu
#include <stdio.h>

__global__  // demo kernel: legacy __shfl vs. __shfl_sync called with an explicit mask
static void shflTest(int lid){  // lid: index of the thread that prints the results
    int tid = threadIdx.x;
    float value = tid + 0.1f;
    int* ivalue = reinterpret_cast<int*>(&value);  // view the float's bits as int

    // use the integer shfl: both calls request lane 5's bits with width 32
    int ix = __shfl(ivalue[0],5,32);  // deprecated legacy form (no mask argument)
    int iy = __shfl_sync(0xFFFFFFFF, ivalue[0],5,32);  // mask has all 32 lane bits set

    float x = reinterpret_cast<float*>(&ix)[0];  // shuffled bits back to float
    float y = reinterpret_cast<float*>(&iy)[0];

    if(tid == lid){  // only the selected thread reports
        printf("shfl tmp %d %d\n",ix,iy);
        printf("shfl final %f %f\n",x,y);
    }
}

// Launches shflTest on a single 32-thread block (thread 0 prints) and reports
// any CUDA launch or execution error instead of silently returning success.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main()
{
    shflTest<<<1,32>>>(0);

    // A kernel launch returns no status itself; launch errors are picked up here.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        // Errors raised while the kernel runs surface at the next synchronizing call.
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
$ nvcc -arch=sm_61 -o t419 t419.cu
t419.cu(10): warning: function "__shfl(int, int, int)"
/usr/local/cuda/bin/..//include/sm_30_intrinsics.hpp(152): here was declared deprecated ("__shfl() is deprecated in favor of __shfl_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning).")

$ cuda-memcheck ./t419
========= CUDA-MEMCHECK
shfl tmp 1084437299 1084437299
shfl final 5.100000 5.100000
========= ERROR SUMMARY: 0 errors
$
Run Code Online (Sandbox Code Playgroud)

  • 即使宽度不是32,把所有`shfl_xx(...)`都替换为`shfl_xx_sync(0xFFFFFFFF,...)`是否安全? (4认同)