在CUDA中使用SIMD实现位旋转运算符

Question

在CUDA中使用SIMD实现位旋转运算符

Sac*_*aki 6 c++ cuda simd bitwise-operators

我知道StackOverflow不是为了向其他人询问代码,而是让我说话.

我正在尝试在CUDA C++设备代码中实现一些AES函数.在尝试实现左侧按字节旋转运算符时,我感到不安的是看到没有原生的SIMD内向.所以我开始了一个天真的实现,但......它是巨大的,虽然我还没有尝试过,但由于昂贵的拆包/打包,它不会很快......所以,有什么意思吗每字节位旋转操作至少有些效率？

如果您不想看看,这是代码.

__inline__ __device__ uint32_t per_byte_bit_left_rotate(uint32_t input, uint8_t amount) {
return ((((input & 0xFF) >> 0) << amount) | (((input & 0xFF) >> 0) >> 7) & ~0x100) << 0 |
     ((((input & 0xFF00) >> 8) << amount) | ((input & 0xFF00 >> 8) >> 7) & ~0x100) << 8 |
     ((((input & 0xFF0000) >> 16) << amount) | ((input & 0xFF0000 >> 16) >> 7) & ~0x100) << 16 |
     ((((input & 0xFF000000) >> 24) << amount) | ((input & 0xFF000000 >> 24) >> 7) & ~0x100) << 24; } // The XORs are for clearing the old 7th bit who is getting pushed to the next byte of the intermediate int

Run Code Online (Sandbox Code Playgroud)

Answer 1

nju*_*ffa 6

CUDA具有__byte_perm()直接映射到PRMT机器代码(SASS)级别的指令的内在函数,该指令是字节式置换指令.它可用于有效地提取和合并字节.为了影响逐字节左旋转,我们可以将每个字节加倍,将字节对移位所需的量,然后提取并合并字节对的四个高字节.

对于按字节顺序旋转,我们只需要移位量的最低三位,因为旋转方向s与旋转方向相同s mod 8.为了提高效率,最好避免包含少于32位的整数类型,因为C++语义要求整数类型比在表达式中使用之前int要宽一些int.这可能并且确实会导致许多架构(包括GPU)上的转换开销.

PRMT指令的吞吐量取决于体系结构,因此使用__byte_perm()可能导致代码比使用另一个答案中演示的经典SIMD-in-a-register方法更快或更慢,因此请务必在您的上下文中进行基准测试.部署前的用例.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

__device__ uint32_t per_byte_bit_left_rotate (uint32_t input, uint32_t amount)
{
     uint32_t l = __byte_perm (input, 0, 0x1100) << (amount & 7);
     uint32_t h = __byte_perm (input, 0, 0x3322) << (amount & 7);
     return __byte_perm (l, h, 0x7531);
}

__global__ void rotl_kernel (uint32_t input, uint32_t amount, uint32_t *res)
{
    *res = per_byte_bit_left_rotate (input, amount);
}

uint32_t ref_per_byte_bit_left_rotate (uint32_t input, uint32_t amount)
{
   int s = amount & 7;
   uint8_t b0 = (input >>  0) & 0xff;
   uint8_t b1 = (input >>  8) & 0xff;
   uint8_t b2 = (input >> 16) & 0xff;
   uint8_t b3 = (input >> 24) & 0xff;
   b0 = s ? ((b0 << s) | (b0 >> (8 - s))) : b0;
   b1 = s ? ((b1 << s) | (b1 >> (8 - s))) : b1;
   b2 = s ? ((b2 << s) | (b2 >> (8 - s))) : b2;
   b3 = s ? ((b3 << s) | (b3 >> (8 - s))) : b3;
   return (b3 << 24) | (b2 << 16) | (b1 << 8) | (b0 << 0);
}

// Fixes via: Greg Rose, KISS: A Bit Too Simple. http://eprint.iacr.org/2011/007
static unsigned int z=362436069,w=521288629,jsr=362436069,jcong=123456789;
#define znew (z=36969*(z&0xffff)+(z>>16))
#define wnew (w=18000*(w&0xffff)+(w>>16))
#define MWC  ((znew<<16)+wnew)
#define SHR3 (jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5)) /* 2^32-1 */
#define CONG (jcong=69069*jcong+13579)                     /* 2^32 */
#define KISS ((MWC^CONG)+SHR3)

// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaThreadSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

int main (void)
{
    uint32_t arg, ref, res = 0, *res_d = 0;
    uint32_t shft;

    CUDA_SAFE_CALL (cudaMalloc ((void**)&res_d, sizeof(*res_d)));
    for (int i = 0; i < 100000; i++) {
        arg  = KISS;
        shft = KISS;
        ref = ref_per_byte_bit_left_rotate (arg, shft);
        rotl_kernel <<<1,1>>>(arg, shft, res_d);
        CHECK_LAUNCH_ERROR();
        CUDA_SAFE_CALL (cudaMemcpy (&res, res_d, sizeof (res), 
                                    cudaMemcpyDeviceToHost));
        if (res != ref) {
            printf ("!!!! arg=%08x shft=%d  res=%08x  ref=%08x\n", 
                    arg, shft, res, ref);
        }
    }
    CUDA_SAFE_CALL (cudaFree (res_d));
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    return EXIT_SUCCESS;
}

Run Code Online (Sandbox Code Playgroud)

Answer 2

Pet*_*des 4

所有元素的旋转计数都相同，对吧？

将整个输入左移和右移，然后对那些带有将所有跨越字节边界的位归零的掩码进行 AND 的操作，对于一次 AND 中的所有 4 个字节。我认为amount始终是 AES 中的编译时常量，因此您不必担心动态生成掩码的运行时成本。让编译器来做就可以了。（IDK CUDA，但这似乎与为普通 C++ 编写具有 32 位整数的SWAR 位黑客相同的问题）

这是基于通常的(x << count) | (x >> (32-count))循环习惯用法，通过掩码和不同的右移计数使其成为单独的 8 位循环。

inline
uint32_t per_byte_bit_left_rotate(uint32_t input, unsigned amount)
{
    // With constant amount, the left/right masks are constants
    uint32_t rmask = 0xFF >> ((8 - amount) & 7);
    rmask = (rmask<<24 | rmask<<16 | rmask<<8 | rmask);
    uint32_t lmask = ~rmask;

    uint32_t lshift = input << amount;
    lshift &= lmask;
    if (amount == 1) {  // special case left-shift by 1 using an in-lane add instead of shift&mask
        lshift = __vadd4(input, input);
    }
    uint32_t rshift = input >> ((8 - amount) & 7);
    rshift &= rmask;

    uint32_t rotated = lshift | rshift;
    return rotated;
}

Run Code Online (Sandbox Code Playgroud)

(in&lmask)<<amount | ((in>>(8-amount))&rmask)在移位之前以一种方式屏蔽输入，并在移位之后屏蔽输出（使用不同的 lmask）可能会更有效。NVidia 硬件是有序超标量，并且移位的吞吐量有限。这样做更有可能作为两个独立的移位+掩码对执行。

（这并不试图避免 C++ UB 的 amount>=32。请参阅C++ 中循环移位（旋转）操作的最佳实践。在这种情况下，我认为更改为lshift = input << (amount & 7)可以解决问题。

为了测试编译是否有效，我使用常量查看了 x86-64 的clang -O3asm 输出amount。Godbolt 编译器资源管理器具有适用于各种架构的编译器（尽管不是 CUDA），因此如果您可以比 x86 更轻松地阅读这些 asm 语言，请单击该链接并转到 ARM、MIPS 或 PowerPC。

uint32_t rol7(uint32_t a) {
    return per_byte_bit_left_rotate(a, 7);
}
    mov     eax, edi
    shl     eax, 7
    shr     edi
    and     eax, -2139062144   # 0x80808080
    and     edi, 2139062143    # 0x7F7F7F7F
    lea     eax, [rdi + rax]   # ADD = OR when no bits intersect
    ret

Run Code Online (Sandbox Code Playgroud)

完美，正是我所希望的。

几个测试用例：

uint32_t test_rol() {
    return per_byte_bit_left_rotate(0x02ffff04, 0);
}
    // yup, returns the input with count=0
    // return 0x2FFFF04


uint32_t test2_rol() {
    return per_byte_bit_left_rotate(0x02f73804, 4);
}
    // yup, swaps nibbles
    // return 0x207F8340

Run Code Online (Sandbox Code Playgroud)

这与使用 x86 SSE2 / AVX2 进行 8 位移位需要执行的操作相同，因为硬件支持的最小位移粒度是 16 位。

归档时间：	8 年，6 月前
查看次数：	1103 次
最近记录：	8 年，6 月前