使用 SIMD (ARM) 的快速位矩阵 (64x64) 转置算法

sou*_*wal 5 assembly transpose arm simd neon

我想了解是否有一种快速方法可以使用 ARM SIMD 指令进行矩阵转置(64x64 位)。

我尝试探索ARM SIMD的VTRN指令,但不确定它在这种情况下的有效应用。

输入矩阵表示为 uint64 mat[64],输出应该是按位转置。

例如,如果输入是:

0000....
1111....
0000....
1111....
Run Code Online (Sandbox Code Playgroud)

预期输出:

0101....
0101....
0101....
0101....
Run Code Online (Sandbox Code Playgroud)

fuz*_*fuz 6

矩阵转置的基本递归方案是将矩阵表示为分块矩阵

\n
AB\nCD\n
Run Code Online (Sandbox Code Playgroud)\n

您可以通过首先转置 A、B、C 和 D 中的每一个,然后交换 B 和 C 来转置。在实践中,这意味着应用一系列越来越粗略的 swizzle 步骤,首先使用按位运算,然后使用排列运算。

\n

例如,这可以按如下方式实现:

\n
    # transpose a 64x64 bit matrix held in x0\n    GLOBL(xpose_asm)\nFUNC(xpose_asm)\n    # plan of attack: use registers v16--v32 to hold\n    # half the array, v0--v7 for scratch.  First transpose\n    # the two array halves individually, then swap the\n    # second and third quarters.\n    mov x4, lr\n\n    mov x2, x0\n    bl  NAME(xpose_half)\n    mov x3, x0\n    bl  NAME(xpose_half)\n\n    # final step: transpose 64x64 bit matrices\n    # we have to do this one in two parts as to not run\n    # out of registers\n    mov x5, x2\n    mov x6, x3\n    bl  NAME(xpose_final)\n    bl  NAME(xpose_final)\n\n    ret x4\nENDFUNC(xpose_asm)\n\n    # Transpose half a 32x64 bit matrix held in x0.\n    # On return, advance x0 by 32*8 = 256 byte.\nFUNC(xpose_half)\n    # v16 holds rows 0 and 4, v17 holds 1 and 5, and so on\n    mov x1, x0\n    ld4 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64\n    ld4 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64\n    ld4 {v24.2d, v25.2d, v26.2d, v27.2d}, [x0], #64\n    ld4 {v28.2d, v29.2d, v30.2d, v31.2d}, [x0], #64\n\n    # macro for a transposition step.  Trashes v6 and v7\n.macro  xpstep lo, hi, mask, shift\n    ushr v6.2d, \\lo\\().2d, #\\shift\n    shl v7.2d, \\hi\\().2d, #\\shift\n    bif \\lo\\().16b, v7.16b, \\mask\\().16b\n    bit \\hi\\().16b, v6.16b, \\mask\\().16b\n.endm\n\n    # 1st step: transpose 2x2 bit matrices\n    movi    v0.16b, #0x55\n    xpstep  v16, v17, v0, 1\n    xpstep  v18, v19, v0, 1\n    xpstep  v20, v21, v0, 1\n    xpstep  v22, v23, v0, 1\n    xpstep  v24, v25, v0, 1\n    xpstep  v26, v27, v0, 1\n    xpstep  v28, v29, v0, 1\n    xpstep  v30, v31, v0, 1\n\n    # 2nd step: transpose 4x4 bit matrices\n    movi    v0.16b, #0x33\n    xpstep  v16, v18, v0, 2\n    xpstep  v17, v19, v0, 2\n    xpstep  v20, v22, v0, 2\n    xpstep  v21, v23, v0, 2\n    xpstep  v24, v26, v0, 2\n    xpstep  v25, v27, v0, 2\n    xpstep  v28, v30, v0, 2\n    xpstep  v29, v31, v0, 2\n\n    # immediate step: zip vectors to change\n    # colocation.  As a side effect, every other\n    # vector is temporarily relocated to the v0..v7\n    # register range\n    zip1    v0.2d,  v16.2d, v17.2d\n    zip2    v17.2d, v16.2d, v17.2d\n    zip1    v1.2d,  v18.2d, v19.2d\n    zip2    v19.2d, v18.2d, v19.2d\n    zip1    v2.2d,  v20.2d, v21.2d\n    zip2    v21.2d, v20.2d, v21.2d\n    zip1    v3.2d,  v22.2d, v23.2d\n    zip2    v23.2d, v22.2d, v23.2d\n    zip1    v4.2d,  v24.2d, v25.2d\n    zip2    v25.2d, v24.2d, v25.2d\n    zip1    v5.2d,  v26.2d, v27.2d\n    zip2    v27.2d, v26.2d, v27.2d\n    zip1    v6.2d,  v28.2d, v29.2d\n    zip2    v29.2d, v28.2d, v29.2d\n    zip1    v7.2d,  v30.2d, v31.2d\n    zip2    v31.2d, v30.2d, v31.2d\n\n    # macro for the 3rd transposition step\n    # swap low 4 bit of each hi member with\n    # high 4 bit of each orig member.  The orig\n    # members are copied to lo in the process.\n.macro  xpstep3 lo, hi, orig\n    mov \\lo\\().16b, \\orig\\().16b\n    sli \\lo\\().16b, \\hi\\().16b, #4\n    sri \\hi\\().16b, \\orig\\().16b, #4\n.endm\n\n    # 3rd step: transpose 8x8 bit matrices\n    # special code is needed here since we need to\n    # swap row n row line n+4, but these rows are\n    # always colocated in the same register\n    xpstep3 v16, v17, v0\n    xpstep3 v18, v19, v1\n    xpstep3 v20, v21, v2\n    xpstep3 v22, v23, v3\n    xpstep3 v24, v25, v4\n    xpstep3 v26, v27, v5\n    xpstep3 v28, v29, v6\n    xpstep3 v30, v31, v7\n\n    # registers now hold\n    # v16: { 0,  1}  v17: { 4,  5}  v18: { 2,  3}  v19: { 6,  7}\n    # v20: { 8,  9}  v21: {12, 13}  v22: {10, 11}  v23: {14, 15}\n    # v24: {16, 17}  v25: {20, 21}  v26: {18, 19}  v27: {22, 23}\n    # v28: {24, 25}  v29: {28, 29}  v30: {26, 27}  v31: {30, 31}\n\n    # 4th step: transpose 16x16 bit matrices\n    # this step again moves half the registers to v0--v7\n    trn1    v0.16b,  v16.16b, v20.16b\n    trn2    v20.16b, v16.16b, v20.16b\n    trn1    v1.16b,  v17.16b, v21.16b\n    trn2    v21.16b, v17.16b, v21.16b\n    trn1    v2.16b,  v18.16b, v22.16b\n    trn2    v22.16b, v18.16b, v22.16b\n    trn1    v3.16b,  v19.16b, v23.16b\n    trn2    v23.16b, v19.16b, v23.16b\n    trn1    v4.16b,  v24.16b, v28.16b\n    trn2    v28.16b, v24.16b, v28.16b\n    trn1    v5.16b,  v25.16b, v29.16b\n    trn2    v29.16b, v25.16b, v29.16b\n    trn1    v6.16b,  v26.16b, v30.16b\n    trn2    v30.16b, v26.16b, v30.16b\n    trn1    v7.16b,  v27.16b, v31.16b\n    trn2    v31.16b, v27.16b, v31.16b\n\n    # 5th step: transpose 32x32 bit matrices\n    # while we are at it, shuffle the order of\n    # entries such that they are in order\n    trn1    v16.8h, v0.8h, v4.8h\n    trn2    v24.8h, v0.8h, v4.8h\n    trn1    v18.8h, v1.8h, v5.8h\n    trn2    v26.8h, v1.8h, v5.8h\n    trn1    v17.8h, v2.8h, v6.8h\n    trn2    v25.8h, v2.8h, v6.8h\n    trn1    v19.8h, v3.8h, v7.8h\n    trn2    v27.8h, v3.8h, v7.8h\n\n    trn1    v0.8h, v20.8h, v28.8h\n    trn2    v4.8h, v20.8h, v28.8h\n    trn1    v2.8h, v21.8h, v29.8h\n    trn2    v6.8h, v21.8h, v29.8h\n    trn1    v1.8h, v22.8h, v30.8h\n    trn2    v5.8h, v22.8h, v30.8h\n    trn1    v3.8h, v23.8h, v31.8h\n    trn2    v7.8h, v23.8h, v31.8h\n\n    # now deposit the partially transposed matrix\n    st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x1], #64\n    st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x1], #64\n    st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x1], #64\n    st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x1], #64\n\n    ret\nENDFUNC(xpose_half)\n\nFUNC(xpose_final)\n    ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x2], #64\n    ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x3], #64\n    ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x2], #64\n    ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [x3], #64\n\n    trn1    v0.4s, v16.4s, v24.4s\n    trn2    v4.4s, v16.4s, v24.4s\n    trn1    v1.4s, v17.4s, v25.4s\n    trn2    v5.4s, v17.4s, v25.4s\n    trn1    v2.4s, v18.4s, v26.4s\n    trn2    v6.4s, v18.4s, v26.4s\n    trn1    v3.4s, v19.4s, v27.4s\n    trn2    v7.4s, v19.4s, v27.4s\n\n    trn1    v16.4s, v20.4s, v28.4s\n    trn2    v24.4s, v20.4s, v28.4s\n    trn1    v17.4s, v21.4s, v29.4s\n    trn2    v25.4s, v21.4s, v29.4s\n    trn1    v18.4s, v22.4s, v30.4s\n    trn2    v26.4s, v22.4s, v30.4s\n    trn1    v19.4s, v23.4s, v31.4s\n    trn2    v27.4s, v23.4s, v31.4s\n\n    st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x5], #64\n    st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x6], #64\n    st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x5], #64\n    st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x6], #64\n\n    ret\nENDFUNC(xpose_final)\n
Run Code Online (Sandbox Code Playgroud)\n

我们可以看到,其性能与Lee 的方法相当,大约快了三倍。

\n
# Apple M1\nname  time/op\nRef      764ns \xc2\xb1 0%\nLee      102ns \xc2\xb1 0%\nFuz     34.7ns \xc2\xb1 0%\n\nname  speed\nRef    670MB/s \xc2\xb1 0%\nLee   5.01GB/s \xc2\xb1 0%\nFuz   14.7GB/s \xc2\xb1 0%\n\n\n# Kunpeng 920\nname  time/op\nRef     3.73\xc2\xb5s \xc2\xb1 0%\nLee      391ns \xc2\xb1 1%\nFuz     96.0ns \xc2\xb1 0%\n\nname  speed\nRef    137MB/s \xc2\xb1 0%\nLee   1.31GB/s \xc2\xb1 1%\nFuz   5.33GB/s \xc2\xb1 0%\n\n\n# ARM Cortex A72\nname  time/op\nRef     8.13\xc2\xb5s \xc2\xb1 0%\nLee      892ns \xc2\xb1 0%\nFuz      296ns \xc2\xb1 0%\n\nname  speed\nRef   63.0MB/s \xc2\xb1 0%\nLee    574MB/s \xc2\xb1 0%\nFuz   1.73GB/s \xc2\xb1 0%\n\n\n# Cavium ThunderX\nname  time/op\nRef     19.7\xc2\xb5s \xc2\xb1 0%\nLee     1.15\xc2\xb5s \xc2\xb1 0%\nFuz      690ns \xc2\xb1 0%\n\nname  speed\nRef   25.9MB/s \xc2\xb1 0%\nLee    444MB/s \xc2\xb1 0%\nFuz    742MB/s \xc2\xb1 0%\n
Run Code Online (Sandbox Code Playgroud)\n

进一步的改进是可能的。例如,合适的排列掩码可以与tbl指令集一起使用以一次执行多个转置步骤(尤其是步骤3至5)。

\n

请注意,该算法只需加载和写出数组两次。一次转置每个 32x32 子数组(两次调用xpose_half),再一次将右上角与左下角的 32x32 子数组交换。在这两种情况下,都使用最大宽度 64 字节加载和存储,将内存操作量减少到最低限度。

\n