Packing 10-bit values into a byte stream with SIMD

Mar*_*ata 5 c++ x86 bit-manipulation simd

I'm trying to pack 10-bit pixels into a contiguous byte stream using SIMD instructions. The code below works "in principle", but the SIMD version is slower than the scalar version.

The problem seems to be that I can't find good gather/scatter operations to load the registers efficiently.

Any suggestions for improvement?

// SIMD_test.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"

#include "Windows.h"
#include <tmmintrin.h>
#include <stdint.h>
#include <string.h>

// reference non-SIMD implementation that "works"
// 4 uint16 at a time as input, and 5 uint8 as output per loop iteration

void packSlow(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    for(uint32_t j=0;j<NCOL;j+=4)
    {
        streamBuffer[0] = (uint8_t)(ptr[0]);
        streamBuffer[1] = (uint8_t)(((ptr[0]&0x3FF)>>8) | ((ptr[1]&0x3F) <<2));
        streamBuffer[2] = (uint8_t)(((ptr[1]&0x3FF)>>6) | ((ptr[2]&0x0F) <<4));
        streamBuffer[3] = (uint8_t)(((ptr[2]&0x3FF)>>4) | ((ptr[3]&0x03) <<6));
        streamBuffer[4] = (uint8_t)((ptr[3]&0x3FF)>>2) ;
        streamBuffer += 5;
        ptr += 4;
    }
}


// poorly written SIMD implementation. Attempts to do the same
// as the packSlow, but 8 iterations at a time

void packFast(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    const __m128i maska = _mm_set_epi16(0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF);
    const __m128i maskb = _mm_set_epi16(0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F);
    const __m128i maskc = _mm_set_epi16(0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F);
    const __m128i maskd = _mm_set_epi16(0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03);

    for(uint32_t j=0;j<NCOL;j+=4*8)
    {
        _mm_prefetch((const char*)(ptr+j),_MM_HINT_T0);
    }

    for(uint32_t j=0;j<NCOL;j+=4*8)
    {
        // this "fetch" stage is costly. Each term takes 2 cycles
        __m128i ptr0 = _mm_set_epi16(ptr[0],ptr[4],ptr[8],ptr[12],ptr[16],ptr[20],ptr[24],ptr[28]);
        __m128i ptr1 = _mm_set_epi16(ptr[1],ptr[5],ptr[9],ptr[13],ptr[17],ptr[21],ptr[25],ptr[29]);
        __m128i ptr2 = _mm_set_epi16(ptr[2],ptr[6],ptr[10],ptr[14],ptr[18],ptr[22],ptr[26],ptr[30]);
        __m128i ptr3 = _mm_set_epi16(ptr[3],ptr[7],ptr[11],ptr[15],ptr[19],ptr[23],ptr[27],ptr[31]);

        // I think this part is fairly well optimized
        __m128i streamBuffer0 =  ptr0;
        __m128i streamBuffer1 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr0 , maska), _mm_set_epi32(0, 0, 0,8)) , _mm_sll_epi16 (_mm_and_si128 (ptr1 , maskb) , _mm_set_epi32(0, 0, 0,2)));
        __m128i streamBuffer2 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr1 , maska), _mm_set_epi32(0, 0, 0,6)) , _mm_sll_epi16 (_mm_and_si128 (ptr2 , maskc) , _mm_set_epi32(0, 0, 0,4)));
        __m128i streamBuffer3 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr2 , maska), _mm_set_epi32(0, 0, 0,4)) , _mm_sll_epi16 (_mm_and_si128 (ptr3 , maskd) , _mm_set_epi32(0, 0, 0,6)));
        __m128i streamBuffer4 = _mm_srl_epi16 (_mm_and_si128 (ptr3 , maska), _mm_set_epi32(0, 0, 0,2)) ;

        // this again is terribly slow. ~2 cycles per byte output
        for(int j=15;j>=0;j-=2)
        {
            streamBuffer[0] = streamBuffer0.m128i_u8[j];
            streamBuffer[1] = streamBuffer1.m128i_u8[j];
            streamBuffer[2] = streamBuffer2.m128i_u8[j];
            streamBuffer[3] = streamBuffer3.m128i_u8[j];
            streamBuffer[4] = streamBuffer4.m128i_u8[j];
            streamBuffer += 5;
        }
        ptr += 32;
    }

}

int _tmain(int argc, _TCHAR* argv[])
{

    uint16_t pixels[512];
    uint8_t packed1[512*10/8];
    uint8_t packed2[512*10/8];

    for(int i=0;i<512;i++)
    {
        pixels[i] = i;
    }

    LARGE_INTEGER t0,t1,t2;

    QueryPerformanceCounter(&t0);
    for(int k=0;k<1000;k++) packSlow(pixels,packed1,512);
    QueryPerformanceCounter(&t1);
    for(int k=0;k<1000;k++) packFast(pixels,packed2,512);
    QueryPerformanceCounter(&t2);

    printf("%d %d\n",t1.QuadPart-t0.QuadPart,t2.QuadPart-t1.QuadPart);

    if (memcmp(packed1,packed2,sizeof(packed1)))
    {
        printf("failed\n");
    }


    return 0;
}

Jef*_*eff 3

Re-reading your code, it looks like you are almost certainly murdering your load/store unit, and even the new AVX2 VGATHER[D/Q]P[D/S] instruction family won't fully relieve that. Even Haswell's microarchitecture still requires one uop per loaded element, and every one of those element loads hits the L1D TLB and cache regardless of locality; the efficiency improvements don't show up until Skylake, in 2016 at the earliest.

For now, your best bet is probably to do 16B register reads and construct your streamBuffer values manually, using register copies, _mm_shuffle_epi8() and _mm_or_si128() calls, and the inverse of those operations to marshal the results out to the stores.
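A rough sketch of that approach might look like the following (call it packShuffle; untested, and assuming the inputs are clean 10-bit values, NCOL is a multiple of 8, and the output buffer has at least 6 bytes of slack after the last group, since each iteration stores 16 bytes but only advances by 10):

#include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8 (pulls in SSE2 as well)
#include <stdint.h>

void packShuffle(const uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    // v(2i) + 1024*v(2i+1) per 32-bit lane == v(2i) | v(2i+1)<<10 for clean 10-bit inputs
    const __m128i mulPairs = _mm_set_epi16(1024, 1, 1024, 1, 1024, 1, 1024, 1);
    const __m128i low32    = _mm_set_epi32(0, -1, 0, -1);
    // move bytes 0-4 (group 0) and 8-12 (group 1) down to the low 10 bytes
    const __m128i compact  = _mm_set_epi8(-1, -1, -1, -1, -1, -1,
                                          12, 11, 10, 9, 8,
                                          4, 3, 2, 1, 0);

    for (uint32_t j = 0; j < NCOL; j += 8)
    {
        __m128i v = _mm_loadu_si128((const __m128i*)(ptr + j));   // v0..v7, contiguous

        // four 32-bit lanes, each holding a 20-bit pair v(2i) | v(2i+1)<<10
        __m128i pairs = _mm_madd_epi16(v, mulPairs);

        // per 64-bit lane: even32 | odd32<<20 -> one 40-bit group in the low 5 bytes
        __m128i even  = _mm_and_si128(pairs, low32);
        __m128i odd   = _mm_srli_epi64(_mm_andnot_si128(low32, pairs), 12);
        __m128i group = _mm_or_si128(even, odd);

        // compact the two 5-byte groups into 10 contiguous bytes and store
        _mm_storeu_si128((__m128i*)streamBuffer, _mm_shuffle_epi8(group, compact));
        streamBuffer += 10;   // only 10 of the 16 stored bytes are kept
    }
}

For the 10-bit test data in the question this should produce the same bytes as packSlow, so the existing memcmp check can be used to validate it.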

In the near future, AVX2 will provide (and on the newer desktop parts already provides) the VPS[LL/RL/RA]V[D/Q] instructions, which allow variable per-element shifts; combined with a horizontal add, these could do this packing very quickly. In that case you could load the values with plain MOVDQU instructions, since you can process contiguous uint16_t input values in a single xmm register.
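As a sketch of how the variable shifts could be used (again untested, and deliberately handling only 4 values / 5 output bytes per iteration so the instruction usage stays obvious; it assumes clean 10-bit inputs, NCOL a multiple of 4, and 3 bytes of output slack because it stores 8 bytes while advancing by 5):

#include <immintrin.h>   // AVX2
#include <stdint.h>

void packVarShift(const uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    const __m256i shifts = _mm256_set_epi64x(30, 20, 10, 0);

    for (uint32_t j = 0; j < NCOL; j += 4)
    {
        // plain unaligned load of 4 contiguous uint16_t, zero-extended to 64-bit lanes
        __m128i v16 = _mm_loadl_epi64((const __m128i*)(ptr + j));
        __m256i v64 = _mm256_cvtepu16_epi64(v16);

        // VPSLLVQ: v0<<0, v1<<10, v2<<20, v3<<30, each in its own 64-bit lane
        __m256i sh = _mm256_sllv_epi64(v64, shifts);

        // horizontal combine of the four lanes into one 40-bit group
        __m128i lo = _mm256_castsi256_si128(sh);
        __m128i hi = _mm256_extracti128_si256(sh, 1);
        __m128i g  = _mm_or_si128(lo, hi);
        g = _mm_or_si128(g, _mm_unpackhi_epi64(g, g));

        // store 8 bytes; only the low 5 are the packed group
        _mm_storel_epi64((__m128i*)streamBuffer, g);
        streamBuffer += 5;
    }
}

Since the shifted fields never overlap, the horizontal combine can be an OR instead of an add; a tuned version would also process more than one group per load.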

Also, consider reworking your prefetching. Your j-over-NCOL loop consumes 64B, i.e. one cache line, per iteration, so you should issue a single prefetch for ptr + 32 at the start of the second loop body. You might even consider omitting it entirely, since this is a simple forward scan that the hardware prefetcher will detect and handle for you after a tiny number of iterations anyway.
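In other words, the warm-up loop could be dropped and the main loop could issue a single prefetch per iteration, roughly like this:

    for (uint32_t j = 0; j < NCOL; j += 4 * 8)
    {
        // one iteration consumes 32 uint16_t = 64B = one cache line,
        // so a single prefetch of the next line is enough (and likely
        // redundant, given the hardware prefetcher)
        _mm_prefetch((const char*)(ptr + 32), _MM_HINT_T0);

        // ... the rest of the packFast loop body, unchanged ...

        ptr += 32;
    }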