快速计算__m128i寄存器中的设置位数

enz*_*m83 11 c c++ sse simd sse2

我应该计算__m128i寄存器的设置位数.特别是,我应该使用以下方法编写两个能够计算寄存器位数的函数.

  1. 寄存器的设定位总数.
  2. 寄存器的每个字节的设置位数.

是否有可以完全或部分执行上述操作的内在功能?

Mar*_*han 24

以下是我在旧项目中使用的一些代码(有关于它的研究论文).popcnt8下面的函数计算每个字节中设置的位数.

SSE2-only版本(基于Hacker's Delight书中的算法3 ):

static const __m128i popcount_mask1 = _mm_set1_epi8(0x77);
static const __m128i popcount_mask2 = _mm_set1_epi8(0x0F);
static inline __m128i popcnt8(__m128i x) {
    __m128i n;
    // Count bits in each 4-bit field.
    n = _mm_srli_epi64(x, 1);
    n = _mm_and_si128(popcount_mask1, n);
    x = _mm_sub_epi8(x, n);
    n = _mm_srli_epi64(n, 1);
    n = _mm_and_si128(popcount_mask1, n);
    x = _mm_sub_epi8(x, n);
    n = _mm_srli_epi64(n, 1);
    n = _mm_and_si128(popcount_mask1, n);
    x = _mm_sub_epi8(x, n);
    x = _mm_add_epi8(x, _mm_srli_epi16(x, 4));
    x = _mm_and_si128(popcount_mask2, x);
    return x;
}
Run Code Online (Sandbox Code Playgroud)

SSSE3版本(由于Wojciech Mula):

static const __m128i popcount_mask = _mm_set1_epi8(0x0F);
static const __m128i popcount_table = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
static inline __m128i popcnt8(__m128i n) {
    const __m128i pcnt0 = _mm_shuffle_epi8(popcount_table, _mm_and_si128(n, popcount_mask));
    const __m128i pcnt1 = _mm_shuffle_epi8(popcount_table, _mm_and_si128(_mm_srli_epi16(n, 4), popcount_mask));
    return _mm_add_epi8(pcnt0, pcnt1);
}
Run Code Online (Sandbox Code Playgroud)

XOP版本(相当于SSSE3,但使用XOP指令,这些指令在AMD Bulldozer上更快)

static const __m128i popcount_mask = _mm_set1_epi8(0x0F);
static const __m128i popcount_table = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
static const __m128i popcount_shift = _mm_set1_epi8(-4);
static inline __m128i popcount8(__m128i n) {
    const __m128i pcnt0 = _mm_perm_epi8(popcount_table, popcount_table, _mm_and_si128(n, popcount_mask));
    const __m128i pcnt1 = _mm_perm_epi8(popcount_table, popcount_table, _mm_shl_epi8(n, popcount_shift));
    return _mm_add_epi8(pcnt0, pcnt1);
}
Run Code Online (Sandbox Code Playgroud)

功能 popcnt64 below counts the number of bits in the low and high 64-bit parts of the SSE register:

SSE2版本:

static inline __m128i popcnt64(__m128i n) {
    const __m128i cnt8 = popcnt8(n);
    return _mm_sad_epu8(cnt8, _mm_setzero_si128());
}
Run Code Online (Sandbox Code Playgroud)

XOP版本:

static inline __m128i popcnt64(__m128i n) {
    const __m128i cnt8 = popcnt8(n);
    return _mm_haddq_epi8(cnt8);
}
Run Code Online (Sandbox Code Playgroud)

最后,popcnt128下面的函数计算整个128位寄存器中的位数:

static inline int popcnt128(__m128i n) {
    const __m128i cnt64 = popcnt64(n);
    const __m128i cnt64_hi = _mm_unpackhi_epi64(cnt64, cnt64);
    const __m128i cnt128 = _mm_add_epi32(cnt64, cnt64_hi);
    return _mm_cvtsi128_si32(cnt128);
}
Run Code Online (Sandbox Code Playgroud)

但是,更有效的实现方法popcnt128是使用硬件POPCNT指令(在支持它的处理器上):

static inline int popcnt128(__m128i n) {
    const __m128i n_hi = _mm_unpackhi_epi64(n, n);
    #ifdef _MSC_VER
        return __popcnt64(_mm_cvtsi128_si64(n)) + __popcnt64(_mm_cvtsi128_si64(n_hi));
    #else
        return __popcntq(_mm_cvtsi128_si64(n)) + __popcntq(_mm_cvtsi128_si64(n_hi));
    #endif
}
Run Code Online (Sandbox Code Playgroud)

  • 好像你是上述研究论文的共同作者之一:-)对于cut'n'paste工作人员的好总结.您的解决方案是最新的.哈克姆的技巧不再是最新的了.荣道,老兄! (2认同)
  • 哦,太糟糕了.你在ACM上发表了你的论文,所以我不能不花钱15美元阅读它:-( (2认同)