请告诉我,我自己也搞不懂:
这里我有__m128iSIMD 向量 - 每个 16 个字节都包含以下值:
1 0 1 1 0 1 0 1 1 1 0 1 0 1 0 1
是否可以以某种方式变换该向量,以便删除所有 1,并且零的位置是该零向量中元素的编号。也就是说,像这样:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1 0 1 1 0 1 0 1 1 1 0 1 0 1 0 1
1 4 6 10 12 14
Run Code Online (Sandbox Code Playgroud)
最后得到一个只有这些值的向量:
1 4 6 10 12 14
Run Code Online (Sandbox Code Playgroud)
什么样的逻辑才能得到这样的结果呢?应使用哪些 SIMD 指令?
PS:我刚刚开始学习SIMD - 所以我知道的不多。我不明白。
如果您有BMI2,请使用以下版本。
\n__m128i compressZeroIndices_bmi2( __m128i v )\n{\n const __m128i zero = _mm_setzero_si128();\n // Replace zeros with 0xFF\n v = _mm_cmpeq_epi8( v, zero );\n\n // Extract low/high pieces into scalar registers for PEXT instruction\n uint64_t low = (uint64_t)_mm_cvtsi128_si64( v );\n uint64_t high = (uint64_t)_mm_extract_epi64( v, 1 );\n\n // Count payload bytes in the complete vector\n v = _mm_sub_epi8( zero, v );\n v = _mm_sad_epu8( v, zero );\n v = _mm_add_epi64( v, _mm_srli_si128( v, 8 ) );\n v = _mm_shuffle_epi8( v, zero );\n // Make a mask vector filled with 0 for payload bytes, 0xFF for padding\n const __m128i identity = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );\n v = _mm_max_epu8( v, identity );\n __m128i mask = _mm_cmpeq_epi8( v, identity );\n\n // The following line requires C++/20\n // If you don\'t have it, use #ifdef _MSC_VER to switch between __popcnt64() and _popcnt64() intrinsics.\n uint64_t lowBits = std::popcount( low );\n // Use BMI2 to gather these indices\n low = _pext_u64( 0x0706050403020100ull, low );\n high = _pext_u64( 0x0F0E0D0C0B0A0908ull, high );\n\n // Merge payload into a vector\n v = _mm_cvtsi64_si128( low | ( high << lowBits ) );\n v = _mm_insert_epi64( v, high >> ( 64 - lowBits ), 1 );\n\n // Apply the mask to set unused elements to -1, enables pmovmskb + tzcnt to find the length\n return _mm_or_si128( v, mask );\n}\nRun Code Online (Sandbox Code Playgroud)\n这里\xe2\x80\x99是另一个没有BMI2的版本。在大多数 CPU 上可能速度较慢,但代码更简单,并且不使用任何标量指令。
\ninline __m128i sortStep( __m128i a, __m128i perm, __m128i blend )\n{\n // The min/max are independent and their throughput is 0.33-0.5 cycles,\n // so this whole function only takes 3 (AMD) or 4 (Intel) cycles to complete\n __m128i b = _mm_shuffle_epi8( a, perm );\n __m128i i = _mm_min_epu8( a, b );\n __m128i ax = _mm_max_epu8( a, b );\n return _mm_blendv_epi8( i, ax, blend );\n}\n\n__m128i compressZeroIndices( __m128i v )\n{\n // Replace zeros with 0-based indices, ones with 0xFF\n v = _mm_cmpgt_epi8( v, _mm_setzero_si128() );\n const __m128i identity = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );\n v = _mm_or_si128( v, identity );\n\n // Sort bytes in the vector with a network\n // https://demonstrations.wolfram.com/SortingNetworks/\n // Click the "transposition" algorithm on that demo\n const __m128i perm1 = _mm_setr_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 );\n const __m128i blend1 = _mm_set1_epi16( (short)0xFF00 );\n const __m128i perm2 = _mm_setr_epi8( 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 15 );\n const __m128i blend2 = _mm_setr_epi8( 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0 );\n for( size_t i = 0; i < 8; i++ )\n {\n v = sortStep( v, perm1, blend1 );\n v = sortStep( v, perm2, blend2 );\n }\n return v;\n}\nRun Code Online (Sandbox Code Playgroud)\nPS 如果您想要输出向量的长度,请使用此函数:
\nuint32_t vectorLength( __m128i v )\n{\n uint32_t mask = (uint32_t)_mm_movemask_epi8( v );\n mask |= 0x10000;\n return _tzcnt_u32( mask );\n}\nRun Code Online (Sandbox Code Playgroud)\n