Roy*_*oyi 5 x86 sse x86-64 simd vectorization
1)有没有办法使用具有以下特征的SSE3(无SSE4)有效地实现符号功能?
输入是一个浮点向量 __m128,输出也应该是 __m128,其各元素取值为 [-1.0f, 0.0f, 1.0f] 之一。我尝试过下面的代码,但它不起作用(虽然我认为它应该可以):
// Attempted sign computation from the question. It does NOT produce
// -1.0f / 0.0f / 1.0f; the comments below explain why.
inputVal = _mm_set_ps(-0.5, 0.5, 0.0, 3.0);
// comp1 marks lanes where 0 > x (negative inputs), comp2 marks lanes where
// x > 0. Each comparison yields a per-lane bit mask (all ones when true,
// all zeros when false), not the number 1.0f.
comp1 = _mm_cmpgt_ps(_mm_setzero_ps(), inputVal);
comp2 = _mm_cmpgt_ps(inputVal, _mm_setzero_ps());
// Casting to __m128i and straight back only reinterprets the bits, so these
// two lines leave both masks unchanged.
comp1 = _mm_castsi128_ps(_mm_castps_si128(comp1));
comp2 = _mm_castsi128_ps(_mm_castps_si128(comp2));
// An all-ones lane is a NaN bit pattern when read as a float, so this
// floating-point subtraction gives NaN for every nonzero input lane.
signVal = _mm_sub_ps(comp1, comp2);
Run Code Online (Sandbox Code Playgroud)
2)有没有办法创建"标志"函数(我不确定正确的名称)?即,如果 A > B,结果为 1,否则为 0。结果应该是浮点型(__m128),与其输入相同。
更新:Cory Nelson的回答似乎在这里有效:
__m128 greatherThanFlag = _mm_and_ps(_mm_cmpgt_ps(valA, valB), _mm_set1_ps(1.0f));
__m128 lessThanFlag = _mm_and_ps(_mm_cmplt_ps(valA, valB), _mm_set1_ps(1.0f));
Run Code Online (Sandbox Code Playgroud)
首先想到的可能是最简单的:
// Per-lane sign of four packed floats.
// Returns 1.0f for lanes greater than zero, -1.0f for lanes less than zero,
// and 0.0f for every other lane (both comparisons are false there).
__m128 sign(__m128 x)
{
    const __m128 origin = _mm_setzero_ps();

    // Comparison results are all-ones / all-zeros lane masks.
    const __m128 is_pos = _mm_cmpgt_ps(x, origin);
    const __m128 is_neg = _mm_cmplt_ps(x, origin);

    // Use each mask to keep or clear the matching constant.
    const __m128 plus_one = _mm_and_ps(is_pos, _mm_set1_ps(1.0f));
    const __m128 minus_one = _mm_and_ps(is_neg, _mm_set1_ps(-1.0f));

    // At most one of the two is nonzero per lane, so OR merges them.
    return _mm_or_ps(plus_one, minus_one);
}
Run Code Online (Sandbox Code Playgroud)
或者,如果您表述有误,实际想要的是整数结果:
// Per-lane sign of four packed floats, returned as 32-bit integers.
// Returns 1 for lanes greater than zero, -1 for lanes less than zero,
// and 0 for every other lane.
__m128i sign(__m128 x)
{
    const __m128 origin = _mm_setzero_ps();

    // Reinterpret the float comparison masks as integer lanes.
    const __m128i gt_mask = _mm_castps_si128(_mm_cmpgt_ps(x, origin));
    const __m128i lt_mask = _mm_castps_si128(_mm_cmplt_ps(x, origin));

    // Positive lanes: mask & 1 == 1.
    const __m128i plus_one = _mm_and_si128(gt_mask, _mm_set1_epi32(1));

    // Negative lanes: the all-ones mask already is the integer -1.
    return _mm_or_si128(plus_one, lt_mask);
}
Run Code Online (Sandbox Code Playgroud)
如果可以sgn(-0.0f)生成 的输出-0.0f而不是+0.0f,那么与@Cory Nelson 的版本相比,您可以节省一两条指令。请参阅下面的版本,该版本也传播 NaN。
思路是:先根据 x != 0.0f 的比较结果选出 0.0f 或 1.0f,再把 x 的符号位复制到该结果中。
// return -0.0 for x=-0.0, otherwise the same as Cory's (except for NaN which neither handle well)
// Per-lane sign of four packed floats, built as (x != 0 ? 1.0f : 0.0f) with
// the sign bit of x copied on top.
// Consequences: -0.0f maps to -0.0f, +0.0f maps to +0.0f, and a NaN input
// compares as "not equal to zero", so it yields +1.0f or -1.0f according to
// its sign bit.
__m128 sgn_fast(__m128 x)
{
    // -0.0f has only the sign bit set, so it serves as a sign-bit mask.
    const __m128 sign_mask = _mm_set1_ps(-0.0f);
    const __m128 one = _mm_set1_ps(1.0f);

    // Compare against a freshly zeroed register rather than sign_mask:
    // -0.0 == 0.0 in IEEE floating point, so either constant gives the same
    // mask, and xor-zeroing starts a new dependency chain.
    const __m128 magnitude = _mm_and_ps(_mm_cmpneq_ps(x, _mm_setzero_ps()), one);

    const __m128 sign_bits = _mm_and_ps(x, sign_mask);
    return _mm_or_ps(magnitude, sign_bits);
}
Run Code Online (Sandbox Code Playgroud)
当输入为 NaN 时,根据 NaN 的符号,我认为它返回 +/-1.0f。(因为_mm_cmpneq_ps()当 x 为 NaN 时为 true:请参阅指令上的表格CMPPD)。
如果没有 AVX,这比 Cory 的版本少两条指令(在 Godbolt 编译器浏览器上使用 clang3.9 编译)。当内联到循环中时,内存源操作数可以变成寄存器源操作数。gcc 使用的指令更多:它先执行一次单独的 MOVAPS 加载,结果把自己逼入困境,还需要额外一条 MOVAPS 才能把返回值放入 xmm0。
# clang output for sgn_fast without AVX; x is in xmm0 and the result is left in xmm0.
# Zero xmm1 to serve as the 0.0f comparison operand.
xorps xmm1, xmm1
# xmm1 becomes the per-lane mask of x != 0.0f (the `nonzero` value in the C source).
cmpneqps xmm1, xmm0
# The two memory operands are compiler-generated constants; judging by the C
# source they are presumably the -0.0f and 1.0f vectors, in that order.
andps xmm0, xmmword ptr [rip + .LCPI0_0] # x_signbit
andps xmm1, xmmword ptr [rip + .LCPI0_1] # zeroone
# Merge the 0.0/1.0 magnitude with the sign bit.
orps xmm0, xmm1
Run Code Online (Sandbox Code Playgroud)
关键路径延迟为cmpneqps+ andps+ orps,例如在 Intel Haswell 上为 3+1+1 个周期。Cory 的版本需要cmpps并行运行两条指令才能实现该延迟,而这只有在 Skylake 上才可能实现。其他 CPU 会出现资源冲突,导致额外的延迟周期。
为了传播 NaN,使可能的输出为-1.0f、-/+0.0f、1.0f和NaN,我们可以利用全 1 位模式是 NaN 的事实。
先用 _mm_cmpunord_ps(x,x) 获得 NaN 掩码(或者等效地使用 cmpneqps),再把它 OR 到结果上:结果要么保持不变,要么被强制为 NaN。
// return -0.0 for x=-0.0. Return -NaN for any NaN
// Per-lane sign of four packed floats that propagates NaN.
// Outputs are -1.0f, -0.0f, +0.0f, 1.0f, or an all-ones bit pattern (a NaN
// with the sign bit set) for any NaN input. -0.0f maps to -0.0f.
__m128 sgn_fast_nanpropagating(__m128 x)
{
    // -0.0f has only the sign bit set, so it serves as a sign-bit mask.
    const __m128 sign_mask = _mm_set1_ps(-0.0f);
    const __m128 one = _mm_set1_ps(1.0f);

    // x is unordered with itself only when it is NaN: all-ones in NaN lanes.
    const __m128 is_nan = _mm_cmpunord_ps(x, x);

    // Fold the NaN mask into the sign term rather than into the final result,
    // so it is computed alongside the compare chain (better ILP).
    const __m128 sign_or_nan = _mm_or_ps(_mm_and_ps(x, sign_mask), is_nan);

    // 1.0f where x != 0, otherwise 0.0f.
    const __m128 magnitude = _mm_and_ps(_mm_cmpneq_ps(x, _mm_setzero_ps()), one);

    return _mm_or_ps(magnitude, sign_or_nan);
}
Run Code Online (Sandbox Code Playgroud)
这可以高效编译,并且几乎不会延长关键路径延迟。不过,在没有 AVX 的情况下复制寄存器确实需要更多 MOVAPS 指令。
您也许可以使用SSE4.1 BLENDVPS做一些有用的事情,但它并不是所有 CPU 上最有效的指令。也很难避免将负零视为非零。
如果您想要整数结果,可以使用 SSSE3 的 _mm_sign_epi32(set1(1), x) 获得 -1、0 或 1 的输出。如果觉得 -0.0f -> -1 太不严谨,可以把结果与 _mm_cmpneq_ps(x, _mm_setzero_ps()) 做 AND 运算来修正。
// Integer sign (-1, 0 or 1 per lane) of four packed floats using SSSE3.
// The float bits are treated as signed 32-bit integers, so any lane with the
// sign bit set is negative; in particular x = -0.0f returns -1.
__m128i sgn_verysloppy_int_ssse3(__m128 x) {
    // _mm_sign_epi32(a, b) negates, zeroes or keeps each lane of a depending
    // on whether the matching lane of b is negative, zero or positive.
    return _mm_sign_epi32(_mm_set1_epi32(1), _mm_castps_si128(x));
}
// Integer sign (-1, 0 or 1 per lane) of four packed floats using SSSE3.
// Both +0.0f and -0.0f return 0. A NaN returns -1 or 1 according to its
// sign bit, never 0.
__m128i sgn_int_ssse3(__m128 x) {
    const __m128i x_bits = _mm_castps_si128(x);

    // All-ones where x != 0.0f (which includes NaN), all-zeros for +/-0.0f.
    const __m128i keep = _mm_castps_si128(_mm_cmpneq_ps(x, _mm_setzero_ps()));

    // +1 / 0 / -1 taken from the integer sign of the float bits; on its own
    // this would report -1 for -0.0f.
    const __m128i signed_one = _mm_sign_epi32(_mm_set1_epi32(1), x_bits);

    // Clear the lanes where x compares equal to zero.
    return _mm_and_si128(signed_one, keep);
}
Run Code Online (Sandbox Code Playgroud)