pla*_*cel 19 c++ floating-point sse simd avx
SSE2具有在单精度浮点数和32位整数之间转换向量的指令.
_mm_cvtps_epi32()_mm_cvtepi32_ps()但是没有双精度和64位整数的等价物.换句话说,他们失踪了:
_mm_cvtpd_epi64()_mm_cvtepi64_pd()似乎AVX也没有它们.
模拟这些内在函数的最有效方法是什么?
Mys*_*ial 30
如果您愿意偷工减料,_mm512_cvtpd_epi64只需两个说明即可完成转换:
_mm256_cvtpd_epi64.double <-> int64,您只关心范围内的值NaN.double <-> int64_t,您只关心范围内的值[-2^51, 2^51].double - > uint64_t
// Only works for inputs in the range: [0, 2^52)
__m128i double_to_uint64(__m128d x){
x = _mm_add_pd(x, _mm_set1_pd(0x0010000000000000));
return _mm_xor_si128(
_mm_castpd_si128(x),
_mm_castpd_si128(_mm_set1_pd(0x0010000000000000))
);
}
Run Code Online (Sandbox Code Playgroud)
double - > int64_t
// Only works for inputs in the range: [-2^51, 2^51]
__m128i double_to_int64(__m128d x){
x = _mm_add_pd(x, _mm_set1_pd(0x0018000000000000));
return _mm_sub_epi64(
_mm_castpd_si128(x),
_mm_castpd_si128(_mm_set1_pd(0x0018000000000000))
);
}
Run Code Online (Sandbox Code Playgroud)
uint64_t - > double
// Only works for inputs in the range: [0, 2^52)
__m128d uint64_to_double(__m128i x){
x = _mm_or_si128(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)));
return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0010000000000000));
}
Run Code Online (Sandbox Code Playgroud)
int64_t - > double
// Only works for inputs in the range: [-2^51, 2^51]
__m128d int64_to_double(__m128i x){
x = _mm_add_epi64(x, _mm_castpd_si128(_mm_set1_pd(0x0018000000000000)));
return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0018000000000000));
}
Run Code Online (Sandbox Code Playgroud)
舍入行为:
double <-> uint64_t转换,舍入在当前舍入模式之后正常工作.(通常是圆形到偶数)[0, 2^52)转换,舍入将遵循除截断之外的所有模式的当前舍入模式.如果当前的舍入模式是截断(向零舍入),它实际上将向负无穷大舍入.它是如何工作的?
尽管这个技巧只有2个指令,但它并不完全是不言自明的.
关键是要认识到,对于双精度浮点,范围内的值double -> uint64_t具有"二进制位置",恰好位于尾数的最低位.换句话说,如果将指数和符号位清零,则尾数将精确地成为整数表示.
要转换double -> int64_t的[2^52, 2^53),你加一个神奇的数字x是的浮点值double -> uint64_t.这将M进入"标准化"范围2^52并且方便地舍去小数部分位.
现在剩下的就是删除高12位.通过屏蔽它可以轻松完成.最快的方法是识别那些高12位与那些高位相同x.因此,我们可以简单地减去或者XOR,而不是引入额外的掩码常数[2^52, 2^53).XOR具有更高的吞吐量.
转换M只是与此过程相反.你加回指数位M.然后通过减去uint64_t -> double浮点来对数字进行非标准化.
有符号整数转换稍微复杂一些,因为您需要处理2的补码符号扩展.我将这些作为练习留给读者.
wim*_*wim 15
这个答案是关于64位整数到双倍转换,没有偷工减料.我们假设整数输入和双输出都是256位宽的AVX寄存器.考虑两种方法:
low + high * 2^32:正如对问题的评论中所建议的,使用int64_to_double_full_range4次和一些数据改组.不幸的是,两者mul和数据混洗指令都需要执行端口5.这限制了这种方法的性能.
add:我们可以使用两次Mysticial的快速转换方法,以获得完整的64位整数范围的精确转换.64位整数分为32位低位和32位高位,与此问题的答案类似:如何使用SSE执行uint32/float转换?.这些作品中的每一个都适合Mysticial的整数到双重转换.最后,高部分乘以2 ^ 32并添加到低部分.签名转换比无符号转换(fma)更复杂,因为uint64_to_double_full_range不存在.
码:
#include <stdio.h>
#include <immintrin.h>
#include <stdint.h>
__m256d int64_to_double_fast_precise(const __m256i v)
/* Optimized full range int64_t to double conversion */
/* Emulate _mm256_cvtepi64_pd() */
{
__m256i magic_i_lo = _mm256_set1_epi64x(0x4330000000000000); /* 2^52 encoded as floating-point */
__m256i magic_i_hi32 = _mm256_set1_epi64x(0x4530000080000000); /* 2^84 + 2^63 encoded as floating-point */
__m256i magic_i_all = _mm256_set1_epi64x(0x4530000080100000); /* 2^84 + 2^63 + 2^52 encoded as floating-point */
__m256d magic_d_all = _mm256_castsi256_pd(magic_i_all);
__m256i v_lo = _mm256_blend_epi32(magic_i_lo, v, 0b01010101); /* Blend the 32 lowest significant bits of v with magic_int_lo */
__m256i v_hi = _mm256_srli_epi64(v, 32); /* Extract the 32 most significant bits of v */
v_hi = _mm256_xor_si256(v_hi, magic_i_hi32); /* Flip the msb of v_hi and blend with 0x45300000 */
__m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all); /* Compute in double precision: */
__m256d result = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo)); /* (v_hi - magic_d_all) + v_lo Do not assume associativity of floating point addition !! */
return result; /* With gcc use -O3, then -fno-associative-math is default. Do not use -Ofast, which enables -fassociative-math! */
/* With icc use -fp-model precise */
}
__m256d uint64_to_double_fast_precise(const __m256i v)
/* Optimized full range uint64_t to double conversion */
/* This code is essentially identical to Mysticial's solution. */
/* Emulate _mm256_cvtepu64_pd() */
{
__m256i magic_i_lo = _mm256_set1_epi64x(0x4330000000000000); /* 2^52 encoded as floating-point */
__m256i magic_i_hi32 = _mm256_set1_epi64x(0x4530000000000000); /* 2^84 encoded as floating-point */
__m256i magic_i_all = _mm256_set1_epi64x(0x4530000000100000); /* 2^84 + 2^52 encoded as floating-point */
__m256d magic_d_all = _mm256_castsi256_pd(magic_i_all);
__m256i v_lo = _mm256_blend_epi32(magic_i_lo, v, 0b01010101); /* Blend the 32 lowest significant bits of v with magic_int_lo */
__m256i v_hi = _mm256_srli_epi64(v, 32); /* Extract the 32 most significant bits of v */
v_hi = _mm256_xor_si256(v_hi, magic_i_hi32); /* Blend v_hi with 0x45300000 */
__m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all); /* Compute in double precision: */
__m256d result = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo)); /* (v_hi - magic_d_all) + v_lo Do not assume associativity of floating point addition !! */
return result; /* With gcc use -O3, then -fno-associative-math is default. Do not use -Ofast, which enables -fassociative-math! */
/* With icc use -fp-model precise */
}
int main(){
int i;
uint64_t j;
__m256i j_4;
__m256d v;
double x[4];
double x0, x1, a0, a1;
j = 0ull;
printf("\nAccurate int64_to_double\n");
for (i = 0; i < 260; i++){
j_4= _mm256_set_epi64x(0, 0, -j, j);
v = int64_to_double_fast_precise(j_4);
_mm256_storeu_pd(x,v);
x0 = x[0];
x1 = x[1];
a0 = _mm_cvtsd_f64(_mm_cvtsi64_sd(_mm_setzero_pd(),j));
a1 = _mm_cvtsd_f64(_mm_cvtsi64_sd(_mm_setzero_pd(),-j));
printf(" j =%21li v =%23.1f v=%23.1f -v=%23.1f -v=%23.1f d=%.1f d=%.1f\n", j, x0, a0, x1, a1, x0-a0, x1-a1);
j = j+(j>>2)-(j>>5)+1ull;
}
j = 0ull;
printf("\nAccurate uint64_to_double\n");
for (i = 0; i < 260; i++){
if (i==258){j=-1;}
if (i==259){j=-2;}
j_4= _mm256_set_epi64x(0, 0, -j, j);
v = uint64_to_double_fast_precise(j_4);
_mm256_storeu_pd(x,v);
x0 = x[0];
x1 = x[1];
a0 = (double)((uint64_t)j);
a1 = (double)((uint64_t)-j);
printf(" j =%21li v =%23.1f v=%23.1f -v=%23.1f -v=%23.1f d=%.1f d=%.1f\n", j, x0, a0, x1, a1, x0-a0, x1-a1);
j = j+(j>>2)-(j>>5)+1ull;
}
return 0;
}
Run Code Online (Sandbox Code Playgroud)
这些函数的实际性能可能取决于周围的代码和cpu生成.
在intel skylake i5 6500系统上,上述代码中的简单测试B,C,D和E的1e9转换(256位宽)的时序结果:
#include <stdio.h>
#include <immintrin.h>
#include <stdint.h>
/*
gcc -O3 -Wall -m64 -mfma -mavx2 -march=broadwell cvt_int_64_double.c
./a.out A
time ./a.out B
time ./a.out C
etc.
*/
inline __m256d uint64_to_double256(__m256i x){ /* Mysticial's fast uint64_to_double. Works for inputs in the range: [0, 2^52) */
x = _mm256_or_si256(x, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)));
return _mm256_sub_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0010000000000000));
}
inline __m256d int64_to_double256(__m256i x){ /* Mysticial's fast int64_to_double. Works for inputs in the range: (-2^51, 2^51) */
x = _mm256_add_epi64(x, _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)));
return _mm256_sub_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0018000000000000));
}
__m256d int64_to_double_full_range(const __m256i v)
{
__m256i msk_lo =_mm256_set1_epi64x(0xFFFFFFFF);
__m256d cnst2_32_dbl =_mm256_set1_pd(4294967296.0); /* 2^32 */
__m256i v_lo = _mm256_and_si256(v,msk_lo); /* extract the 32 lowest significant bits of v */
__m256i v_hi = _mm256_srli_epi64(v,32); /* 32 most significant bits of v. srai_epi64 doesn't exist */
__m256i v_sign = _mm256_srai_epi32(v,32); /* broadcast sign bit to the 32 most significant bits */
v_hi = _mm256_blend_epi32(v_hi,v_sign,0b10101010); /* restore the correct sign of v_hi */
__m256d v_lo_dbl = int64_to_double256(v_lo); /* v_lo is within specified range of int64_to_double */
__m256d v_hi_dbl = int64_to_double256(v_hi); /* v_hi is within specified range of int64_to_double */
v_hi_dbl = _mm256_mul_pd(cnst2_32_dbl,v_hi_dbl); /* _mm256_mul_pd and _mm256_add_pd may compile to a single fma instruction */
return _mm256_add_pd(v_hi_dbl,v_lo_dbl); /* rounding occurs if the integer doesn't exist as a double */
}
__m256d int64_to_double_based_on_cvtsi2sd(const __m256i v)
{ __m128d zero = _mm_setzero_pd(); /* to avoid uninitialized variables in_mm_cvtsi64_sd */
__m128i v_lo = _mm256_castsi256_si128(v);
__m128i v_hi = _mm256_extracti128_si256(v,1);
__m128d v_0 = _mm_cvtsi64_sd(zero,_mm_cvtsi128_si64(v_lo));
__m128d v_2 = _mm_cvtsi64_sd(zero,_mm_cvtsi128_si64(v_hi));
__m128d v_1 = _mm_cvtsi64_sd(zero,_mm_extract_epi64(v_lo,1));
__m128d v_3 = _mm_cvtsi64_sd(zero,_mm_extract_epi64(v_hi,1));
__m128d v_01 = _mm_unpacklo_pd(v_0,v_1);
__m128d v_23 = _mm_unpacklo_pd(v_2,v_3);
__m256d v_dbl = _mm256_castpd128_pd256(v_01);
v_dbl = _mm256_insertf128_pd(v_dbl,v_23,1);
return v_dbl;
}
__m256d uint64_to_double_full_range(const __m256i v)
{
__m256i msk_lo =_mm256_set1_epi64x(0xFFFFFFFF);
__m256d cnst2_32_dbl =_mm256_set1_pd(4294967296.0); /* 2^32 */
__m256i v_lo = _mm256_and_si256(v,msk_lo); /* extract the 32 lowest significant bits of v */
__m256i v_hi = _mm256_srli_epi64(v,32); /* 32 most significant bits of v */
__m256d v_lo_dbl = uint64_to_double256(v_lo); /* v_lo is within specified range of uint64_to_double */
__m256d v_hi_dbl = uint64_to_double256(v_hi); /* v_hi is within specified range of uint64_to_double */
v_hi_dbl = _mm256_mul_pd(cnst2_32_dbl,v_hi_dbl);
return _mm256_add_pd(v_hi_dbl,v_lo_dbl); /* rounding may occur for inputs >2^52 */
}
int main(int argc, char **argv){
int i;
uint64_t j;
__m256i j_4, j_inc;
__m256d v, v_acc;
double x[4];
char test = argv[1][0];
if (test=='A'){ /* test the conversions for several integer values */
j = 1ull;
printf("\nint64_to_double_full_range\n");
for (i = 0; i<30; i++){
j_4= _mm256_set_epi64x(j-3,j+3,-j,j);
v = int64_to_double_full_range(j_4);
_mm256_storeu_pd(x,v);
printf("j =%21li v =%23.1f -v=%23.1f v+3=%23.1f v-3=%23.1f \n",j,x[0],x[1],x[2],x[3]);
j = j*7ull;
}
j = 1ull;
printf("\nint64_to_double_based_on_cvtsi2sd\n");
for (i = 0; i<30; i++){
j_4= _mm256_set_epi64x(j-3,j+3,-j,j);
v = int64_to_double_based_on_cvtsi2sd(j_4);
_mm256_storeu_pd(x,v);
printf("j =%21li v =%23.1f -v=%23.1f v+3=%23.1f v-3=%23.1f \n",j,x[0],x[1],x[2],x[3]);
j = j*7ull;
}
j = 1ull;
printf("\nuint64_to_double_full_range\n");
for (i = 0; i<30; i++){
j_4= _mm256_set_epi64x(j-3,j+3,j,j);
v = uint64_to_double_full_range(j_4);
_mm256_storeu_pd(x,v);
printf("j =%21lu v =%23.1f v+3=%23.1f v-3=%23.1f \n",j,x[0],x[2],x[3]);
j = j*7ull;
}
}
else{
j_4 = _mm256_set_epi64x(-123,-4004,-312313,-23412731);
j_inc = _mm256_set_epi64x(1,1,1,1);
v_acc = _mm256_setzero_pd();
switch(test){
case 'B' :{
printf("\nLatency int64_to_double_cvtsi2sd()\n"); /* simple test to get a rough idea of the latency of int64_to_double_cvtsi2sd() */
for (i = 0; i<1000000000; i++){
v =int64_to_double_based_on_cvtsi2sd(j_4);
j_4= _mm256_castpd_si256(v); /* cast without conversion, use output as an input in the next step */
}
_mm256_storeu_pd(x,v);
}
break;
case 'C' :{
printf("\nLatency int64_to_double_full_range()\n"); /* simple test to get a rough idea of the latency of int64_to_double_full_range() */
for (i = 0; i<1000000000; i++){
v = int64_to_double_full_range(j_4);
j_4= _mm256_castpd_si256(v);
}
_mm256_storeu_pd(x,v);
}
break;
case 'D' :{
printf("\nThroughput int64_to_double_cvtsi2sd()\n"); /* simple test to get a rough idea of the throughput of int64_to_double_cvtsi2sd() */
for (i = 0; i<1000000000; i++){
j_4 = _mm256_add_epi64(j_4,j_inc); /* each step a different input */
v = int64_to_double_based_on_cvtsi2sd(j_4);
v_acc = _mm256_xor_pd(v,v_acc); /* use somehow the results */
}
_mm256_storeu_pd(x,v_acc);
}
break;
case 'E' :{
printf("\nThroughput int64_to_double_full_range()\n"); /* simple test to get a rough idea of the throughput of int64_to_double_full_range() */
for (i = 0; i<1000000000; i++){
j_4 = _mm256_add_epi64(j_4,j_inc);
v = int64_to_double_full_range(j_4);
v_acc = _mm256_xor_pd(v,v_acc);
}
_mm256_storeu_pd(x,v_acc);
}
break;
default : {}
}
printf("v =%23.1f -v =%23.1f v =%23.1f -v =%23.1f \n",x[0],x[1],x[2],x[3]);
}
return 0;
}
Run Code Online (Sandbox Code Playgroud)
mul和之间的吞吐量差异add大于我的预期.
| 归档时间: |
|
| 查看次数: |
2850 次 |
| 最近记录: |