如何对SSE XMM,AVX YMM和ZMM寄存器中的所有32位或64位子寄存器求和？

Question

如何对SSE XMM,AVX YMM和ZMM寄存器中的所有32位或64位子寄存器求和？

假设您的任务在每个浮点子寄存器中都产生了一个小计.我没有找到把这些小计归并为一个浮点总和的指令.我是否需要先将MM寄存器存储到普通内存中,然后用简单的指令进行求和？

(使用双精度还是单精度尚未确定.如果能找到相应的操作码,我计划为每种CPU变体编写代码,一直到即将推出的(？)512位AVX版本.)

Answer 1

# Fetch the vectorclass archive from agner.org, unpack it into ./vectorclass,
# and change into that directory.
wget http://www.agner.org/optimize/vectorclass.zip
unzip vectorclass.zip -d vectorclass
cd vectorclass/

Run Code Online (Sandbox Code Playgroud)

这段代码采用GPLv3许可.

SSE

# Print every line of vectorf128.h that mentions horizontal_add, plus the 11 lines after each match.
grep -A11 horizontal_add vectorf128.h

Run Code Online (Sandbox Code Playgroud)

// Horizontal add: returns the sum of the four float elements of a.
static inline float horizontal_add (Vec4f const & a) {
#if  INSTRSET >= 3  // SSE3
    // Two rounds of pairwise horizontal adds: after the second one every
    // element holds a0+a1+a2+a3; the lowest element is returned.
    __m128 t1 = _mm_hadd_ps(a,a);
    __m128 t2 = _mm_hadd_ps(t1,t1);
    return _mm_cvtss_f32(t2);        
#else
    // Without SSE3: copy the upper two elements onto the lower two and add,
    // giving (a0+a2, a1+a3) in the low half.
    __m128 t1 = _mm_movehl_ps(a,a);
    __m128 t2 = _mm_add_ps(a,t1);
    // Bring element 1 down to element 0 and add the two partial sums.
    __m128 t3 = _mm_shuffle_ps(t2,t2,1);
    __m128 t4 = _mm_add_ss(t2,t3);
    return _mm_cvtss_f32(t4);
#endif
// NOTE: this listing comes from `grep -A11`, so the function's closing brace is cut off.
--
// Horizontal add: returns the sum of the two double elements of a.
static inline double horizontal_add (Vec2d const & a) {
#if  INSTRSET >= 3  // SSE3
    // One pairwise horizontal add leaves a0+a1 in the low element.
    __m128d const pair_sum = _mm_hadd_pd(a, a);
    return _mm_cvtsd_f64(pair_sum);
#else
    // Without SSE3: reinterpret as floats so movehl can copy the upper
    // 64 bits into the lower 64 bits, then add the two low doubles.
    __m128  const as_floats = _mm_castpd_ps(a);
    __m128d const upper     = _mm_castps_pd(_mm_movehl_ps(as_floats, as_floats));
    return _mm_cvtsd_f64(_mm_add_sd(a, upper));
#endif
}

Run Code Online (Sandbox Code Playgroud)

AVX

# Print every line of vectorf256.h that mentions horizontal_add, plus the 6 lines after each match.
grep -A6 horizontal_add vectorf256.h

Run Code Online (Sandbox Code Playgroud)

// Horizontal add: returns the sum of the eight float elements of a.
static inline float horizontal_add (Vec8f const & a) {
    // 256-bit hadd works inside each 128-bit lane, so two rounds leave
    // every element of a lane holding that lane's four-element sum.
    __m256 const pair_sums = _mm256_hadd_ps(a, a);
    __m256 const lane_sums = _mm256_hadd_ps(pair_sums, pair_sums);
    // Combine the low-lane and high-lane sums in the lowest element.
    __m128 const high_lane = _mm256_extractf128_ps(lane_sums, 1);
    __m128 const low_lane  = _mm256_castps256_ps128(lane_sums);
    return _mm_cvtss_f32(_mm_add_ss(low_lane, high_lane));
}
--
// Horizontal add: returns the sum of the four double elements of a.
static inline double horizontal_add (Vec4d const & a) {
    // One in-lane hadd gives a0+a1 in the low lane and a2+a3 in the high lane.
    __m256d const lane_sums = _mm256_hadd_pd(a, a);
    // Add the two lane sums in the lowest element and return it.
    __m128d const high_lane = _mm256_extractf128_pd(lane_sums, 1);
    __m128d const low_lane  = _mm256_castpd256_pd128(lane_sums);
    return _mm_cvtsd_f64(_mm_add_sd(low_lane, high_lane));
}

Run Code Online (Sandbox Code Playgroud)

AVX512

# Print every line of vectorf512.h that mentions horizontal_add, plus the 3 lines after each match.
grep -A3 horizontal_add vectorf512.h

Run Code Online (Sandbox Code Playgroud)

// Horizontal add: returns the sum of the elements of the 512-bit float vector a.
static inline float horizontal_add (Vec16f const & a) {
#if !defined(__INTEL_COMPILER)
    // Combine the low and high halves with operator+, then reduce that
    // result with a further horizontal_add call.
    return horizontal_add(a.get_low() + a.get_high());
#else
    // With the Intel compiler, use its reduction intrinsic directly.
    return _mm512_reduce_add_ps(a);
#endif
}

--
// Horizontal add: returns the sum of the elements of the 512-bit double vector a.
static inline double horizontal_add (Vec8d const & a) {
#if !defined(__INTEL_COMPILER)
    // Combine the low and high halves with operator+, then reduce that
    // result with a further horizontal_add call.
    return horizontal_add(a.get_low() + a.get_high());
#else
    // With the Intel compiler, use its reduction intrinsic directly.
    return _mm512_reduce_add_pd(a);
#endif
}

Run Code Online (Sandbox Code Playgroud)

get_high() 和 get_low()

// Returns the upper 256 bits of zmm as a Vec8f.
Vec8f get_high() const {
    // The extract is done in the double domain (f64x4), so cast to pd
    // first and cast the extracted half back to ps afterwards.
    __m512d const as_pd      = _mm512_castps_pd(zmm);
    __m256d const upper_half = _mm512_extractf64x4_pd(as_pd, 1);
    return _mm256_castpd_ps(upper_half);
}
// Returns the lower 256 bits of zmm as a Vec8f.
Vec8f get_low() const {
    __m256 const lower_half = _mm512_castps512_ps256(zmm);
    return lower_half;
}

// Returns the lower 256 bits of zmm as a Vec4d.
Vec4d get_low() const {
    __m256d const lower_half = _mm512_castpd512_pd256(zmm);
    return lower_half;
}

// Returns the upper 256 bits of zmm as a Vec4d.
Vec4d get_high() const {
    __m256d const upper_half = _mm512_extractf64x4_pd(zmm, 1);
    return upper_half;
}

Run Code Online (Sandbox Code Playgroud)

对于整数,请在vectori128.h,vectori256.h和vectori512.h中查找horizontal_add.

您也可以直接使用Vector Class Library(VCL)

#include <stdio.h>
#define MAX_VECTOR_SIZE 512
#include "vectorclass.h"

// Demo: prints horizontal_add for 4-, 8- and 16-element float vectors
// next to the closed-form value n*(n+1)/2.
int main(void) {

  // x holds 1, 2, ..., 16.
  float x[16];
  for (int k = 0; k < 16; ++k) {
    x[k] = k + 1;
  }

  Vec4f  v4  =  Vec4f().load(x);
  Vec8f  v8  =  Vec8f().load(x);
  Vec16f v16 = Vec16f().load(x);

  const int expected4  = 4*5/2;
  const int expected8  = 8*9/2;
  const int expected16 = 16*17/2;

  printf("%f %d\n", horizontal_add(v4), expected4);
  printf("%f %d\n", horizontal_add(v8), expected8);
  printf("%f %d\n", horizontal_add(v16), expected16);
}

Run Code Online (Sandbox Code Playgroud)

像这样编译(我的KNL上的GCC版本太旧,不支持AVX512)

SSE2:     g++  -O3 test.cpp
AVX:      g++  -O3 -mavx test.cpp
AVX512ER: icpc -O3 -xMIC-AVX512 test.cpp

Run Code Online (Sandbox Code Playgroud)

输出

10.000000 10
36.000000 36
136.000000 136

Run Code Online (Sandbox Code Playgroud)

VCL库的一个好处是,如果你在只有SSE2的系统上使用例如Vec8f,它会用两次SSE操作来模拟AVX.

有关如何使用MSVC,ICC,Clang和GCC编译不同指令集的信息,请参阅vectorclass.pdf手册中的"指令集和CPU调度"部分.

归档时间：	8 年，8 月前
查看次数：	1011 次
最近记录：	8 年，8 月前