用sse累积整数向量

Question

用sse累积整数向量

我试图改变这个代码来处理std::vector<int>.

float accumulate(const std::vector<float>& v)
{
 // Cache the element count and a raw pointer to the vector's storage
 // (NULL for an empty vector, in which case neither loop below runs).
 const size_t count = v.size();
 const float* data = v.empty() ? NULL : &v.front();

 // Upper bound of the 4-wide part. ROUND_DOWN is defined outside this
 // function; it is expected to round count down to a multiple of 4.
 const size_t vecEnd = ROUND_DOWN(count, 4);

 __m128 acc = _mm_setzero_ps();
 size_t pos = 0;

 // Main part: unaligned load of four floats, added lane by lane.
 while (pos < vecEnd)
 {
  acc = _mm_add_ps(acc, _mm_loadu_ps(data + pos));
  pos += 4;
 }

 // Leftover elements: each one is added into the lowest lane only.
 while (pos < count)
 {
  acc = _mm_add_ss(acc, _mm_load_ss(data + pos));
  ++pos;
 }

 // Two horizontal adds fold the four lanes into one value, which is
 // then read back from the lowest lane.
 const __m128 pairs = _mm_hadd_ps(acc, acc);
 const __m128 total = _mm_hadd_ps(pairs, pairs);
 return _mm_cvtss_f32(total);
}

Run Code Online (Sandbox Code Playgroud)

参考:http://fastcpp.blogspot.com.au/2011/04/how-to-process-stl-vector-using-sse.html

我把_mm_setzero_ps换成了_mm_setzero_si128，把_mm_loadu_ps换成了_mm_loadl_epi64，把_mm_add_ps换成了_mm_add_epi64。

我收到此错误:

error: cannot convert ‘const int*’ to ‘const __m128i* {aka const __vector(2) long long int*}’ for argument ‘1’ to ‘__m128i _mm_loadl_epi64(const __m128i*)’
         mmSum = _mm_add_epi64(mmSum, _mm_loadl_epi64(p + i + 0));

Run Code Online (Sandbox Code Playgroud)

我是这个领域的新手.学习这些东西有什么好的资源吗？

Answer 1

Pau*_*l R 6

这是我刚刚写好的int版本：

#include <iostream>
#include <vector>

#include <smmintrin.h>  // SSE4

#define ROUND_DOWN(m, n) ((m) & ~((n) - 1))

// Returns the sum of all elements of v (0 for an empty vector).
//
// The bulk of the data is added four ints at a time with SSE2; the at most
// three leftover elements are added with scalar code. Only SSE2 intrinsics
// are used, so the function does not require SSSE3/SSE4.1 support.
// Overflow inside the SIMD lanes wraps around modulo 2^32.
static int accumulate(const std::vector<int>& v)
{
    // Nothing to add; this also avoids taking the address of an element
    // of an empty vector.
    if (v.empty())
    {
        return 0;
    }

    const size_t N = v.size();
    const int* p = &v[0];

    // Number of elements covered by whole 4-int vectors (N rounded down to
    // a multiple of 4).
    const size_t vecEnd = N & ~static_cast<size_t>(3);

    __m128i mmSum = _mm_setzero_si128();
    size_t i = 0;

    // main loop: adds up 4 elements at a time (unaligned loads, so no
    // alignment requirement on the vector's storage)
    for (; i < vecEnd; i += 4)
    {
        mmSum = _mm_add_epi32(
            mmSum, _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + i)));
    }

    // Horizontal sum of the four lanes using shuffles and adds:
    // first add the upper 64 bits onto the lower 64 bits, then add
    // lane 1 onto lane 0. Lane 0 then holds the total of all four lanes.
    mmSum = _mm_add_epi32(mmSum,
                          _mm_shuffle_epi32(mmSum, _MM_SHUFFLE(1, 0, 3, 2)));
    mmSum = _mm_add_epi32(mmSum,
                          _mm_shuffle_epi32(mmSum, _MM_SHUFFLE(2, 3, 0, 1)));
    int sum = _mm_cvtsi128_si32(mmSum);

    // add up single values until all elements are covered
    for (; i < N; i++)
    {
        sum += p[i];
    }

    return sum;
}

// Small driver: sums the integers 0..9 with accumulate() and prints the
// result on its own line.
int main()
{
    std::vector<int> values;

    // Append 0, 1, ..., 9 in order.
    int next = 0;
    while (next < 10)
    {
        values.push_back(next);
        ++next;
    }

    const int total = accumulate(values);
    std::cout << total << std::endl;

    return 0;
}

Run Code Online (Sandbox Code Playgroud)

编译并运行：

$ g++ -Wall -msse4 -O3 accumulate.cpp && ./a.out 
45

Run Code Online (Sandbox Code Playgroud)

Answer 2

Z b*_*son 5

进行此操作的理想方法是让编译器自动对代码进行矢量化处理，并保持代码简单易读。除了下面这段代码，你不应该再需要别的：

// Plain scalar reduction over v. The index is size_t so that it has the
// same (unsigned) type as v.size(): no signed/unsigned comparison, and no
// index overflow for very large vectors.
int sum = 0;
for(size_t i=0; i<v.size(); i++) sum += v[i];

Run Code Online (Sandbox Code Playgroud)

您所指向的链接http://fastcpp.blogspot.com.au/2011/04/how-to-process-stl-vector-using-sse.html的作者似乎并不了解如何让编译器对代码进行向量化。

对于该链接所使用的浮点数，您需要知道的是：浮点运算不满足结合律，因此结果取决于执行归约的顺序。除非您让GCC、MSVC和Clang使用不同的浮点模型，否则它们不会对归约进行自动向量化，因为那样结果可能取决于您的硬件。但是，ICC默认使用满足结合律的浮点运算，因此它在使用例如-O3时就会对代码进行向量化。

除非允许满足结合律的浮点运算，否则GCC、MSVC和Clang不仅不会向量化，而且也不会为了使用部分和而展开循环，因而无法掩盖加法的延迟。在这种情况下，只有Clang和ICC会展开为部分和：Clang展开四次，ICC展开两次。

在GCC中启用满足结合律的浮点运算的一种方法是使用-Ofast标志；在MSVC中则使用/fp:fast。

我使用GCC 4.9.2，XeonE5-1620（IVB）@ 3.60GHz，Ubuntu 15.04测试了以下代码。

-O3 -mavx -fopenmp                       0.93 s
-Ofast -mavx -fopenmp                    0.19 s
-Ofast -mavx -fopenmp -funroll-loops     0.19 s

Run Code Online (Sandbox Code Playgroud)

这大约是五倍的加速。不过，GCC虽然把循环展开了八次，却没有进行独立的部分求和（请参见下面的汇编），这就是展开版本并没有更快的原因。

我仅将OpenMP用于其方便的跨平台/编译器计时功能：omp_get_wtime()。

自动向量化的另一个优点是，只需启用编译器开关-mavx，同一份代码就能用上AVX。否则，如果您想使用AVX，就必须用AVX内在函数重写代码，还可能不得不在SO上再问一个有关如何做到这一点的问题。

因此，当前唯一会自动向量化循环并展开为四个部分和的编译器是Clang。请参阅此答案末尾的代码和汇编。

这是我用来测试性能的代码

#include <stdio.h>
#include <omp.h>
#include <vector>

// Adds the first n floats of x in index order, starting from 0, and
// returns the total. Returns 0 when n is zero or negative.
float sumf(float *x, int n)
{
  float total = 0;
  int idx = 0;
  while(idx < n) {
    total += x[idx];
    ++idx;
  }
  return total;
}

#define N 10000 // the link used this value
int main(void)
{
  std::vector<float> x;
  for(int i=0; i<N; i++) x.push_back(1 -2*(i%2==0));
  //float x[N]; for(int i=0; i<N; i++) x[i] = 1 -2*(i%2==0);                                                                                                                                                        
  float sum = 0;
  sum += sumf(x.data(),N);
  double dtime = -omp_get_wtime();
  for(int r=0; r<100000; r++) {
    sum += sumf(x.data(),N);
  }
  dtime +=omp_get_wtime();
  printf("sum %f time %f\n", sum, dtime);
}

Run Code Online (Sandbox Code Playgroud)

编辑：

我应该听取自己的建议，看一下汇编。

-O3的主循环。很明显，它只执行标量求和。

# Scalar reduction loop: exactly one float is added per iteration.
.L3:
    vaddss  (%rdi), %xmm0, %xmm0    # xmm0 (low lane) += the single float at (%rdi)
    addq    $4, %rdi                # advance the pointer by 4 bytes (one float)
    cmpq    %rax, %rdi              # compare the pointer with the value in %rax
    jne .L3                         # keep looping until the two are equal

Run Code Online (Sandbox Code Playgroud)

-Ofast的主循环。它执行向量求和，但没有展开。

# Vectorized reduction loop: one 256-bit add (8 floats) per iteration,
# all into the single accumulator ymm1.
.L8:
    addl    $1, %eax                # iteration counter += 1
    vaddps  (%r8), %ymm1, %ymm1     # ymm1 += 8 packed floats loaded from (%r8)
    addq    $32, %r8                # advance the pointer by 32 bytes (one ymm vector)
    cmpl    %eax, %ecx              # compare the counter with the value in %ecx
    ja  .L8                         # loop while %ecx > %eax (unsigned)

Run Code Online (Sandbox Code Playgroud)

-Ofast -funroll-loops的主循环。向量求和，展开了8次。

# Vectorized loop unrolled 8x: eight 256-bit adds per iteration. Every add
# takes the result of the previous one as an input (ymm1 -> ymm2 -> ... ->
# ymm8 -> ymm1), so the adds form one dependent chain rather than
# independent partial sums.
.L8:
    vaddps  (%rax), %ymm1, %ymm2    # ymm2 = ymm1 + 8 floats at (%rax)
    addl    $8, %ebx                # counter += 8 (vectors handled per iteration)
    addq    $256, %rax              # advance the pointer by 256 bytes (8 x 32)
    vaddps  -224(%rax), %ymm2, %ymm3    # offsets are negative because %rax was already advanced
    vaddps  -192(%rax), %ymm3, %ymm4
    vaddps  -160(%rax), %ymm4, %ymm5
    vaddps  -128(%rax), %ymm5, %ymm6
    vaddps  -96(%rax), %ymm6, %ymm7
    vaddps  -64(%rax), %ymm7, %ymm8
    vaddps  -32(%rax), %ymm8, %ymm1     # chain ends back in ymm1 for the next iteration
    cmpl    %ebx, %r9d              # compare the counter with the value in %r9d
    ja  .L8                         # loop while %r9d > %ebx (unsigned)

Run Code Online (Sandbox Code Playgroud)

编辑：

将以下代码放入Clang 3.7（-O3 -fverbose-asm -mavx）

// Sums the first 2048 ints of x and returns the total converted to float.
// The caller must pass a buffer that really is 64-byte aligned, because
// that is what the function promises to the optimizer.
float sumi(int *x)
{
  // Alignment hint for the optimizer; the pointer value itself is unchanged.
  int *data = (int*)__builtin_assume_aligned(x, 64);
  int total = 0;
  int k = 0;
  while(k < 2048) {
    total += data[k];
    ++k;
  }
  return total;  // implicit int -> float conversion
}

Run Code Online (Sandbox Code Playgroud)

产生以下汇编。请注意，它已向量化为四个独立的部分和。

# Clang output for sumi (Intel syntax). Four accumulators (xmm0..xmm3) are
# kept in parallel; each holds four 32-bit partial sums.
sumi(int*):                              # @sumi(int*)
    vpxor   xmm0, xmm0, xmm0        # zero accumulator 0
    xor eax, eax                    # element index = 0
    vpxor   xmm1, xmm1, xmm1        # zero accumulator 1
    vpxor   xmm2, xmm2, xmm2        # zero accumulator 2
    vpxor   xmm3, xmm3, xmm3        # zero accumulator 3
.LBB0_1:                                # %vector.body
    # Eight 128-bit integer adds per iteration (32 ints); each of the four
    # accumulators is used twice, so consecutive adds are independent.
    vpaddd  xmm0, xmm0, xmmword ptr [rdi + 4*rax]
    vpaddd  xmm1, xmm1, xmmword ptr [rdi + 4*rax + 16]
    vpaddd  xmm2, xmm2, xmmword ptr [rdi + 4*rax + 32]
    vpaddd  xmm3, xmm3, xmmword ptr [rdi + 4*rax + 48]
    vpaddd  xmm0, xmm0, xmmword ptr [rdi + 4*rax + 64]
    vpaddd  xmm1, xmm1, xmmword ptr [rdi + 4*rax + 80]
    vpaddd  xmm2, xmm2, xmmword ptr [rdi + 4*rax + 96]
    vpaddd  xmm3, xmm3, xmmword ptr [rdi + 4*rax + 112]
    add rax, 32                     # index += 32 elements
    cmp rax, 2048                   # all 2048 elements processed?
    jne .LBB0_1
    # Combine the four accumulators into xmm0.
    vpaddd  xmm0, xmm1, xmm0
    vpaddd  xmm0, xmm2, xmm0
    vpaddd  xmm0, xmm3, xmm0
    # Horizontal reduction of the four lanes of xmm0.
    vpshufd xmm1, xmm0, 78          # xmm1 = xmm0[2,3,0,1]
    vpaddd  xmm0, xmm0, xmm1        # add the swapped halves
    vphaddd xmm0, xmm0, xmm0        # add adjacent lanes; lane 0 now holds the total
    vmovd   eax, xmm0               # move the integer total into eax
    vxorps  xmm0, xmm0, xmm0        # clear xmm0 before the conversion writes it
    vcvtsi2ss   xmm0, xmm0, eax     # convert the int total to float (the return value)
    ret

Run Code Online (Sandbox Code Playgroud)

归档时间：	10 年，5 月前
查看次数：	2493 次
最近记录：	10 年，5 月前