jaw*_*awa 5 c++ sse visual-studio-2010 visual-studio-2012
我正在使用SSE内在函数计算数组的均值和方差.基本上,这是值及其平方的总和,可以在以下程序中说明:
int main( int argc, const char* argv[] )
{
union u
{
__m128 m;
float f[4];
} x;
// Allocate memory and initialize data: [1,2,3,...stSize+1]
const size_t stSize = 1024;
float *pData = (float*) _aligned_malloc(stSize*sizeof(float), 32);
for ( size_t s = 0; s < stSize; ++s ) {
pData[s] = s+1;
}
// Sum and sum of squares
{
// Accumlation using SSE intrinsics
__m128 mEX = _mm_set_ps1(0.f);
__m128 mEXX = _mm_set_ps1(0.f);
for ( size_t s = 0; s < stSize; s+=4 )
{
__m128 m = _mm_load_ps(pData + s);
mEX = _mm_add_ps(mEX, m);
mEXX = _mm_add_ps(mEXX, _mm_mul_ps(m,m));
}
// Final reduction
x.m = mEX;
double dEX = x.f[0] + x.f[1] + x.f[2] + x.f[3];
x.m = mEXX;
double dEXX = x.f[0] + x.f[1] + x.f[2] + x.f[3];
std::cout << "Sum expected: " << (stSize * stSize + stSize) / 2 << std::endl;
std::cout << "EX: " << dEX << std::endl;
std::cout << "Sum of squares expected: " << 1.0/6.0 * stSize * (stSize + 1) * (2 * stSize + 1) << std::endl;
std::cout << "EXX: " << dEXX << std::endl;
}
// Clean up
_aligned_free(pData);
}
Run Code Online (Sandbox Code Playgroud)
现在,当我在调试模式下编译并运行程序时,我得到以下(和正确的)输出:
Sum expected: 524800
EX: 524800
Sum of squares expected: 3.58438e+008
EXX: 3.58438e+008
Run Code Online (Sandbox Code Playgroud)
但是,在发布模式下编译和运行程序会产生以下(和错误)结果:
Sum expected: 524800
EX: 524800
Sum of squares expected: 3.58438e+008
EXX: 3.49272e+012
Run Code Online (Sandbox Code Playgroud)
更改累积顺序,即EXX在EX之前更新,结果正常:
Sum expected: 524800
EX: 524800
Sum of squares expected: 3.58438e+008
EXX: 3.58438e+008
Run Code Online (Sandbox Code Playgroud)
看起来像'适得其反的'编译器优化或为什么执行顺序相关?这是一个已知的错误?
编辑:
我只是看了汇编输出.这是我得到的(只有相关部分).对于使用/arch:AVX编译器标志的发布版本,我们有:
; 69 : // Second test: sum and sum of squares
; 70 : {
; 71 : __m128 mEX = _mm_set_ps1(0.f);
vmovaps xmm1, XMMWORD PTR __xmm@0
mov ecx, 256 ; 00000100H
; 72 : __m128 mEXX = _mm_set_ps1(0.f);
vmovaps xmm2, xmm1
npad 12
$LL3@main:
; 73 : for ( size_t s = 0; s < stSize; s+=4 )
; 74 : {
; 75 : __m128 m = _mm_load_ps(pData + s);
vmovaps xmm0, xmm1
; 76 : mEX = _mm_add_ps(mEX, m);
vaddps xmm1, xmm1, XMMWORD PTR [rax]
add rax, 16
; 77 : mEXX = _mm_add_ps(mEXX, _mm_mul_ps(m,m));
vmulps xmm0, xmm0, xmm0
vaddps xmm2, xmm0, xmm2
dec rcx
jne SHORT $LL3@main
Run Code Online (Sandbox Code Playgroud)
这显然是错误的,因为这(1)保存累积的EX结果(xmm1)在xmm0(2)中累加EX与当前值(XMMWORD PTR [rax])和(3)累积在EXX(xmm2)中累积的EX结果的平方先前保存xmm0.
相比之下,没有/arch:AVX看起来很好的版本和预期:
; 69 : // Second test: sum and sum of squares
; 70 : {
; 71 : __m128 mEX = _mm_set_ps1(0.f);
movaps xmm1, XMMWORD PTR __xmm@0
mov ecx, 256 ; 00000100H
; 72 : __m128 mEXX = _mm_set_ps1(0.f);
movaps xmm2, xmm1
npad 10
$LL3@main:
; 73 : for ( size_t s = 0; s < stSize; s+=4 )
; 74 : {
; 75 : __m128 m = _mm_load_ps(pData + s);
movaps xmm0, XMMWORD PTR [rax]
add rax, 16
dec rcx
; 76 : mEX = _mm_add_ps(mEX, m);
addps xmm1, xmm0
; 77 : mEXX = _mm_add_ps(mEXX, _mm_mul_ps(m,m));
mulps xmm0, xmm0
addps xmm2, xmm0
jne SHORT $LL3@main
Run Code Online (Sandbox Code Playgroud)
这真的看起来像个bug.任何人都可以用不同的编译器版本来确认或驳斥这个问题吗?(我目前无权更新编译器)
我建议使用相应的 SSE 指令,而不是手动执行水平加法_mm_hadd_ps
// Final reduction
__m128 sum1 = _mm_hadd_ps(mEX, mEXX);
// == {EX[0]+EX[1], EX[2]+EX[3], EXX[0]+EXX[1], EXX[2]+EXX[3]}
// final sum and conversion to double:
__m128d sum2 = _mm_cvtps_pd(_mm_hadd_ps(sum1, sum1));
// result vector:
double dEX_EXX[2]; // (I don't know MSVC syntax for stack aligned arrays)
// store register to stack: (should be _mm_store_pd, if the array is aligned)
_mm_storeu_pd(dEX_EXX, sum2);
std::cout << "EX: " << dEX_EXX[0] << "\nEXX: " << dEX_EXX[1] << std::endl;
Run Code Online (Sandbox Code Playgroud)
| 归档时间: |
|
| 查看次数: |
235 次 |
| 最近记录: |