jca*_*zac 8 assembly gcc arm neon
使用arm-elf-gcc-4.5 -O3 -march = armv7-a -mthumb -mfpu = neon -mfloat-abi = softfp编译该程序时:
#include <arm_neon.h>
extern float32x4_t cross(const float32x4_t& v1, const float32x4_t& v2) {
float32x4x2_t
xxyyzz1(vzipq_f32(v1, v1)),
xxyyzz2(vzipq_f32(v2, v2));
float32x2_t
xx1(vget_low_f32(xxyyzz1.val[0])),
yy1(vget_high_f32(xxyyzz1.val[0])),
zz1(vget_low_f32(xxyyzz1.val[1])),
xx2(vget_low_f32(xxyyzz2.val[0])),
yy2(vget_high_f32(xxyyzz2.val[0])),
zz2(vget_low_f32(xxyyzz2.val[1]));
float32x2_t
x(vmls_f32(vmul_f32(yy1, zz2), zz1, yy2)),
y(vmls_f32(vmul_f32(zz1, xx2), xx1, zz2)),
z(vmls_f32(vmul_f32(xx1, yy2), yy1, xx2));
return vcombine_f32(vuzp_f32(x, y).val[0], z);
}
Run Code Online (Sandbox Code Playgroud)
......这就是我得到的.注意两个用@ <<<标记的无用指令
_Z5crossRK19__simd128_float32_tS1_:
vldmia r0, {d16-d17}
vldmia r1, {d22-d23}
vmov q10, q8 @ v4sf
vmov q9, q11 @ v4sf
vzip.32 q8, q10
vzip.32 q11, q9
vmov d24, d17
vmov d21, d22
vmov d22, d23
vmul.f32 d17, d24, d18
vmul.f32 d19, d20, d21
vmls.f32 d19, d16, d18
vmls.f32 d17, d20, d22
vmul.f32 d16, d16, d22
vuzp.32 d17, d19
vmls.f32 d16, d24, d21
sub sp, sp, #80 @<<<
vswp d17, d16
vmov r0, r1, d16 @ v4sf
vmov r2, r3, d17
add sp, sp, #80 @<<<
bx
Run Code Online (Sandbox Code Playgroud)
永远不会访问堆栈,但堆栈指针会递减,然后递增相同的量.为什么?
如果我修改原始代码以在序言的结尾和结尾的开头包含asm注释,如下所示:
#include <arm_neon.h>
extern float32x4_t cross(const float32x4_t& v1, const float32x4_t& v2) {
asm volatile("# End of prologue");
float32x4x2_t
xxyyzz1(vzipq_f32(v1, v1)),
xxyyzz2(vzipq_f32(v2, v2));
float32x2_t
xx1(vget_low_f32(xxyyzz1.val[0])),
yy1(vget_high_f32(xxyyzz1.val[0])),
zz1(vget_low_f32(xxyyzz1.val[1])),
xx2(vget_low_f32(xxyyzz2.val[0])),
yy2(vget_high_f32(xxyyzz2.val[0])),
zz2(vget_low_f32(xxyyzz2.val[1]));
float32x2_t
x(vmls_f32(vmul_f32(yy1, zz2), zz1, yy2)),
y(vmls_f32(vmul_f32(zz1, xx2), xx1, zz2)),
z(vmls_f32(vmul_f32(xx1, yy2), yy1, xx2));
float32x4_t res(vcombine_f32(vuzp_f32(x, y).val[0], z));
asm volatile("# Start of epilogue");
return res;
}
Run Code Online (Sandbox Code Playgroud)
然后我得到略有不同的版本:
_Z5crossRK19__simd128_float32_tS1_:
sub sp, sp, #80
# End of prologue
vldmia r0, {d16-d17}
vldmia r1, {d22-d23}
vmov q10, q8 @ v4sf
vmov q9, q11 @ v4sf
vzip.32 q8, q10
vzip.32 q11, q9
vmov d24, d17
vmov d21, d22
vmov d22, d23
vmul.f32 d17, d24, d18
vmul.f32 d19, d20, d21
vmls.f32 d19, d16, d18
vmls.f32 d17, d20, d22
vmul.f32 d16, d16, d22
vuzp.32 d17, d19
vmls.f32 d16, d24, d21
vswp d17, d16
# Start of epilogue
vmov r0, r1, d16 @ v4sf
vmov r2, r3, d17
add sp, sp, #80
bx lr
Run Code Online (Sandbox Code Playgroud)
堆栈指针递减/递增显然是序言/结尾的一部分,即使不使用堆栈也会发生.这是符合某些标准,还是gcc优化错误?
编辑:编译器是arm-elf-gcc-4.5(GCC)4.5.0,配置为:/opt/local/var/macports/build/_opt_local_var_macports_sources_rsync.macports.org_release_ports_cross_arm-elf-gcc/work/gcc-4.5.0/ configure --prefix =/opt/local --infodir =/opt/local/share/info --mandir =/opt/local/share/man --target = arm-elf --program-prefix = arm-elf- --prop-suffix = -4.5 --without-included-gettext --enable-obsolete --with-newlib --disable -__ cxa_atexit --enable-multilib --enable-biendian --disable-libgfortran --with-gxx -include-dir =/opt/local/arm-elf/include/c ++/4.5.0/--enable-languages = c,c ++,objc --build = x86_64-apple-darwin10 --enable-fpu
编辑:我设法使用以下C源查明问题.它只会在使用矢量类型的数组作为临时数时发生,例如声明为float32x4x2_t struct { float32x4_t val[2]; },即使这些临时数据都是寄存器.我相信这是一个错误,所以我报告了它.
#include <arm_neon.h>
// This one is ok
extern float32x4_t add(float32x4_t* v1, float32x4_t* v2) {
return vaddq_f32(*v1, *v2);
#if 0
produced assembly:
add:
vldmia r0, {d16-d17}
vldmia r1, {d18-d19}
vadd.f32 q8, q8, q9
vmov r0, r1, d16
vmov r2, r3, d17
bx lr
#endif
}
// This one uses float32x4x2_t temporaries and has the bug
extern float32x4_t cross(float32x4_t* v1, float32x4_t* v2) {
float32x4x2_t
xxyyzz1=vzipq_f32(*v1, *v1),
xxyyzz2=vzipq_f32(*v2, *v2);
float32x2_t
xx1=vget_low_f32(xxyyzz1.val[0]),
yy1=vget_high_f32(xxyyzz1.val[0]),
zz1=vget_low_f32(xxyyzz1.val[1]),
xx2=vget_low_f32(xxyyzz2.val[0]),
yy2=vget_high_f32(xxyyzz2.val[0]),
zz2=vget_low_f32(xxyyzz2.val[1]);
float32x2_t
x=vmls_f32(vmul_f32(yy1, zz2), zz1, yy2),
y=vmls_f32(vmul_f32(zz1, xx2), xx1, zz2),
z=vmls_f32(vmul_f32(xx1, yy2), yy1, xx2);
return vcombine_f32(vuzp_f32(x, y).val[0], z);
#if 0
produced assembly:
cross:
vldmia r0, {d18-d19}
vldmia r1, {d16-d17}
vmov q10, q9
vmov q11, q8
vzip.32 q9, q10
vzip.32 q8, q11
vmov d24, d19
vmov d21, d16
vmov d16, d17
vmul.f32 d19, d20, d21
vmul.f32 d17, d24, d22
vmls.f32 d17, d20, d16
vmls.f32 d19, d18, d22
vmul.f32 d16, d18, d16
vuzp.32 d17, d19
vmls.f32 d16, d24, d21
sub sp, sp, #48 @ here
vswp d17, d16
vmov r0, r1, d16
vmov r2, r3, d17
add sp, sp, #48 @ and here
bx lr
#endif
}
Run Code Online (Sandbox Code Playgroud)