Lin*_*ios 2 c++ optimization performance g++
想象一下以下情况:
// Option bundle handed to thing() by value; holds a single on/off flag.
struct Args
{
    bool arg;   // the flag thing() branches on
};
// Writes one line to cout saying whether the flag carried by `arg` is set.
void thing(Args arg)
{
    // Deal with the cleared flag first and leave; whatever remains is "true".
    if (!arg.arg) {
        cout << "arg.arg is false\n";
        return;
    }
    cout << "arg.arg is true\n";
}
// Entry point: builds an Args whose flag is false and passes it to thing().
int main()
{
    Args a = { false };   // aggregate-initialise the only member
    thing(a);
}
Run Code Online (Sandbox Code Playgroud)
编译器是否足够智能,能够删除 switch、if 和 else 中在程序运行过程中显然永远不会被执行的分支?控制这些语句的变量是否必须是 const?最后,除了借助预处理器(preprocessor)之外,完全不使用变量是否才是正确的做法(一想到要写那样的代码我就害怕)?
只是为了澄清:真实情况是,我正在编写一个类,程序员可以选择是否启用某个功能。禁用该功能可以节省服务器上大量的处理时间,以及类与服务器之间的一些带宽。我想弄清楚应该使用作为构造函数参数传入的变量、预处理器指令,还是其他方案。如果该功能被禁用,我甚至不希望代码中还保留判断该功能是否启用的逻辑分支。我知道使用预处理器(preprocessor)的方案可以做到这一点,但我想避免大量使用 #ifdef、#elseif,并且希望能够重用同一个已编译的共享对象。向程序员开放源代码不成问题,因为这将是开源的。
编辑:我用下面给出的编译命令做了测试,并查看了生成的汇编。我无法完全看懂,但确实看到了一条跳转指令(jne)。汇编如下,希望有人能看懂:
.file "blah.cpp"
.section .rodata.str1.1,"aMS",@progbits,1
.LC0:
.string "arg.arg is true\n"
.LC1:
.string "arg.arg is false\n"
.text
.p2align 4,,15
# Out-of-line copy of thing(Args), emitted under its mangled name
# _Z5thing4Args (32-bit x86, AT&T operand order: source first, destination last).
# It tests the one-byte by-value argument and prints either .LC0 or .LC1 via
# std::__ostream_insert on std::cout. Both branches are present in this copy.
.globl _Z5thing4Args
.type _Z5thing4Args, @function
_Z5thing4Args:
.LFB1003:
.cfi_startproc
# Prologue: build an %ebp frame and reserve 24 bytes of stack; the lowest
# 12 bytes are used below as the outgoing argument slots of the call.
pushl %ebp
.cfi_def_cfa_offset 8
movl %esp, %ebp
.cfi_offset 5, -8
.cfi_def_cfa_register 5
subl $24, %esp
# 8(%ebp) sits just above the saved %ebp and the return address, i.e. it is
# the first stack argument (arg.arg). Non-zero means true -> jump to .L5.
cmpb $0, 8(%ebp)
jne .L5
# Fall-through path = false branch. Arguments are stored left to right at
# (%esp), 4(%esp), 8(%esp): stream, string pointer, length.
# 17 is the length of "arg.arg is false\n" (.LC1).
movl $17, 8(%esp)
movl $.LC1, 4(%esp)
movl $_ZSt4cout, (%esp)
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_i
# Epilogue of the false path: tear down the frame and return.
leave
.cfi_remember_state
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.p2align 4,,7
.p2align 3
# True branch, reached only through the jne above.
# 16 is the length of "arg.arg is true\n" (.LC0).
.L5:
.cfi_restore_state
movl $16, 8(%esp)
movl $.LC0, 4(%esp)
movl $_ZSt4cout, (%esp)
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_i
# Epilogue of the true path.
leave
.cfi_def_cfa 4, 4
.cfi_restore 5
ret
.cfi_endproc
.LFE1003:
.size _Z5thing4Args, .-_Z5thing4Args
.p2align 4,,15
# main for the first listing. There is no call to _Z5thing4Args and no
# compare/jump here: the body of thing() appears inline, and only the code
# that prints .LC1 ("arg.arg is false\n") is left.
.globl main
.type main, @function
main:
.LFB1004:
.cfi_startproc
# Prologue: %ebp frame, then round %esp down to a 16-byte boundary and
# reserve 16 bytes for outgoing arguments.
pushl %ebp
.cfi_def_cfa_offset 8
movl %esp, %ebp
.cfi_offset 5, -8
.cfi_def_cfa_register 5
andl $-16, %esp
subl $16, %esp
# Inlined false branch: same call as in _Z5thing4Args, with stream at (%esp),
# string pointer .LC1 at 4(%esp) and length 17 at 8(%esp).
movl $17, 8(%esp)
movl $.LC1, 4(%esp)
movl $_ZSt4cout, (%esp)
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_i
# Return value of main is 0.
xorl %eax, %eax
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
.LFE1004:
.size main, .-main
.p2align 4,,15
# File-local (no .globl) start-up routine for this translation unit; its
# address is placed in the .ctors section further down in this listing.
# It constructs the std::ios_base::Init object _ZStL8__ioinit and registers
# that object's destructor with __cxa_atexit.
.type _GLOBAL__I__Z5thing4Args, @function
_GLOBAL__I__Z5thing4Args:
.LFB1009:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
movl %esp, %ebp
.cfi_offset 5, -8
.cfi_def_cfa_register 5
subl $24, %esp
# Call the std::ios_base::Init constructor (..InitC1Ev) with the address of
# _ZStL8__ioinit as its only stack argument.
movl $_ZStL8__ioinit, (%esp)
call _ZNSt8ios_base4InitC1Ev
# __cxa_atexit(destructor, object, dso handle): arguments are stored at
# (%esp), 4(%esp) and 8(%esp) respectively.
movl $__dso_handle, 8(%esp)
movl $_ZStL8__ioinit, 4(%esp)
movl $_ZNSt8ios_base4InitD1Ev, (%esp)
call __cxa_atexit
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
.LFE1009:
.size _GLOBAL__I__Z5thing4Args, .-_GLOBAL__I__Z5thing4Args
.section .ctors,"aw",@progbits
.align 4
.long _GLOBAL__I__Z5thing4Args
.local _ZStL8__ioinit
.comm _ZStL8__ioinit,1,1
.weakref _ZL20__gthrw_pthread_oncePiPFvvE,pthread_once
.weakref _ZL27__gthrw_pthread_getspecificj,pthread_getspecific
.weakref _ZL27__gthrw_pthread_setspecificjPKv,pthread_setspecific
.weakref _ZL22__gthrw_pthread_createPmPK14pthread_attr_tPFPvS3_ES3_,pthread_create
.weakref _ZL20__gthrw_pthread_joinmPPv,pthread_join
.weakref _ZL21__gthrw_pthread_equalmm,pthread_equal
.weakref _ZL20__gthrw_pthread_selfv,pthread_self
.weakref _ZL22__gthrw_pthread_detachm,pthread_detach
.weakref _ZL22__gthrw_pthread_cancelm,pthread_cancel
.weakref _ZL19__gthrw_sched_yieldv,sched_yield
.weakref _ZL26__gthrw_pthread_mutex_lockP15pthread_mutex_t,pthread_mutex_lock
.weakref _ZL29__gthrw_pthread_mutex_trylockP15pthread_mutex_t,pthread_mutex_trylock
.weakref _ZL31__gthrw_pthread_mutex_timedlockP15pthread_mutex_tPK8timespec,pthread_mutex_timedlock
.weakref _ZL28__gthrw_pthread_mutex_unlockP15pthread_mutex_t,pthread_mutex_unlock
.weakref _ZL26__gthrw_pthread_mutex_initP15pthread_mutex_tPK19pthread_mutexattr_t,pthread_mutex_init
.weakref _ZL29__gthrw_pthread_mutex_destroyP15pthread_mutex_t,pthread_mutex_destroy
.weakref _ZL30__gthrw_pthread_cond_broadcastP14pthread_cond_t,pthread_cond_broadcast
.weakref _ZL27__gthrw_pthread_cond_signalP14pthread_cond_t,pthread_cond_signal
.weakref _ZL25__gthrw_pthread_cond_waitP14pthread_cond_tP15pthread_mutex_t,pthread_cond_wait
.weakref _ZL30__gthrw_pthread_cond_timedwaitP14pthread_cond_tP15pthread_mutex_tPK8timespec,pthread_cond_timedwait
.weakref _ZL28__gthrw_pthread_cond_destroyP14pthread_cond_t,pthread_cond_destroy
.weakref _ZL26__gthrw_pthread_key_createPjPFvPvE,pthread_key_create
.weakref _ZL26__gthrw_pthread_key_deletej,pthread_key_delete
.weakref _ZL30__gthrw_pthread_mutexattr_initP19pthread_mutexattr_t,pthread_mutexattr_init
.weakref _ZL33__gthrw_pthread_mutexattr_settypeP19pthread_mutexattr_ti,pthread_mutexattr_settype
.weakref _ZL33__gthrw_pthread_mutexattr_destroyP19pthread_mutexattr_t,pthread_mutexattr_destroy
.ident "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2"
.section .note.GNU-stack,"",@progbits
Run Code Online (Sandbox Code Playgroud)
编辑:按照建议添加 asm("#aksdjfh") 之后,我又仔细查看了汇编,发现编译器并没有把这个分支去掉。#ifdef 是唯一的选择吗?还是说 jne 指令对性能的影响实际上可以忽略不计?
亲自尝试一下:
$ g++ -O3 -S test.cpp -o test.s
Run Code Online (Sandbox Code Playgroud)
-O3 打开优化,-S 告诉编译器在生成汇编代码后停止,-o 指定输出文件的位置。然后,您可以检查 "test.s" 文件,看看编译器是否做了这项优化。显然这需要一些汇编知识。如果您像我一样觉得 AT&T 语法难以阅读、更喜欢 Intel 语法,可能还想加上 -masm=intel。
在代码中添加类似 asm("# this is something") 的语句可能会有所帮助。它们会作为注释出现在生成的汇编中,便于您找到自己关心的部分。
在我的机器上,GCC 4.8的快照似乎并没有优化死代码.我在每个分支中添加了其中一个asm注释来识别它们并生成了这个:
.file "test.cpp"
.intel_syntax noprefix
.section .rodata.str1.1,"aMS",@progbits,1
.LC0:
.string "arg.arg is true\n"
.LC1:
.string "arg.arg is false\n"
.text
.p2align 4,,15
# Out-of-line copy of thing(Args) in Intel syntax (destination first).
# No %ebp frame is used here; everything is addressed relative to esp.
# Both branches are present, each tagged by the text of an inline asm
# statement between #APP / #NO_APP.
.globl _Z5thing4Args
.type _Z5thing4Args, @function
_Z5thing4Args:
.LFB1215:
.cfi_startproc
# Reserve 28 bytes of stack for the outgoing call arguments.
sub esp, 28
.cfi_def_cfa_offset 32
# [esp+32] = 28 reserved bytes + 4-byte return address above esp, i.e. the
# first stack argument (arg.arg). Non-zero means true -> jump to .L6.
cmp BYTE PTR [esp+32], 0
jne .L6
#APP
# 13 "test.cpp" 1
This is the false branch
# 0 "" 2
#NO_APP
# False branch: call std::__ostream_insert with stream at [esp], string
# pointer .LC1 at [esp+4] and length 17 ("arg.arg is false\n") at [esp+8].
mov DWORD PTR [esp+8], 17
mov DWORD PTR [esp+4], OFFSET FLAT:.LC1
mov DWORD PTR [esp], OFFSET FLAT:_ZSt4cout
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_i
# Release the 28 bytes and return.
add esp, 28
.cfi_remember_state
.cfi_def_cfa_offset 4
ret
.p2align 4,,7
.p2align 3
# True branch, reached only through the jne above.
.L6:
.cfi_restore_state
#APP
# 10 "test.cpp" 1
This is the true branch
# 0 "" 2
#NO_APP
# Same call with .LC0 and length 16 ("arg.arg is true\n").
mov DWORD PTR [esp+8], 16
mov DWORD PTR [esp+4], OFFSET FLAT:.LC0
mov DWORD PTR [esp], OFFSET FLAT:_ZSt4cout
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_i
add esp, 28
.cfi_def_cfa_offset 4
ret
.cfi_endproc
.LFE1215:
.size _Z5thing4Args, .-_Z5thing4Args
.section .text.startup,"ax",@progbits
.p2align 4,,15
# main for the second listing. Unlike the first listing, thing() is not
# inlined here: main stores the argument byte and calls _Z5thing4Args, so the
# branch on arg.arg is evaluated inside the callee at run time.
.globl main
.type main, @function
main:
.LFB1216:
.cfi_startproc
# Prologue: %ebp frame, align esp down to 16 bytes, reserve 16 bytes.
push ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
mov ebp, esp
.cfi_def_cfa_register 5
and esp, -16
sub esp, 16
# Pass Args by value: its single byte, set to 0 (false), goes at [esp].
mov BYTE PTR [esp], 0
call _Z5thing4Args
# Return value of main is 0.
xor eax, eax
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
.LFE1216:
.size main, .-main
.p2align 4,,15
# File-local (no .globl) start-up routine for this translation unit; its
# address is placed in the .init_array section further down in this listing.
# It constructs the std::ios_base::Init object _ZStL8__ioinit and registers
# that object's destructor with __cxa_atexit.
.type _GLOBAL__sub_I__Z5thing4Args, @function
_GLOBAL__sub_I__Z5thing4Args:
.LFB1367:
.cfi_startproc
sub esp, 28
.cfi_def_cfa_offset 32
# Call the std::ios_base::Init constructor (..InitC1Ev) on _ZStL8__ioinit.
mov DWORD PTR [esp], OFFSET FLAT:_ZStL8__ioinit
call _ZNSt8ios_base4InitC1Ev
# __cxa_atexit(destructor, object, dso handle): arguments are stored at
# [esp], [esp+4] and [esp+8] respectively.
mov DWORD PTR [esp+8], OFFSET FLAT:__dso_handle
mov DWORD PTR [esp+4], OFFSET FLAT:_ZStL8__ioinit
mov DWORD PTR [esp], OFFSET FLAT:_ZNSt8ios_base4InitD1Ev
call __cxa_atexit
add esp, 28
.cfi_def_cfa_offset 4
ret
.cfi_endproc
.LFE1367:
.size _GLOBAL__sub_I__Z5thing4Args, .-_GLOBAL__sub_I__Z5thing4Args
.section .init_array,"aw"
.align 4
.long _GLOBAL__sub_I__Z5thing4Args
.local _ZStL8__ioinit
.comm _ZStL8__ioinit,1,1
.hidden __dso_handle
.ident "GCC: (GNU) 4.8.0 20120311 (experimental)"
.section .note.GNU-stack,"",@progbits
Run Code Online (Sandbox Code Playgroud)
如果您查找这些注释,您会发现它们都会调用某个std::cout成员函数.
发生这种情况的原因在于,该函数对其他翻译单元是可见的:如果您现在创建一个文件 nasty.cpp,其中声明了 void thing(Args arg); 并以值 true 调用它,那么这段代码就必须存在。
所以我进一步做了些尝试。如果我将函数标记为 static(意味着它是该翻译单元的内部函数),GCC 确实会把死代码优化掉:
.file "test.cpp"
.intel_syntax noprefix
.section .rodata.str1.1,"aMS",@progbits,1
.LC0:
.string "arg.arg is false\n"
.section .text.startup,"ax",@progbits
.p2align 4,,15
# main for the third listing (thing() declared static). This listing has no
# _Z5thing4Args symbol at all; the false branch, tagged by its inline asm
# text, appears directly inside main and there is no compare or jump.
.globl main
.type main, @function
main:
.LFB1216:
.cfi_startproc
# Prologue: %ebp frame, align esp down to 16 bytes, reserve 16 bytes.
push ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
mov ebp, esp
.cfi_def_cfa_register 5
and esp, -16
sub esp, 16
#APP
# 13 "test.cpp" 1
This is the false branch
# 0 "" 2
#NO_APP
# Call std::operator<<(ostream&, const char*) with stream at [esp] and the
# string .LC0 at [esp+4]; in this listing .LC0 is "arg.arg is false\n".
mov DWORD PTR [esp+4], OFFSET FLAT:.LC0
mov DWORD PTR [esp], OFFSET FLAT:_ZSt4cout
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
# Return value of main is 0.
xor eax, eax
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
.LFE1216:
.size main, .-main
.p2align 4,,15
# File-local (no .globl) start-up routine for this translation unit; its
# address is placed in the .init_array section further down in this listing.
# It constructs the std::ios_base::Init object _ZStL8__ioinit and registers
# that object's destructor with __cxa_atexit.
.type _GLOBAL__sub_I_main, @function
_GLOBAL__sub_I_main:
.LFB1367:
.cfi_startproc
sub esp, 28
.cfi_def_cfa_offset 32
# Call the std::ios_base::Init constructor (..InitC1Ev) on _ZStL8__ioinit.
mov DWORD PTR [esp], OFFSET FLAT:_ZStL8__ioinit
call _ZNSt8ios_base4InitC1Ev
# __cxa_atexit(destructor, object, dso handle): arguments are stored at
# [esp], [esp+4] and [esp+8] respectively.
mov DWORD PTR [esp+8], OFFSET FLAT:__dso_handle
mov DWORD PTR [esp+4], OFFSET FLAT:_ZStL8__ioinit
mov DWORD PTR [esp], OFFSET FLAT:_ZNSt8ios_base4InitD1Ev
call __cxa_atexit
add esp, 28
.cfi_def_cfa_offset 4
ret
.cfi_endproc
.LFE1367:
.size _GLOBAL__sub_I_main, .-_GLOBAL__sub_I_main
.section .init_array,"aw"
.align 4
.long _GLOBAL__sub_I_main
.local _ZStL8__ioinit
.comm _ZStL8__ioinit,1,1
.hidden __dso_handle
.ident "GCC: (GNU) 4.8.0 20120311 (experimental)"
.section .note.GNU-stack,"",@progbits
Run Code Online (Sandbox Code Playgroud)
你不会在这段代码中找到 "This is the true branch"。另外,请注意 false 分支是如何被移入 main 函数的,而 thing 函数已经不复存在。GCC 直接内联了该函数的代码,并且没有再为它单独生成函数体,因为我加了 static 之后,它不会在其他任何地方被使用。
如果我将其标记为 inline,它仍然对外部可见,但显然这已足以让 GCC 把死代码优化掉。不过,如果这样做,就必须确保其他翻译单元看到的是相同的定义,以便按需为每个翻译单元生成代码。