jsh*_*py8 5 x86 assembly masm irvine32
我正在使用Visual Studio 2013 Ultimate在MASM中编写汇编语言(x86).我试图使用数组来计算使用数组的n个元素的Fibonacci序列.换句话说,我试图去一个数组元素,获取它之前的两个元素,添加它们,并将结果存储在另一个数组中.
我无法设置索引寄存器以使其工作.
我的程序设置如下:
TITLE fibonacci.asm
INCLUDE Irvine32.inc
.data
fibInitial BYTE 0, 1, 2, 3, 4, 5, 6
fibComputed BYTE 5 DUP(0)
.code
main PROC
MOVZX si, fibInitial
MOVZX di, fibComputed
MOV cl, LENGTHOF fibInitial
L1:
MOV ax, [si - 1]
MOV dx, [si - 2]
MOV bp, ax + dx
MOV dl, TYPE fibInitial
MOVZX si, dl
MOV [edi], bp
MOV dh, TYPE fibComputed
MOVZX di, dl
loop L1
exit
main ENDP
END main
Run Code Online (Sandbox Code Playgroud)
我无法编译这个,因为该行的错误消息"错误A2031:必须是索引或基址寄存器" MOV ebp, ax + dx.但是,我确定我忽略了其他逻辑错误.
相关:Code-golf打印Fib的前1000位数字(10**9): 我的x86 asm使用扩展精度adc循环回答,并将二进制转换为字符串.内环针对速度进行了优化,其他部件针对尺寸进行了优化.
计算Fibonacci序列只需要保留两个状态:当前和前一个元素.我不知道你要做什么fibInitial,除了计算它的长度.这不是perl你去的地方for $n (0..5).
我知道你只是在学习asm,但我仍然会谈论性能.没有多少理由学习asm 而不知道什么是快速的,什么不是.如果您不需要性能,请让编译器从C源代码中为您创建asm.另请参阅https://stackoverflow.com/tags/x86/info上的其他链接
为状态使用寄存器简化了a[-1]计算时需要查看的问题a[1].你从curr=1,prev=0开始,然后开始a[0] = curr.为了产生"现代的"起始零序的斐波纳契数,从,开始curr=0,prev=1.
幸运的是,我最近只考虑了一个有效的fibonacci代码循环,所以我花时间写了一个完整的函数.请参阅下面的展开和矢量化版本(保存存储指令,但即使在编译32位CPU时也能快速生成64位整数):
; fib.asm
;void fib(int32_t *dest, uint32_t count);
; not-unrolled version. See below for a version which avoids all the mov instructions
global fib
fib:
; 64bit SysV register-call ABI:
; args: rdi: output buffer pointer. esi: count (and you can assume the upper32 are zeroed, so using rsi is safe)
;; locals: rsi: endp
;; eax: current edx: prev
;; ecx: tmp
;; all of these are caller-saved in the SysV ABI, like r8-r11
;; so we can use them without push/pop to save/restore them.
;; The Windows ABI is different.
test esi, esi ; test a reg against itself instead of cmp esi, 0
jz .early_out ; count == 0.
mov eax, 1 ; current = 1
xor edx, edx ; prev = 0
lea rsi, [rdi + rsi * 4] ; endp = &out[count]; // loop-end pointer
;; lea is very useful for combining add, shift, and non-destructive operation
;; this is equivalent to shl rsi, 4 / add rsi, rdi
align 16
.loop: ; do {
mov [rdi], eax ; *buf = current
add rdi, 4 ; buf++
lea ecx, [rax + rdx] ; tmp = curr+prev = next_cur
mov edx, eax ; prev = curr
mov eax, ecx ; curr=tmp
;; see below for an unrolled version that doesn't need any reg->reg mov instructions
; you might think this would be faster:
; add edx, eax ; but it isn't
; xchg eax, edx ; This is as slow as 3 mov instructions, but we only needed 2 thanks to using lea
cmp rdi, rsi ; } while(buf < endp);
jb .loop ; jump if (rdi BELOW rsi). unsigned compare
;; the LOOP instruction is very slow, avoid it
.early_out:
ret
Run Code Online (Sandbox Code Playgroud)
另一种循环条件可能是
dec esi ; often you'd use ecx for counts, but we had it in esi
jnz .loop
Run Code Online (Sandbox Code Playgroud)
AMD CPU可以融合cmp/branch,但不能dec/branch.英特尔CPU也可以进行宏保险 dec/jnz.(或签名小于零/大于零). dec/inc不要更新Carry标志,因此您不能在无符号上方/下方使用它们ja/jb.我认为这个想法是你可以adc在一个循环中做一个(带有进位),使用inc/dec循环计数器来不干扰进位标志,但是部分标志减速会使现代CPU上的这个变坏.
lea ecx, [eax + edx]需要一个额外的字节(地址大小前缀),这就是我使用32位dest和64位地址的原因.(这些是lea64位模式下的默认操作数大小).没有直接影响速度,只是间接通过代码大小.
另一个循环体可以是:
mov ecx, eax ; tmp=curr. This stays true after every iteration
.loop:
mov [rdi], ecx
add ecx, edx ; tmp+=prev ;; shorter encoding than lea
mov edx, eax ; prev=curr
mov eax, ecx ; curr=tmp
Run Code Online (Sandbox Code Playgroud)
展开循环以进行更多迭代将意味着更少的改组.mov您只需跟踪哪个寄存器保存哪个变量,而不是指令.即你处理具有一种寄存器重命名的分配.
.loop: ;; on entry: ; curr:eax prev:edx
mov [rdi], eax ; store curr
add edx, eax ; curr:edx prev:eax
.oddentry:
mov [rdi + 4], edx ; store curr
add eax, edx ; curr:eax prev:edx
;; we're back to our starting state, so we can loop
add rdi, 8
cmp rdi, rsi
jb .loop
Run Code Online (Sandbox Code Playgroud)
展开的东西是你需要清理剩下的任何奇怪的迭代.两次幂的展开因素可以使清理循环稍微容易一些,但是添加12并不比添加16更快.(请参阅本文的上一版本,lea以获取curr + prev在第3版中生成的愚蠢的unroll-by-3版本注册,因为我没有意识到你实际上并不需要temp.感谢rcgldr捕获它.)
请参阅下面的完整工作展开版本,处理任何计数.
测试前端(此版本中的新功能:用于检测写入缓冲区末尾的asm错误的canary元素.)
// fib-main.c
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
void fib(uint32_t *buf, uint32_t count);
int main(int argc, const char *argv[]) {
uint32_t count = 15;
if (argc > 1) {
count = atoi(argv[1]);
}
uint32_t buf[count+1]; // allocated on the stack
// Fib overflows uint32 at count = 48, so it's not like a lot of space is useful
buf[count] = 0xdeadbeefUL;
// uint32_t count = sizeof(buf)/sizeof(buf[0]);
fib(buf, count);
for (uint32_t i ; i < count ; i++){
printf("%u ", buf[i]);
}
putchar('\n');
if (buf[count] != 0xdeadbeefUL) {
printf("fib wrote past the end of buf: sentinel = %x\n", buf[count]);
}
}
Run Code Online (Sandbox Code Playgroud)
此代码完全正常工作和测试(除非我错过了将本地文件中的更改复制回答案>.<):
peter@tesla:~/src/SO$ yasm -f elf64 fib.asm && gcc -std=gnu11 -g -Og fib-main.c fib.o
peter@tesla:~/src/SO$ ./a.out 48
1 1 2 3 5 8 13 21 34 55 89 144 233 377 610 987 1597 2584 4181 6765 10946 17711 28657 46368 75025 121393 196418 317811 514229 832040 1346269 2178309 3524578 5702887 9227465 14930352 24157817 39088169 63245986 102334155 165580141 267914296 433494437 701408733 1134903170 1836311903 2971215073 512559680
Run Code Online (Sandbox Code Playgroud)
再次感谢rcgldr让我思考如何在循环设置中处理奇数与偶数,而不是在最后进行清理迭代.
我选择了无分支设置代码,它将4*count%2添加到起始指针.这可能是零,但添加零比分支更便宜,看看我们是否应该.Fibonacci序列非常快速地溢出寄存器,因此保持序言代码的紧密和高效非常重要,而不仅仅是循环内的代码.(如果我们正在进行优化,我们希望针对很多长度较短的呼叫进行优化).
; 64bit SysV register-call ABI
; args: rdi: output buffer pointer. rsi: count
;; locals: rsi: endp
;; eax: current edx: prev
;; ecx: tmp
;; all of these are caller-saved in the SysV ABI, like r8-r11
;; so we can use them without push/pop to save/restore them.
;; The Windows ABI is different.
;void fib(int32_t *dest, uint32_t count); // unrolled version
global fib
fib:
cmp esi, 1
jb .early_out ; count below 1 (i.e. count==0, since it's unsigned)
mov eax, 1 ; current = 1
mov [rdi], eax
je .early_out ; count == 1, flags still set from cmp
;; need this 2nd early-out because the loop always does 2 iterations
;;; branchless handling of odd counts:
;;; always do buf[0]=1, then start the loop from 0 or 1
;;; Writing to an address you just wrote to is very cheap
;;; mov/lea is about as cheap as best-case for branching (correctly-predicted test/jcc for count%2==0)
;;; and saves probably one unconditional jump that would be needed either in the odd or even branch
mov edx, esi ;; we could save this mov by using esi for prev, and loading the end pointer into a different reg
and edx, eax ; prev = count & 1 = count%2
lea rsi, [rdi + rsi*4] ; end pointer: same regardless of starting at 0 or 1
lea rdi, [rdi + rdx*4] ; buf += count%2
;; even count: loop starts at buf[0], with curr=1, prev=0
;; odd count: loop starts at buf[1], with curr=1, prev=1
align 16 ;; the rest of this func is just *slightly* longer than 16B, so there's a lot of padding. Tempting to omit this alignment for CPUs with a loop buffer.
.loop: ;; do {
mov [rdi], eax ;; *buf = current
; on loop entry: curr:eax prev:edx
add edx, eax ; curr:edx prev:eax
;.oddentry: ; unused, we used a branchless sequence to handle odd counts
mov [rdi+4], edx
add eax, edx ; curr:eax prev:edx
;; back to our starting arrangement
add rdi, 8 ;; buf++
cmp rdi, rsi ;; } while(buf < endp);
jb .loop
; dec esi ; set up for this version with sub esi, edx; instead of lea
; jnz .loop
.early_out:
ret
Run Code Online (Sandbox Code Playgroud)
要产生零起始序列,请执行此操作
curr=count&1; // and esi, 1
buf += curr; // lea [rdi], [rdi + rsi*4]
prev= 1 ^ curr; // xor eax, esi
Run Code Online (Sandbox Code Playgroud)
而不是当前
curr = 1;
prev = count & 1;
buf += count & 1;
Run Code Online (Sandbox Code Playgroud)
我们也可以mov使用esito hold来保存两个版本的指令prev,现在prev依赖于count.
;; loop prologue for sequence starting with 1 1 2 3
;; (using different regs and optimized for size by using fewer immediates)
mov eax, 1 ; current = 1
cmp esi, eax
jb .early_out ; count below 1
mov [rdi], eax
je .early_out ; count == 1, flags still set from cmp
lea rdx, [rdi + rsi*4] ; endp
and esi, eax ; prev = count & 1
lea rdi, [rdi + rsi*4] ; buf += count & 1
;; eax:curr esi:prev rdx:endp rdi:buf
;; end of old code
;; loop prologue for sequence starting with 0 1 1 2
cmp esi, 1
jb .early_out ; count below 1, no stores
mov [rdi], 0 ; store first element
je .early_out ; count == 1, flags still set from cmp
lea rdx, [rdi + rsi*4] ; endp
mov eax, 1 ; prev = 1
and esi, eax ; curr = count&1
lea rdi, [rdi + rsi*4] ; buf += count&1
xor eax, esi ; prev = 1^curr
;; ESI:curr EAX:prev (opposite of other setup)
;;
Run Code Online (Sandbox Code Playgroud)
;; optimized for code size, NOT speed. Prob. could be smaller, esp. if we want to keep the loop start aligned, and jump between before and after it.
;; most of the savings are from avoiding mov reg, imm32,
;; and from counting down the loop counter, instead of checking an end-pointer.
;; loop prologue for sequence starting with 0 1 1 2
xor edx, edx
cmp esi, 1
jb .early_out ; count below 1, no stores
mov [rdi], edx ; store first element
je .early_out ; count == 1, flags still set from cmp
xor eax, eax ; movzx after setcc would be faster, but one more byte
shr esi, 1 ; two counts per iteration, divide by two
;; shift sets CF = the last bit shifted out
setc al ; curr = count&1
setnc dl ; prev = !(count&1)
lea rdi, [rdi + rax*4] ; buf+= count&1
;; extra uop or partial register stall internally when reading eax after writing al, on Intel (except P4 & silvermont)
;; EAX:curr EDX:prev (same as 1 1 2 setup)
;; even count: loop starts at buf[0], with curr=0, prev=1
;; odd count: loop starts at buf[1], with curr=1, prev=0
.loop:
...
dec esi ; 1B smaller than 64b cmp, needs count/2 in esi
jnz .loop
.early_out:
ret
Run Code Online (Sandbox Code Playgroud)
Fibonacci序列不是特别可并行化的.没有简单的方法从F(i)和F(i-4)获得F(i + 4),或类似的东西.我们可以用向量做的事情是存储内存减少.从...开始:
a = [f3 f2 f1 f0 ] -> store this to buf
b = [f2 f1 f0 f-1]
Run Code Online (Sandbox Code Playgroud)
然后a+=b; b+=a; a+=b; b+=a;产生:
a = [f7 f6 f5 f4 ] -> store this to buf
b = [f6 f5 f4 f3 ]
Run Code Online (Sandbox Code Playgroud)
当使用两个64位整数打包到128b矢量时,这不那么愚蠢.即使在32位代码中,也可以使用SSE进行64位整数运算.
此答案的先前版本有一个未完成的打包32位矢量版本,无法正确处理count%4 != 0.为了加载序列的前4个值,我使用了pmovzxbd所以当我只能使用4B时我不需要16B的数据.将序列的第一个-1 .. 1值转换为向量寄存器要容易得多,因为只有一个非零值可以加载和随机播放.
;void fib64_sse(uint64_t *dest, uint32_t count);
; using SSE for fewer but larger stores, and for 64bit integers even in 32bit mode
global fib64_sse
fib64_sse:
mov eax, 1
movd xmm1, eax ; xmm1 = [0 1] = [f0 f-1]
pshufd xmm0, xmm1, 11001111b ; xmm0 = [1 0] = [f1 f0]
sub esi, 2
jae .entry ; make the common case faster with fewer branches
;; could put the handling for count==0 and count==1 right here, with its own ret
jmp .cleanup
align 16
.loop: ; do {
paddq xmm0, xmm1 ; xmm0 = [ f3 f2 ]
.entry:
;; xmm1: [ f0 f-1 ] ; on initial entry, count already decremented by 2
;; xmm0: [ f1 f0 ]
paddq xmm1, xmm0 ; xmm1 = [ f4 f3 ] (or [ f2 f1 ] on first iter)
movdqu [rdi], xmm0 ; store 2nd last compute result, ready for cleanup of odd count
add rdi, 16 ; buf += 2
sub esi, 2
jae .loop ; } while((count-=2) >= 0);
.cleanup:
;; esi <= 0 : -2 on the count=0 special case, otherwise -1 or 0
;; xmm1: [ f_rc f_rc-1 ] ; rc = count Rounded down to even: count & ~1
;; xmm0: [ f_rc+1 f_rc ] ; f(rc+1) is the value we need to store if count was odd
cmp esi, -1
jne .out ; this could be a test on the Parity flag, with no extra cmp, if we wanted to be really hard to read and need a big comment explaining the logic
;; xmm1 = [f1 f0]
movhps [rdi], xmm1 ; store the high 64b of xmm0. There is no integer version of this insn, but that doesn't matter
.out:
ret
Run Code Online (Sandbox Code Playgroud)
没有必要进一步展开这一点,dep链延迟限制了吞吐量,因此我们总是可以平均每个周期存储一个元素.减少uops中的循环开销可以帮助超线程,但这很小.
正如您所看到的,即使在两个展开时处理所有角落情况也非常复杂.它需要额外的启动开销,即使您正在尝试优化它以将其保持在最低限度.最终有很多条件分支很容易.
更新主要:
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdlib.h>
#ifdef USE32
void fib(uint32_t *buf, uint32_t count);
typedef uint32_t buftype_t;
#define FMTx PRIx32
#define FMTu PRIu32
#define FIB_FN fib
#define CANARY 0xdeadbeefUL
#else
void fib64_sse(uint64_t *buf, uint32_t count);
typedef uint64_t buftype_t;
#define FMTx PRIx64
#define FMTu PRIu64
#define FIB_FN fib64_sse
#define CANARY 0xdeadbeefdeadc0deULL
#endif
#define xstr(s) str(s)
#define str(s) #s
int main(int argc, const char *argv[]) {
uint32_t count = 15;
if (argc > 1) {
count = atoi(argv[1]);
}
int benchmark = argc > 2;
buftype_t buf[count+1]; // allocated on the stack
// Fib overflows uint32 at count = 48, so it's not like a lot of space is useful
buf[count] = CANARY;
// uint32_t count = sizeof(buf)/sizeof(buf[0]);
if (benchmark) {
int64_t reps = 1000000000 / count;
for (int i=0 ; i<=reps ; i++)
FIB_FN(buf, count);
} else {
FIB_FN(buf, count);
for (uint32_t i ; i < count ; i++){
printf("%" FMTu " ", buf[i]);
}
putchar('\n');
}
if (buf[count] != CANARY) {
printf(xstr(FIB_FN) " wrote past the end of buf: sentinel = %" FMTx "\n", buf[count]);
}
}
Run Code Online (Sandbox Code Playgroud)
对于刚刚低于8192的计数,在我的Sandybridge i5上,展开的两个非矢量版本在每个周期的理论最大吞吐量(每周期3.5个指令)附近运行.8192*4B/int = 32768 = L1高速缓存大小.在实践中,我看到~3.3到~3.4 insn/cycle.我用Linux计算整个程序perf,不仅仅是紧密循环.
无论如何,没有任何一点可以进一步展开.显然,在count = 47之后,这已停止成为Fibonacci序列,因为我们使用uint32_t.但是,对于大型count,吞吐量受内存带宽的限制,低至~2.6 insn/cycle.在这一点上,我们基本上正在研究如何优化memset.
64位向量版本每个周期运行3个insn(每两个时钟一个128b存储),最大阵列大小约为L2高速缓存大小的1.5倍.(即./fib64 49152).随着阵列大小上升到L3缓存大小的较大部分,性能降低到每个周期~2 insn(每3个时钟一个存储),在L3缓存大小的3/4处.它在大小> L3缓存的情况下每6个周期升级到1个存储.
因此,当我们适合L2而不是L1缓存时,使用向量存储会更好.
| 归档时间: |
|
| 查看次数: |
15944 次 |
| 最近记录: |