从一个数组中减去另一个数组的最佳方法

Mic*_*ler 8 delphi performance x86 sse

我有以下代码,这是我的应用程序的一部分的瓶颈.我所做的就是从另一个数据中减去Array.这两个阵列都有大约100000个元素.我正试图找到一种方法来使这更高效.

var
  Array1, Array2 : array of integer;

..... 
// Code that fills the arrays
.....

for ix := 0 to length(array1)-1
  Array1[ix] := Array1[ix] - Array2[ix];

end;
Run Code Online (Sandbox Code Playgroud)

有人有建议吗?

Dav*_*nan 9

在多个线程上运行它,使用那个大的数组将净线性加速.他们说,这是令人尴尬的平行.


GJ.*_*GJ. 9

在更多线程上运行减法听起来不错,但100K整数sunstraction不占用大量CPU时间,因此可能是threadpool ...但是设置线程也有很多开销,因此短数组的并行线程生产率会比只有一个(主)线程!

你在编译器设置,溢出和范围检查中关闭了吗?

你可以尝试使用asm rutine,它非常简单......

就像是:

procedure SubArray(var ar1, ar2; length: integer);
asm
//length must be > than 0!
   push ebx
   lea  ar1, ar1 -4
   lea  ar2, ar2 -4
@Loop:
   mov  ebx, [ar2 + length *4]
   sub  [ar1 + length *4], ebx
   dec  length
//Here you can put more folloving parts of rutine to more unrole it to speed up.
   jz   @exit
   mov  ebx, [ar2 + length *4]
   sub  [ar1 + length *4], ebx
   dec  length
//
   jnz  @Loop
@exit:
   pop  ebx
end;


begin
   SubArray(Array1[0], Array2[0], length(Array1));
Run Code Online (Sandbox Code Playgroud)

它可以快得多......

编辑:使用SIMD说明添加了程序. 此过程请求SSE CPU支持.它可以在XMM寄存器中取4个整数并一次减去.也有可能使用movdqa,而不是movdqu它是速度较快,但必须先确保16字节aligment.您也可以像我的第一个asm案例一样解除XMM标准.(我对速度测量感兴趣.:))

var
  array1, array2: array of integer;

procedure SubQIntArray(var ar1, ar2; length: integer);
asm
//prepare length if not rounded to 4
  push     ecx
  shr      length, 2
  jz       @LengthToSmall
@Loop:
  movdqu   xmm1, [ar1]          //or movdqa but ensure 16b aligment first
  movdqu   xmm2, [ar2]          //or movdqa but ensure 16b aligment first
  psubd    xmm1, xmm2
  movdqu   [ar1], xmm1          //or movdqa but ensure 16b aligment first
  add      ar1, 16
  add      ar2, 16
  dec      length
  jnz      @Loop
@LengthToSmall:
  pop      ecx
  push     ebx
  and      ecx, 3
  jz       @Exit
  mov      ebx, [ar2]
  sub      [ar1], ebx
  dec      ecx
  jz       @Exit
  mov      ebx, [ar2 + 4]
  sub      [ar1 + 4], ebx
  dec      ecx
  jz       @Exit
  mov      ebx, [ar2 + 8]
  sub      [ar1 + 8], ebx
@Exit:
  pop      ebx
end;

begin
//Fill arrays first!
  SubQIntArray(Array1[0], Array2[0], length(Array1));
Run Code Online (Sandbox Code Playgroud)


GJ.*_*GJ. 5

在这个简单的案例中,我对速度优化非常好奇.所以我制作了6个简单的程序,测量数据块大小为100000的CPU时钟和时间;

  1. 带编译器选项Range和Overflow Check On的Pascal过程
  2. 带编译器选项Range和Overflow的Pascal过程检查关闭
  3. 经典的x86汇编程序.
  4. 具有SSE指令和未对齐的16字节移动的汇编程序.
  5. 具有SSE指令的汇编程序和对齐的16字节移动.
  6. 汇编程序8次展开循环,带有SSE指令并对齐16字节移动.

检查图片和代码的结果以获取更多信息. 在此输入图像描述

要获得16字节的内存对齐,首先在文件'FastMM4Options.inc'指令{$ .define Align16Bytes}中加入点!

program SubTest;

{$APPTYPE CONSOLE}

uses
//In file 'FastMM4Options.inc' delite the dot in directive {$.define Align16Bytes}
//to get 16 byte memory alignment!
  FastMM4,
  windows,
  SysUtils;

var
  Ar1                   :array of integer;
  Ar2                   :array of integer;
  ArLength              :integer;
  StartTicks            :int64;
  EndTicks              :int64;
  TicksPerMicroSecond   :int64;

function GetCpuTicks: int64;
asm
  rdtsc
end;
{$R+}
{$Q+}
procedure SubArPasRangeOvfChkOn(length: integer);
var
  n: integer;
begin
  for n := 0 to length -1 do
    Ar1[n] := Ar1[n] - Ar2[n];
end;
{$R-}
{$Q-}
procedure SubArPas(length: integer);
var
  n: integer;
begin
  for n := 0 to length -1 do
    Ar1[n] := Ar1[n] - Ar2[n];
end;

procedure SubArAsm(var ar1, ar2; length: integer);
asm
//Length must be > than 0!
   push ebx
   lea  ar1, ar1 - 4
   lea  ar2, ar2 - 4
@Loop:
   mov  ebx, [ar2 + length * 4]
   sub  [ar1 + length * 4], ebx
   dec  length
   jnz  @Loop
@exit:
   pop  ebx
end;

procedure SubArAsmSimdU(var ar1, ar2; length: integer);
asm
//Prepare length
  push     length
  shr      length, 2
  jz       @Finish
@Loop:
  movdqu   xmm1, [ar1]
  movdqu   xmm2, [ar2]
  psubd    xmm1, xmm2
  movdqu   [ar1], xmm1
  add      ar1, 16
  add      ar2, 16
  dec      length
  jnz      @Loop
@Finish:
  pop      length
  push     ebx
  and      length, 3
  jz       @Exit
//Do rest, up to 3 subtractions...
  mov      ebx, [ar2]
  sub      [ar1], ebx
  dec      length
  jz       @Exit
  mov      ebx, [ar2 + 4]
  sub      [ar1 + 4], ebx
  dec      length
  jz       @Exit
  mov      ebx, [ar2 + 8]
  sub      [ar1 + 8], ebx
@Exit:
  pop      ebx
end;

procedure SubArAsmSimdA(var ar1, ar2; length: integer);
asm
  push     ebx
//Unfortunately delphi use first 8 bytes for dinamic array length and reference
//counter, from that reason the dinamic array address should start with $xxxxxxx8
//instead &xxxxxxx0. So we must first align ar1, ar2 pointers!
  mov      ebx, [ar2]
  sub      [ar1], ebx
  dec      length
  jz       @exit
  mov      ebx, [ar2 + 4]
  sub      [ar1 + 4], ebx
  dec      length
  jz       @exit
  add      ar1, 8
  add      ar2, 8
//Prepare length for 16 byte data transfer
  push     length
  shr      length, 2
  jz       @Finish
@Loop:
  movdqa   xmm1, [ar1]
  movdqa   xmm2, [ar2]
  psubd    xmm1, xmm2
  movdqa   [ar1], xmm1
  add      ar1, 16
  add      ar2, 16
  dec      length
  jnz      @Loop
@Finish:
  pop      length
  and      length, 3
  jz       @Exit
//Do rest, up to 3 subtractions...
  mov      ebx, [ar2]
  sub      [ar1], ebx
  dec      length
  jz       @Exit
  mov      ebx, [ar2 + 4]
  sub      [ar1 + 4], ebx
  dec      length
  jz       @Exit
  mov      ebx, [ar2 + 8]
  sub      [ar1 + 8], ebx
@Exit:
  pop      ebx
end;

procedure SubArAsmSimdAUnrolled8(var ar1, ar2; length: integer);
asm
  push     ebx
//Unfortunately delphi use first 8 bytes for dinamic array length and reference
//counter, from that reason the dinamic array address should start with $xxxxxxx8
//instead &xxxxxxx0. So we must first align ar1, ar2 pointers!
  mov      ebx, [ar2]
  sub      [ar1], ebx
  dec      length
  jz       @exit
  mov      ebx, [ar2 + 4]
  sub      [ar1 + 4], ebx
  dec      length
  jz       @exit
  add      ar1, 8                       //Align pointer to 16 byte
  add      ar2, 8                       //Align pointer to 16 byte
//Prepare length for 16 byte data transfer
  push     length
  shr      length, 5                    //8 * 4 subtructions per loop
  jz       @Finish                      //To small for LoopUnrolled
@LoopUnrolled:
//Unrolle 1, 2, 3, 4
  movdqa   xmm4, [ar2]
  movdqa   xmm5, [16 + ar2]
  movdqa   xmm6, [32 + ar2]
  movdqa   xmm7, [48 + ar2]
//
  movdqa   xmm0, [ar1]
  movdqa   xmm1, [16 + ar1]
  movdqa   xmm2, [32 + ar1]
  movdqa   xmm3, [48 + ar1]
//
  psubd    xmm0, xmm4
  psubd    xmm1, xmm5
  psubd    xmm2, xmm6
  psubd    xmm3, xmm7
//
  movdqa   [48 + ar1], xmm3
  movdqa   [32 + ar1], xmm2
  movdqa   [16 + ar1], xmm1
  movdqa   [ar1], xmm0
//Unrolle 5, 6, 7, 8
  movdqa   xmm4, [64 + ar2]
  movdqa   xmm5, [80 + ar2]
  movdqa   xmm6, [96 + ar2]
  movdqa   xmm7, [112 + ar2]
//
  movdqa   xmm0, [64 + ar1]
  movdqa   xmm1, [80 + ar1]
  movdqa   xmm2, [96 + ar1]
  movdqa   xmm3, [112 + ar1]
//
  psubd    xmm0, xmm4
  psubd    xmm1, xmm5
  psubd    xmm2, xmm6
  psubd    xmm3, xmm7
//
  movdqa   [112 + ar1], xmm3
  movdqa   [96 + ar1], xmm2
  movdqa   [80 + ar1], xmm1
  movdqa   [64 + ar1], xmm0
//
  add      ar1, 128
  add      ar2, 128
  dec      length
  jnz      @LoopUnrolled
@FinishUnrolled:
  pop      length
  and      length, $1F
//Do rest, up to 31 subtractions...
@Finish:
  mov      ebx, [ar2]
  sub      [ar1], ebx
  add      ar1, 4
  add      ar2, 4
  dec      length
  jnz      @Finish
@Exit:
  pop      ebx
end;

procedure WriteOut(EndTicks: Int64; Str: string);
begin
  WriteLn(Str + IntToStr(EndTicks - StartTicks)
    + ' Time: ' + IntToStr((EndTicks - StartTicks) div TicksPerMicroSecond) + 'us');
  Sleep(5);
  SwitchToThread;
  StartTicks := GetCpuTicks;
end;

begin
  ArLength := 100000;
//Set TicksPerMicroSecond
  QueryPerformanceFrequency(TicksPerMicroSecond);
  TicksPerMicroSecond := TicksPerMicroSecond div 1000000;
//
  SetLength(Ar1, ArLength);
  SetLength(Ar2, ArLength);
//Fill arrays
//...
//Tick time info
  WriteLn('CPU ticks per mikro second: ' + IntToStr(TicksPerMicroSecond));
  Sleep(5);
  SwitchToThread;
  StartTicks := GetCpuTicks;
//Test 1
  SubArPasRangeOvfChkOn(ArLength);
  WriteOut(GetCpuTicks, 'SubAr Pas Range and Overflow Checking On, Ticks: ');
//Test 2
  SubArPas(ArLength);
  WriteOut(GetCpuTicks, 'SubAr Pas, Ticks: ');
//Test 3
  SubArAsm(Ar1[0], Ar2[0], ArLength);
  WriteOut(GetCpuTicks, 'SubAr Asm, Ticks: ');
//Test 4
  SubArAsmSimdU(Ar1[0], Ar2[0], ArLength);
  WriteOut(GetCpuTicks, 'SubAr Asm SIMD mem unaligned, Ticks: ');
//Test 5
  SubArAsmSimdA(Ar1[0], Ar2[0], ArLength);
  WriteOut(GetCpuTicks, 'SubAr Asm with SIMD mem aligned, Ticks: ');
//Test 6
  SubArAsmSimdAUnrolled8(Ar1[0], Ar2[0], ArLength);
  WriteOut(GetCpuTicks, 'SubAr Asm with SIMD mem aligned 8*unrolled, Ticks: ');
//
  ReadLn;
  Ar1 := nil;
  Ar2 := nil;
end.
Run Code Online (Sandbox Code Playgroud)

...

具有8次展开的SIMD指令的最快的asm过程仅花费68us并且比Pascal过程快大约4倍.

我们可以看到Pascal循环过程可能并不重要,它在2,4000 CPU上仅需要大约277us(溢出和范围检查),减去100000次.

那么这段代码不能成为瓶颈?