Most performant way to subtract one array from ano

2楼-- · 2019-03-19 16:09

I was very curious about speed optimisation in this simple case. So I have made 6 simple procedures and measure CPU tick and time at array size 100000;

Pascal procedure with compiler option Range and Overflow Checking On
Pascal procedure with compiler option Range and Overflow Checking off
Classic x86 assembler procedure.
Assembler procedure with SSE instructions and unaligned 16 byte move.
Assembler procedure with SSE instructions and aligned 16 byte move.
Assembler 8 times unrolled loop with SSE instructions and aligned 16 byte move.

Check results on picture and code for more information. enter image description here

To get 16 byte memory alignment first delite the dot in file 'FastMM4Options.inc' directive {$.define Align16Bytes} !

program SubTest;

{$APPTYPE CONSOLE}

uses
//In file 'FastMM4Options.inc' delite the dot in directive {$.define Align16Bytes}
//to get 16 byte memory alignment!
  FastMM4,
  windows,
  SysUtils;

var
  Ar1                   :array of integer;
  Ar2                   :array of integer;
  ArLength              :integer;
  StartTicks            :int64;
  EndTicks              :int64;
  TicksPerMicroSecond   :int64;

function GetCpuTicks: int64;
asm
  rdtsc
end;
{$R+}
{$Q+}
procedure SubArPasRangeOvfChkOn(length: integer);
var
  n: integer;
begin
  for n := 0 to length -1 do
    Ar1[n] := Ar1[n] - Ar2[n];
end;
{$R-}
{$Q-}
procedure SubArPas(length: integer);
var
  n: integer;
begin
  for n := 0 to length -1 do
    Ar1[n] := Ar1[n] - Ar2[n];
end;

procedure SubArAsm(var ar1, ar2; length: integer);
asm
//Length must be > than 0!
   push ebx
   lea  ar1, ar1 - 4
   lea  ar2, ar2 - 4
@Loop:
   mov  ebx, [ar2 + length * 4]
   sub  [ar1 + length * 4], ebx
   dec  length
   jnz  @Loop
@exit:
   pop  ebx
end;

procedure SubArAsmSimdU(var ar1, ar2; length: integer);
asm
//Prepare length
  push     length
  shr      length, 2
  jz       @Finish
@Loop:
  movdqu   xmm1, [ar1]
  movdqu   xmm2, [ar2]
  psubd    xmm1, xmm2
  movdqu   [ar1], xmm1
  add      ar1, 16
  add      ar2, 16
  dec      length
  jnz      @Loop
@Finish:
  pop      length
  push     ebx
  and      length, 3
  jz       @Exit
//Do rest, up to 3 subtractions...
  mov      ebx, [ar2]
  sub      [ar1], ebx
  dec      length
  jz       @Exit
  mov      ebx, [ar2 + 4]
  sub      [ar1 + 4], ebx
  dec      length
  jz       @Exit
  mov      ebx, [ar2 + 8]
  sub      [ar1 + 8], ebx
@Exit:
  pop      ebx
end;

procedure SubArAsmSimdA(var ar1, ar2; length: integer);
asm
  push     ebx
//Unfortunately delphi use first 8 bytes for dinamic array length and reference
//counter, from that reason the dinamic array address should start with $xxxxxxx8
//instead &xxxxxxx0. So we must first align ar1, ar2 pointers!
  mov      ebx, [ar2]
  sub      [ar1], ebx
  dec      length
  jz       @exit
  mov      ebx, [ar2 + 4]
  sub      [ar1 + 4], ebx
  dec      length
  jz       @exit
  add      ar1, 8
  add      ar2, 8
//Prepare length for 16 byte data transfer
  push     length
  shr      length, 2
  jz       @Finish
@Loop:
  movdqa   xmm1, [ar1]
  movdqa   xmm2, [ar2]
  psubd    xmm1, xmm2
  movdqa   [ar1], xmm1
  add      ar1, 16
  add      ar2, 16
  dec      length
  jnz      @Loop
@Finish:
  pop      length
  and      length, 3
  jz       @Exit
//Do rest, up to 3 subtractions...
  mov      ebx, [ar2]
  sub      [ar1], ebx
  dec      length
  jz       @Exit
  mov      ebx, [ar2 + 4]
  sub      [ar1 + 4], ebx
  dec      length
  jz       @Exit
  mov      ebx, [ar2 + 8]
  sub      [ar1 + 8], ebx
@Exit:
  pop      ebx
end;

procedure SubArAsmSimdAUnrolled8(var ar1, ar2; length: integer);
asm
  push     ebx
//Unfortunately delphi use first 8 bytes for dinamic array length and reference
//counter, from that reason the dinamic array address should start with $xxxxxxx8
//instead &xxxxxxx0. So we must first align ar1, ar2 pointers!
  mov      ebx, [ar2]
  sub      [ar1], ebx
  dec      length
  jz       @exit
  mov      ebx, [ar2 + 4]
  sub      [ar1 + 4], ebx
  dec      length
  jz       @exit
  add      ar1, 8                       //Align pointer to 16 byte
  add      ar2, 8                       //Align pointer to 16 byte
//Prepare length for 16 byte data transfer
  push     length
  shr      length, 5                    //8 * 4 subtructions per loop
  jz       @Finish                      //To small for LoopUnrolled
@LoopUnrolled:
//Unrolle 1, 2, 3, 4
  movdqa   xmm4, [ar2]
  movdqa   xmm5, [16 + ar2]
  movdqa   xmm6, [32 + ar2]
  movdqa   xmm7, [48 + ar2]
//
  movdqa   xmm0, [ar1]
  movdqa   xmm1, [16 + ar1]
  movdqa   xmm2, [32 + ar1]
  movdqa   xmm3, [48 + ar1]
//
  psubd    xmm0, xmm4
  psubd    xmm1, xmm5
  psubd    xmm2, xmm6
  psubd    xmm3, xmm7
//
  movdqa   [48 + ar1], xmm3
  movdqa   [32 + ar1], xmm2
  movdqa   [16 + ar1], xmm1
  movdqa   [ar1], xmm0
//Unrolle 5, 6, 7, 8
  movdqa   xmm4, [64 + ar2]
  movdqa   xmm5, [80 + ar2]
  movdqa   xmm6, [96 + ar2]
  movdqa   xmm7, [112 + ar2]
//
  movdqa   xmm0, [64 + ar1]
  movdqa   xmm1, [80 + ar1]
  movdqa   xmm2, [96 + ar1]
  movdqa   xmm3, [112 + ar1]
//
  psubd    xmm0, xmm4
  psubd    xmm1, xmm5
  psubd    xmm2, xmm6
  psubd    xmm3, xmm7
//
  movdqa   [112 + ar1], xmm3
  movdqa   [96 + ar1], xmm2
  movdqa   [80 + ar1], xmm1
  movdqa   [64 + ar1], xmm0
//
  add      ar1, 128
  add      ar2, 128
  dec      length
  jnz      @LoopUnrolled
@FinishUnrolled:
  pop      length
  and      length, $1F
//Do rest, up to 31 subtractions...
@Finish:
  mov      ebx, [ar2]
  sub      [ar1], ebx
  add      ar1, 4
  add      ar2, 4
  dec      length
  jnz      @Finish
@Exit:
  pop      ebx
end;

procedure WriteOut(EndTicks: Int64; Str: string);
begin
  WriteLn(Str + IntToStr(EndTicks - StartTicks)
    + ' Time: ' + IntToStr((EndTicks - StartTicks) div TicksPerMicroSecond) + 'us');
  Sleep(5);
  SwitchToThread;
  StartTicks := GetCpuTicks;
end;

begin
  ArLength := 100000;
//Set TicksPerMicroSecond
  QueryPerformanceFrequency(TicksPerMicroSecond);
  TicksPerMicroSecond := TicksPerMicroSecond div 1000000;
//
  SetLength(Ar1, ArLength);
  SetLength(Ar2, ArLength);
//Fill arrays
//...
//Tick time info
  WriteLn('CPU ticks per mikro second: ' + IntToStr(TicksPerMicroSecond));
  Sleep(5);
  SwitchToThread;
  StartTicks := GetCpuTicks;
//Test 1
  SubArPasRangeOvfChkOn(ArLength);
  WriteOut(GetCpuTicks, 'SubAr Pas Range and Overflow Checking On, Ticks: ');
//Test 2
  SubArPas(ArLength);
  WriteOut(GetCpuTicks, 'SubAr Pas, Ticks: ');
//Test 3
  SubArAsm(Ar1[0], Ar2[0], ArLength);
  WriteOut(GetCpuTicks, 'SubAr Asm, Ticks: ');
//Test 4
  SubArAsmSimdU(Ar1[0], Ar2[0], ArLength);
  WriteOut(GetCpuTicks, 'SubAr Asm SIMD mem unaligned, Ticks: ');
//Test 5
  SubArAsmSimdA(Ar1[0], Ar2[0], ArLength);
  WriteOut(GetCpuTicks, 'SubAr Asm with SIMD mem aligned, Ticks: ');
//Test 6
  SubArAsmSimdAUnrolled8(Ar1[0], Ar2[0], ArLength);
  WriteOut(GetCpuTicks, 'SubAr Asm with SIMD mem aligned 8*unrolled, Ticks: ');
//
  ReadLn;
  Ar1 := nil;
  Ar2 := nil;
end.

...

The fastest asm procedure with 8 times unrolled SIMD instructions takes only 68us and is about 4 time faster than Pascal procedure.

As we can see the Pascal loop procedure probably isn't critical, it takes only about 277us (Overflow and Range checking off) on 2,4GHz CPU at 100000 subtractions.

So this code can't be bottleneck?

0人赞添加讨论(0) 举报

够拽才男人

3楼-- · 2019-03-19 16:23

It's not a real answer to your question, but I would investigate if I could do the subtraction already at some time while filling the arrays with values. I would optionally even consider a third array in memory to store the result of the subtraction. In modern computing, the 'cost' of memory is considerably lower than the 'cost' of the time it takes to perform an extra action on memory.

In theory you'll gain at least a little performance when the subtraction can be done while the values are still in registers or processor cache, but in practice you just might stumble upon a few tricks that could enhance performance of the entire algorithm.

0人赞添加讨论(0) 举报

男人必须洒脱

4楼-- · 2019-03-19 16:26

I'm not assembly expert but I think the following are near optimal if you don't take into account SIMD instructions or parallel processing, the later can be easily accomplished by passing portions of the array to the function.

like
Thread1: SubArray(ar1[0], ar2[0], 50);
Thread2: SubArray(ar1[50], ar2[50], 50);

procedure SubArray(var Array1, Array2; const Length: Integer);
var
  ap1, ap2 : PInteger;
  i : Integer;
begin
  ap1 := @Array1;
  ap2 := @Array2;
  i := Length;
  while i > 0 do
  begin
    ap1^ := ap1^ - ap2^;
    Inc(ap1);
    Inc(ap2);
    Dec(i);
  end;
end;

// similar assembly version
procedure SubArrayEx(var Array1, Array2; const Length: Integer);
asm
  // eax = @Array1
  // edx = @Array2
  // ecx = Length
  // esi = temp register for array2^
  push esi
  cmp ecx, 0
  jle @Exit
  @Loop:
  mov esi, [edx]
  sub [eax], esi
  add eax, 4
  add edx, 4
  dec ecx
  jnz @Loop
  @Exit:
  pop esi
end;


procedure Test();
var
  a1, a2 : array of Integer;
  i : Integer;
begin
  SetLength(a1, 3);
  a1[0] := 3;
  a1[1] := 1;
  a1[2] := 2;
  SetLength(a2, 3);
  a2[0] := 4;
  a2[1] := 21;
  a2[2] := 2;
  SubArray(a1[0], a2[0], Length(a1));

  for i := 0 to Length(a1) - 1 do
    Writeln(a1[i]);

  Readln;
end;

0人赞添加讨论(0) 举报

SAY GOODBYE

5楼-- · 2019-03-19 16:31

Running this on multiple threads, with that big an array will net linear speed-up. It's embarrassingly parallel as they say.

0人赞添加讨论(0) 举报

来，给爷笑一个

6楼-- · 2019-03-19 16:34

Running subtraction on more threads sounds good, but 100K integer sunstraction don't take a lot of CPU time, so maybe threadpool... However settings threads have also a lot of overhead, so short arrays will have slower productivity in parallel threads than in only one (main) thread!

Did you switch off in compiler settings, overflow and range checking?

You can try to use asm rutine, it is very simple...

Something like:

procedure SubArray(var ar1, ar2; length: integer);
asm
//length must be > than 0!
   push ebx
   lea  ar1, ar1 -4
   lea  ar2, ar2 -4
@Loop:
   mov  ebx, [ar2 + length *4]
   sub  [ar1 + length *4], ebx
   dec  length
//Here you can put more folloving parts of rutine to more unrole it to speed up.
   jz   @exit
   mov  ebx, [ar2 + length *4]
   sub  [ar1 + length *4], ebx
   dec  length
//
   jnz  @Loop
@exit:
   pop  ebx
end;


begin
   SubArray(Array1[0], Array2[0], length(Array1));

It can be much faster...

EDIT: Added procedure with SIMD instructions. This procedure request SSE CPU support. It can take 4 integers in XMM register and subtract at once. There is also possibility to use movdqa instead movdqu it is faster, but you must first to ensure 16 byte aligment. You can also unrole the XMM par like in my first asm case. (I'm interesting about speed measurment. :) )

var
  array1, array2: array of integer;

procedure SubQIntArray(var ar1, ar2; length: integer);
asm
//prepare length if not rounded to 4
  push     ecx
  shr      length, 2
  jz       @LengthToSmall
@Loop:
  movdqu   xmm1, [ar1]          //or movdqa but ensure 16b aligment first
  movdqu   xmm2, [ar2]          //or movdqa but ensure 16b aligment first
  psubd    xmm1, xmm2
  movdqu   [ar1], xmm1          //or movdqa but ensure 16b aligment first
  add      ar1, 16
  add      ar2, 16
  dec      length
  jnz      @Loop
@LengthToSmall:
  pop      ecx
  push     ebx
  and      ecx, 3
  jz       @Exit
  mov      ebx, [ar2]
  sub      [ar1], ebx
  dec      ecx
  jz       @Exit
  mov      ebx, [ar2 + 4]
  sub      [ar1 + 4], ebx
  dec      ecx
  jz       @Exit
  mov      ebx, [ar2 + 8]
  sub      [ar1 + 8], ebx
@Exit:
  pop      ebx
end;

begin
//Fill arrays first!
  SubQIntArray(Array1[0], Array2[0], length(Array1));

0人赞添加讨论(0) 举报

Most performant way to subtract one array from ano

采纳回答

编辑标签

举报内容

检举类型

检举原因

检举说明(必填)

打开微信“扫一扫”，打开网页后点击屏幕右上角分享按钮

付费偷看金额在0.1-10元之间