I have the following code which is the bottleneck in one part of my application. All I do is subtract on Array from another. Both of these arrays have more around 100000 elements. I'm trying to find a way to make this more performant.
var
Array1, Array2 : array of integer;
.....
// Code that fills the arrays
.....
for ix := 0 to length(array1)-1
Array1[ix] := Array1[ix] - Array2[ix];
end;
Does anybody have a suggestion?
I was very curious about speed optimisation in this simple case.
So I have made 6 simple procedures and measure CPU tick and time at array size 100000;
- Pascal procedure with compiler option Range and Overflow Checking On
- Pascal procedure with compiler option Range and Overflow Checking off
- Classic x86 assembler procedure.
- Assembler procedure with SSE instructions and unaligned 16 byte move.
- Assembler procedure with SSE instructions and aligned 16 byte move.
- Assembler 8 times unrolled loop with SSE instructions and aligned 16 byte move.
Check results on picture and code for more information.
To get 16 byte memory alignment first delite the dot in file 'FastMM4Options.inc' directive {$.define Align16Bytes}
!
program SubTest;
{$APPTYPE CONSOLE}
uses
//In file 'FastMM4Options.inc' delite the dot in directive {$.define Align16Bytes}
//to get 16 byte memory alignment!
FastMM4,
windows,
SysUtils;
var
Ar1 :array of integer;
Ar2 :array of integer;
ArLength :integer;
StartTicks :int64;
EndTicks :int64;
TicksPerMicroSecond :int64;
function GetCpuTicks: int64;
asm
rdtsc
end;
{$R+}
{$Q+}
procedure SubArPasRangeOvfChkOn(length: integer);
var
n: integer;
begin
for n := 0 to length -1 do
Ar1[n] := Ar1[n] - Ar2[n];
end;
{$R-}
{$Q-}
procedure SubArPas(length: integer);
var
n: integer;
begin
for n := 0 to length -1 do
Ar1[n] := Ar1[n] - Ar2[n];
end;
procedure SubArAsm(var ar1, ar2; length: integer);
asm
//Length must be > than 0!
push ebx
lea ar1, ar1 - 4
lea ar2, ar2 - 4
@Loop:
mov ebx, [ar2 + length * 4]
sub [ar1 + length * 4], ebx
dec length
jnz @Loop
@exit:
pop ebx
end;
procedure SubArAsmSimdU(var ar1, ar2; length: integer);
asm
//Prepare length
push length
shr length, 2
jz @Finish
@Loop:
movdqu xmm1, [ar1]
movdqu xmm2, [ar2]
psubd xmm1, xmm2
movdqu [ar1], xmm1
add ar1, 16
add ar2, 16
dec length
jnz @Loop
@Finish:
pop length
push ebx
and length, 3
jz @Exit
//Do rest, up to 3 subtractions...
mov ebx, [ar2]
sub [ar1], ebx
dec length
jz @Exit
mov ebx, [ar2 + 4]
sub [ar1 + 4], ebx
dec length
jz @Exit
mov ebx, [ar2 + 8]
sub [ar1 + 8], ebx
@Exit:
pop ebx
end;
procedure SubArAsmSimdA(var ar1, ar2; length: integer);
asm
push ebx
//Unfortunately delphi use first 8 bytes for dinamic array length and reference
//counter, from that reason the dinamic array address should start with $xxxxxxx8
//instead &xxxxxxx0. So we must first align ar1, ar2 pointers!
mov ebx, [ar2]
sub [ar1], ebx
dec length
jz @exit
mov ebx, [ar2 + 4]
sub [ar1 + 4], ebx
dec length
jz @exit
add ar1, 8
add ar2, 8
//Prepare length for 16 byte data transfer
push length
shr length, 2
jz @Finish
@Loop:
movdqa xmm1, [ar1]
movdqa xmm2, [ar2]
psubd xmm1, xmm2
movdqa [ar1], xmm1
add ar1, 16
add ar2, 16
dec length
jnz @Loop
@Finish:
pop length
and length, 3
jz @Exit
//Do rest, up to 3 subtractions...
mov ebx, [ar2]
sub [ar1], ebx
dec length
jz @Exit
mov ebx, [ar2 + 4]
sub [ar1 + 4], ebx
dec length
jz @Exit
mov ebx, [ar2 + 8]
sub [ar1 + 8], ebx
@Exit:
pop ebx
end;
procedure SubArAsmSimdAUnrolled8(var ar1, ar2; length: integer);
asm
push ebx
//Unfortunately delphi use first 8 bytes for dinamic array length and reference
//counter, from that reason the dinamic array address should start with $xxxxxxx8
//instead &xxxxxxx0. So we must first align ar1, ar2 pointers!
mov ebx, [ar2]
sub [ar1], ebx
dec length
jz @exit
mov ebx, [ar2 + 4]
sub [ar1 + 4], ebx
dec length
jz @exit
add ar1, 8 //Align pointer to 16 byte
add ar2, 8 //Align pointer to 16 byte
//Prepare length for 16 byte data transfer
push length
shr length, 5 //8 * 4 subtructions per loop
jz @Finish //To small for LoopUnrolled
@LoopUnrolled:
//Unrolle 1, 2, 3, 4
movdqa xmm4, [ar2]
movdqa xmm5, [16 + ar2]
movdqa xmm6, [32 + ar2]
movdqa xmm7, [48 + ar2]
//
movdqa xmm0, [ar1]
movdqa xmm1, [16 + ar1]
movdqa xmm2, [32 + ar1]
movdqa xmm3, [48 + ar1]
//
psubd xmm0, xmm4
psubd xmm1, xmm5
psubd xmm2, xmm6
psubd xmm3, xmm7
//
movdqa [48 + ar1], xmm3
movdqa [32 + ar1], xmm2
movdqa [16 + ar1], xmm1
movdqa [ar1], xmm0
//Unrolle 5, 6, 7, 8
movdqa xmm4, [64 + ar2]
movdqa xmm5, [80 + ar2]
movdqa xmm6, [96 + ar2]
movdqa xmm7, [112 + ar2]
//
movdqa xmm0, [64 + ar1]
movdqa xmm1, [80 + ar1]
movdqa xmm2, [96 + ar1]
movdqa xmm3, [112 + ar1]
//
psubd xmm0, xmm4
psubd xmm1, xmm5
psubd xmm2, xmm6
psubd xmm3, xmm7
//
movdqa [112 + ar1], xmm3
movdqa [96 + ar1], xmm2
movdqa [80 + ar1], xmm1
movdqa [64 + ar1], xmm0
//
add ar1, 128
add ar2, 128
dec length
jnz @LoopUnrolled
@FinishUnrolled:
pop length
and length, $1F
//Do rest, up to 31 subtractions...
@Finish:
mov ebx, [ar2]
sub [ar1], ebx
add ar1, 4
add ar2, 4
dec length
jnz @Finish
@Exit:
pop ebx
end;
procedure WriteOut(EndTicks: Int64; Str: string);
begin
WriteLn(Str + IntToStr(EndTicks - StartTicks)
+ ' Time: ' + IntToStr((EndTicks - StartTicks) div TicksPerMicroSecond) + 'us');
Sleep(5);
SwitchToThread;
StartTicks := GetCpuTicks;
end;
begin
ArLength := 100000;
//Set TicksPerMicroSecond
QueryPerformanceFrequency(TicksPerMicroSecond);
TicksPerMicroSecond := TicksPerMicroSecond div 1000000;
//
SetLength(Ar1, ArLength);
SetLength(Ar2, ArLength);
//Fill arrays
//...
//Tick time info
WriteLn('CPU ticks per mikro second: ' + IntToStr(TicksPerMicroSecond));
Sleep(5);
SwitchToThread;
StartTicks := GetCpuTicks;
//Test 1
SubArPasRangeOvfChkOn(ArLength);
WriteOut(GetCpuTicks, 'SubAr Pas Range and Overflow Checking On, Ticks: ');
//Test 2
SubArPas(ArLength);
WriteOut(GetCpuTicks, 'SubAr Pas, Ticks: ');
//Test 3
SubArAsm(Ar1[0], Ar2[0], ArLength);
WriteOut(GetCpuTicks, 'SubAr Asm, Ticks: ');
//Test 4
SubArAsmSimdU(Ar1[0], Ar2[0], ArLength);
WriteOut(GetCpuTicks, 'SubAr Asm SIMD mem unaligned, Ticks: ');
//Test 5
SubArAsmSimdA(Ar1[0], Ar2[0], ArLength);
WriteOut(GetCpuTicks, 'SubAr Asm with SIMD mem aligned, Ticks: ');
//Test 6
SubArAsmSimdAUnrolled8(Ar1[0], Ar2[0], ArLength);
WriteOut(GetCpuTicks, 'SubAr Asm with SIMD mem aligned 8*unrolled, Ticks: ');
//
ReadLn;
Ar1 := nil;
Ar2 := nil;
end.
...
The fastest asm procedure with 8 times unrolled SIMD instructions takes only 68us and is about 4 time faster than Pascal procedure.
As we can see the Pascal loop procedure probably isn't critical, it takes only about 277us (Overflow and Range checking off) on 2,4GHz CPU at 100000 subtractions.
So this code can't be bottleneck?
Running subtraction on more threads sounds good, but 100K integer sunstraction don't take a lot of CPU time, so maybe threadpool... However settings threads have also a lot of overhead, so short arrays will have slower productivity in parallel threads than in only one (main) thread!
Did you switch off in compiler settings, overflow and range checking?
You can try to use asm rutine, it is very simple...
Something like:
procedure SubArray(var ar1, ar2; length: integer);
asm
//length must be > than 0!
push ebx
lea ar1, ar1 -4
lea ar2, ar2 -4
@Loop:
mov ebx, [ar2 + length *4]
sub [ar1 + length *4], ebx
dec length
//Here you can put more folloving parts of rutine to more unrole it to speed up.
jz @exit
mov ebx, [ar2 + length *4]
sub [ar1 + length *4], ebx
dec length
//
jnz @Loop
@exit:
pop ebx
end;
begin
SubArray(Array1[0], Array2[0], length(Array1));
It can be much faster...
EDIT: Added procedure with SIMD instructions.
This procedure request SSE CPU support. It can take 4 integers in XMM register and subtract at once. There is also possibility to use movdqa
instead movdqu
it is faster, but you must first to ensure 16 byte aligment. You can also unrole the XMM par like in my first asm case. (I'm interesting about speed measurment. :) )
var
array1, array2: array of integer;
procedure SubQIntArray(var ar1, ar2; length: integer);
asm
//prepare length if not rounded to 4
push ecx
shr length, 2
jz @LengthToSmall
@Loop:
movdqu xmm1, [ar1] //or movdqa but ensure 16b aligment first
movdqu xmm2, [ar2] //or movdqa but ensure 16b aligment first
psubd xmm1, xmm2
movdqu [ar1], xmm1 //or movdqa but ensure 16b aligment first
add ar1, 16
add ar2, 16
dec length
jnz @Loop
@LengthToSmall:
pop ecx
push ebx
and ecx, 3
jz @Exit
mov ebx, [ar2]
sub [ar1], ebx
dec ecx
jz @Exit
mov ebx, [ar2 + 4]
sub [ar1 + 4], ebx
dec ecx
jz @Exit
mov ebx, [ar2 + 8]
sub [ar1 + 8], ebx
@Exit:
pop ebx
end;
begin
//Fill arrays first!
SubQIntArray(Array1[0], Array2[0], length(Array1));
I'm not assembly expert but I think the following are near optimal if you don't take into account SIMD instructions or parallel processing, the later can be easily accomplished by passing portions of the array to the function.
like
Thread1: SubArray(ar1[0], ar2[0], 50);
Thread2: SubArray(ar1[50], ar2[50], 50);
procedure SubArray(var Array1, Array2; const Length: Integer);
var
ap1, ap2 : PInteger;
i : Integer;
begin
ap1 := @Array1;
ap2 := @Array2;
i := Length;
while i > 0 do
begin
ap1^ := ap1^ - ap2^;
Inc(ap1);
Inc(ap2);
Dec(i);
end;
end;
// similar assembly version
procedure SubArrayEx(var Array1, Array2; const Length: Integer);
asm
// eax = @Array1
// edx = @Array2
// ecx = Length
// esi = temp register for array2^
push esi
cmp ecx, 0
jle @Exit
@Loop:
mov esi, [edx]
sub [eax], esi
add eax, 4
add edx, 4
dec ecx
jnz @Loop
@Exit:
pop esi
end;
procedure Test();
var
a1, a2 : array of Integer;
i : Integer;
begin
SetLength(a1, 3);
a1[0] := 3;
a1[1] := 1;
a1[2] := 2;
SetLength(a2, 3);
a2[0] := 4;
a2[1] := 21;
a2[2] := 2;
SubArray(a1[0], a2[0], Length(a1));
for i := 0 to Length(a1) - 1 do
Writeln(a1[i]);
Readln;
end;
It's not a real answer to your question, but I would investigate if I could do the subtraction already at some time while filling the arrays with values. I would optionally even consider a third array in memory to store the result of the subtraction. In modern computing, the 'cost' of memory is considerably lower than the 'cost' of the time it takes to perform an extra action on memory.
In theory you'll gain at least a little performance when the subtraction can be done while the values are still in registers or processor cache, but in practice you just might stumble upon a few tricks that could enhance performance of the entire algorithm.