[fpc-devel] LEA instruction speed

J. Gareth Moreton gareth at moreton-family.com
Sun Oct 8 13:45:04 CEST 2023


Sorry, ignore last attachment - I forgot to change a line of assembly 
(it was correct for x86_64-win64!!). Here is the corrected version.

Kit

On 08/10/2023 12:38, J. Gareth Moreton via fpc-devel wrote:
> Sorry, I got careless and was in a rush: the Pascal code was wrong, 
> and I didn't store the result of the benchmark test, hence the 
> error check at the end returned a false negative.
>
> The benchmark code was from Rika's SHA-1 test code, which I didn't 
> properly check, although I assumed the logic was to avoid counting the 
> time of the internal loop as much as possible.  I should have gone 
> with my gut instinct and realised that wasn't the best method.
>
> I've attached the updated test (now called "blea" as it's a benchmark 
> test) with your suggestions implemented, and an improved benchmarking 
> system.  I'm not used to specifying parameters in place of registers - 
> I'm too used to needing total control!
>
> Your results from experiments with adding additional ADD instructions 
> are expected, as LEA uses an AGU for computation, leaving the ALUs free 
> for other tasks (like ADD), so LEA is better even if speed is equal.
>
> Kit
>
> On 08/10/2023 11:06, Marģers . via fpc-devel wrote:
>> 1. Why do you leave "time:=..." in the benchmark loop? It does add 50% of 
>> execution time per call.
>> 2. Pascal version does not match assembler version. Had to fix it.
>>       //Result := X + Counter + $87654321;
>>       Result:=Result + X + $87654321;
>>       Result:=Result xor y;
>> 3. Assembler functions can be unified to work under win64,win32, 
>> linux 64, linux 32
>> function Checksum_LEA(const Input, X, Y: LongWord): LongWord; 
>> assembler; nostackframe;
>> asm
>> @Loop2:
>>   LEA Input, [Input + X + $87654321]
>>   XOR Input, y
>>   DEC y
>>   JNZ @Loop2
>>   MOV EAX, Input
>> end;
>>
>> 4. My results. Ryzen 2700x
>>
>>    Pascal control case: 0.7 ns/call  0.0710
>>  Using LEA instruction: 0.7 ns/call  0.0700
>> Using ADD instructions: 0.7 ns/call  0.0710
>>
>> Even though the results are equal, I was able to add 4 independent ADD 
>> instructions around LEA while the results didn't change, but only 2 
>> around ADD.
>>
>> _______________________________________________
>> fpc-devel maillist  -  fpc-devel at lists.freepascal.org
>> https://lists.freepascal.org/cgi-bin/mailman/listinfo/fpc-devel
>
> _______________________________________________
> fpc-devel maillist  -  fpc-devel at lists.freepascal.org
> https://lists.freepascal.org/cgi-bin/mailman/listinfo/fpc-devel
-------------- next part --------------
{ %CPU=i386,x86_64 }
program blea;

{$IF not defined(CPUX86) and not defined(CPUX86_64)}
  {$FATAL This test program requires an Intel x86 or x64 processor }
{$ENDIF}

{$MODE OBJFPC}
{$ASMMODE Intel}

uses
  SysUtils;
  
type
  { Common signature of the checksum routines that Benchmark times:
    three LongWord inputs in, one LongWord checksum out. }
  TBenchmarkProc = function(const Input, X, Y: LongWord): LongWord;
 

{ Pure Pascal reference implementation of the checksum loop.

  Starting from Input, performs Y rounds; each round adds X and the
  constant $87654321 to the running value and then XORs it with the
  number of rounds still remaining (counting down from Y to 1).
  With Y = 0 no round is executed and Input is returned unchanged.

  The main program compares the assembly variants against this result. }
function Checksum_PAS(const Input, X, Y: LongWord): LongWord;
const
  Bias = $87654321;
var
  Remaining: LongWord;
begin
  Result := Input;
  { Count down so the XOR operand matches the decrementing counter
    used by the assembly variants. }
  for Remaining := Y downto 1 do
    begin
      Result := Result + X + Bias;
      Result := Result xor Remaining;
    end;
end;

{ Assembly variant of the checksum loop that uses two ADD instructions
  per round (one for the constant, one for X).

  Computes the same value as Checksum_PAS: Y rounds of
  "Input := (Input + X + $87654321) xor Y; Dec(Y)".

  Parameters are referenced by name rather than by register so the same
  source can be assembled for different targets; this assumes the
  calling convention passes all three parameters in registers
  (NOTE(review): confirm for every target this is built on).

  Unlike a bare bottom-tested loop, Y = 0 is handled explicitly so the
  routine returns Input unchanged, matching Checksum_PAS, instead of
  wrapping the counter and running 2^32 rounds. }
function Checksum_ADD(const Input, X, Y: LongWord): LongWord; assembler; nostackframe;
asm
  { Skip the loop entirely when no rounds are requested. }
  TEST Y, Y
  JZ @Done1
@Loop1:
  ADD Input, $87654321
  ADD Input, X
  XOR Input, Y
  DEC Y
  { DEC sets ZF when the counter reaches zero, ending the loop. }
  JNZ @Loop1
@Done1:
  MOV Result, Input
end;

{ Assembly variant of the checksum loop that folds both additions of a
  round into a single LEA instruction.

  Computes the same value as Checksum_PAS: Y rounds of
  "Input := (Input + X + $87654321) xor Y; Dec(Y)".

  Parameters are referenced by name rather than by register so the same
  source can be assembled for different targets; this assumes the
  calling convention passes all three parameters in registers
  (NOTE(review): confirm for every target this is built on).

  Unlike a bare bottom-tested loop, Y = 0 is handled explicitly so the
  routine returns Input unchanged, matching Checksum_PAS, instead of
  wrapping the counter and running 2^32 rounds. }
function Checksum_LEA(const Input, X, Y: LongWord): LongWord; assembler; nostackframe;
asm
  { Skip the loop entirely when no rounds are requested. }
  TEST Y, Y
  JZ @Done2
@Loop2:
  { Input := Input + X + constant, computed as an address expression. }
  LEA Input, [Input + X + $87654321]
  XOR Input, Y
  DEC Y
  { DEC sets ZF when the counter reaches zero, ending the loop. }
  JNZ @Loop2
@Done2:
  MOV Result, Input
end;

{ Times one checksum routine and prints the average cost per round.

  name - label printed in front of the timing.
  proc - routine under test.
  Z    - initial checksum value fed into the first call.
  X    - addend passed unchanged to every call.

  proc is called outer_reps times with internal_reps rounds each; every
  call receives the previous call's result as its input, so the calls
  form one dependent chain. The wall-clock time between the first and
  last call is divided by the total number of rounds and written in
  nanoseconds (one decimal place below 10 ns, none otherwise).

  Returns the final chained checksum so the caller can compare routines. }
function Benchmark(const name: string; proc: TBenchmarkProc; Z, X: LongWord): LongWord;
const
  internal_reps = 1000;
  outer_reps = 10000;
var
  began: TDateTime;
  ns_per_round: double;
  decimals: Integer;
  pass: cardinal;
begin
  Result := Z;
  began := Now;
  for pass := 1 to outer_reps do
    Result := proc(Result, X, internal_reps);
  { TDateTime differences are in days; convert to seconds, then to
    nanoseconds per individual round. }
  ns_per_round := ((Now - began) * SecsPerDay) / outer_reps / internal_reps * 1e9;
  if ns_per_round < 10 then
    decimals := 1
  else
    decimals := 0;
  writeln(name, ': ', ns_per_round:0:decimals, ' ns/call');
end;

const
  { Arguments shared by all three runs so their checksums are comparable. }
  InitialChecksum = 5000000;
  Addend = 1000;
  { Exit-code bits, one per assembly variant that disagrees with the control. }
  LeaMismatchBit = 1;
  AddMismatchBit = 2;
var
  Checksums: array[0..2] of LongWord;
  ExitStatus: Integer;
begin
  { Index 0 is the Pascal control case; 1 and 2 are the assembly variants. }
  Checksums[0] := Benchmark('   Pascal control case', @Checksum_PAS, InitialChecksum, Addend);
  Checksums[1] := Benchmark(' Using LEA instruction', @Checksum_LEA, InitialChecksum, Addend);
  Checksums[2] := Benchmark('Using ADD instructions', @Checksum_ADD, InitialChecksum, Addend);

  ExitStatus := 0;

  { Each assembly variant must reproduce the control checksum exactly. }
  if (Checksums[1] <> Checksums[0]) then
    begin
      WriteLn('ERROR: Checksum_LEA doesn''t match control case');
      ExitStatus := ExitStatus or LeaMismatchBit;
    end;
  if (Checksums[2] <> Checksums[0]) then
    begin
      WriteLn('ERROR: Checksum_ADD doesn''t match control case');
      ExitStatus := ExitStatus or AddMismatchBit;
    end;

  { Only terminate through Halt when something actually failed. }
  if ExitStatus <> 0 then
    Halt(ExitStatus);
end.


More information about the fpc-devel mailing list