[fpc-devel] LEA instruction speed

J. Gareth Moreton gareth at moreton-family.com
Tue Oct 10 11:13:12 CEST 2023


Thanks Tomas,

Nothing is broken, but the timing measurement isn't precise enough.

Normally I have a much higher iteration count (e.g. 1,000,000), but I 
had reduced it to 10,000 because the higher count, coupled with the 
1,000 iterations in the subroutines themselves, would have led to 
1,000,000,000 passes and hence would take in the region of five to ten 
minutes to complete on a 16 MHz 386, for example.  Rika's suggestion of 
running as many iterations as needed until, say, 5 seconds have elapsed 
would help, but the timing measurements would cause a lot of latency 
and would be imprecise on very slow routines.  Still, let's see if 
100,000 gives better results for you.

Kit

On 10/10/2023 09:57, Tomas Hajny wrote:
> On 2023-10-09 20:51, J. Gareth Moreton via fpc-devel wrote:
>
>
> Hi Kit,
>
>> I updated the "blea" test in the merge request so it now displays the
>> processor brand name on x86_64; however, it is not fetched under i386
>> because CPUID was not introduced until later 486 processors. I've
>> attached it to this e-mail if anyone wants to take a look to ensure I
>> haven't broken something.
>
> I don't know what's broken, but the results vary so much on a fast 
> machine that they are unusable for any measurement from my point of 
> view (standard 3.2.2 compiler, compiled with -O4 and running under MS 
> Windows this time). Sometimes the ADD version shows 0.0 ns/call, 
> sometimes the LEA version shows 0.0 ns/call (32-bits) or 0.1 ns/call 
> (64-bits). See the attached results (the CPU is only displayed for the 
> 64-bit compilation, but it's obviously the same CPU).
>
> Tomas
>
>
>>
>> On 09/10/2023 18:01, J. Gareth Moreton via fpc-devel wrote:
>>> Thank you very much!  That processor is built on the Excavator 
>>> architecture and lines up with the flag I put in the merge request 
>>> (i.e. it has the "fast LEA" hint).
>>>
>>> I honestly didn't expect this much testing feedback, so thank you all!
>>>
>>> Gareth aka. Kit
>>>
>>> P.S. I'm tempted to extend the test slightly to actually name the 
>>> CPU automatically.
>>>
>>> On 09/10/2023 15:40, Jean SUZINEAU via fpc-devel wrote:
>>>> My results:
>>>> jean at First-Boss:~/temp$ cat /proc/cpuinfo | grep "model name"
>>>> model name    : AMD A6-7480 Radeon R5, 8 Compute Cores 2C+6G
>>>> jean at First-Boss:~/temp$ /usr/bin/fpc blea.pp
>>>> Free Pascal Compiler version 3.2.2 [2021/07/09] for x86_64
>>>> Copyright (c) 1993-2021 by Florian Klaempfl and others
>>>> Target OS: Linux for x86-64
>>>> Compiling blea.pp
>>>> Linking blea
>>>> 95 lines compiled, 0.2 sec
>>>> jean at First-Boss:~/temp$ ./blea
>>>>    Pascal control case: 5.1 ns/call
>>>>  Using LEA instruction: 0.5 ns/call
>>>> Using ADD instructions: 0.8 ns/call
>>>> jean at First-Boss:~/temp$
>>>>
>>>> _______________________________________________
>>>> fpc-devel maillist  -  fpc-devel at lists.freepascal.org
>>>> https://lists.freepascal.org/cgi-bin/mailman/listinfo/fpc-devel
>>>>
>>> _______________________________________________
>>> fpc-devel maillist  -  fpc-devel at lists.freepascal.org
>>> https://lists.freepascal.org/cgi-bin/mailman/listinfo/fpc-devel
>>>
>> _______________________________________________
>> fpc-devel maillist  -  fpc-devel at lists.freepascal.org
>> https://lists.freepascal.org/cgi-bin/mailman/listinfo/fpc-devel
-------------- next part --------------
{ %CPU=i386,x86_64 }
program blea;

{$IF not defined(CPUX86) and not defined(CPUX86_64)}
  {$FATAL This test program requires an Intel x86 or x64 processor }
{$ENDIF}

{$MODE OBJFPC}
{$ASMMODE Intel}

uses
  SysUtils;
  
type
  { Signature of the checksum routines that are handed to Benchmark. }
  TBenchmarkProc = function(const Input, X, Y: LongWord): LongWord;

var
  { Buffer for the CPU brand string.  The x86_64 FillBrandName stores 48 bytes
    of CPUID output at indices 0..47 and a terminating #0 at index 48. }
  CPUName: array[0..48] of Char;

{$ifdef CPUX86_64}
{ Writes the processor brand string into the global CPUName buffer by
  executing CPUID leaves $80000002 to $80000004.

  Returns True (AL = 1) when the string was written, or False (AL = 0) when
  CPUID reports a highest extended leaf below $80000004, in which case
  CPUName is left untouched. }
function FillBrandName: Boolean; assembler; nostackframe;
asm
  // RBX is preserved around the routine because EBX is read back after CPUID
  PUSH RBX
  // Leaf $80000000: EAX receives the highest extended leaf (per the CPUID spec)
  MOV  EAX, $80000000
  CPUID
  CMP  EAX, $80000004
  JB   @Unavailable
  // R8 holds the address of CPUName for all of the stores below
  LEA  R8,  [RIP + CPUName]
  // Leaf $80000002 -> CPUName bytes 0..15
  MOV  EAX, $80000002
  CPUID
  MOV  [R8], EAX
  MOV  [R8 + 4], EBX
  MOV  [R8 + 8], ECX
  MOV  [R8 + 12], EDX
  // Leaf $80000003 -> CPUName bytes 16..31
  MOV  EAX, $80000003
  CPUID
  MOV  [R8 + 16], EAX
  MOV  [R8 + 20], EBX
  MOV  [R8 + 24], ECX
  MOV  [R8 + 28], EDX
  // Leaf $80000004 -> CPUName bytes 32..47
  MOV  EAX, $80000004
  CPUID
  MOV  [R8 + 32], EAX
  MOV  [R8 + 36], EBX
  MOV  [R8 + 40], ECX
  MOV  [R8 + 44], EDX
  // Force a terminator in the last element of the 49-element buffer
  MOV  BYTE PTR [R8 + 48], 0
  // Result := True
  MOV  AL,  1
  JMP  @ExitBrand
@Unavailable:
  // Result := False
  XOR  AL,  AL
@ExitBrand:
  POP  RBX
end;
{$else CPUX86_64}
{ Fallback used when the CPUX86_64 implementation is not compiled in:
  nothing is written to CPUName and the function always reports False. }
function FillBrandName: Boolean; inline;
begin
  Exit(False);
end;
{$endif CPUX86_64}

{ Plain Pascal version of the checksum loop; the main program compares the
  assembly variants against its result.

  Starting from Input, performs Y passes with a counter running from Y down
  to 1.  Each pass adds X and the constant $87654321 to the running value and
  then XORs it with the current counter.  When Y is 0 no pass is executed and
  Input is returned unchanged.

  Input - initial checksum value.
  X     - value added on every pass.
  Y     - number of passes.

  Returns the running value after the last pass. }
function Checksum_PAS(const Input, X, Y: LongWord): LongWord;
const
  Addend = $87654321;
var
  Pass: LongWord;
begin
  Result := Input;
  { Counts Y, Y-1, ..., 1; the body is skipped entirely when Y = 0 }
  for Pass := Y downto 1 do
    Result := (Result + X + Addend) xor Pass;
end;

{ Assembly variant of the checksum loop that applies the constant and X with
  two separate ADD instructions per pass.

  Each pass computes Input := (Input + $87654321 + X) xor Y and then
  decrements Y; the loop repeats until Y reaches zero, after which Input is
  copied to the function result.

  NOTE: Y is only tested after a pass has already run, so Y = 0 on entry does
  not mean zero passes - DEC would wrap the counter around.  Callers must pass
  Y >= 1 (Benchmark passes 1000).

  NOTE(review): the parameter names are used directly as instruction operands
  and Input/Y are modified in place; presumably they arrive in registers under
  the target calling convention - confirm before porting. }
function Checksum_ADD(const Input, X, Y: LongWord): LongWord; assembler; nostackframe;
asm
@Loop1:
  // Two-step addition: constant first, then X
  ADD Input, $87654321
  ADD Input, X
  XOR Input, Y
  // DEC sets the zero flag that JNZ tests
  DEC Y
  JNZ @Loop1
  MOV Result, Input
end;

{ Assembly variant of the checksum loop that folds the addition of X and the
  constant into a single LEA instruction per pass.

  Each pass computes Input := (Input + X + $87654321) xor Y and then
  decrements Y; the loop repeats until Y reaches zero, after which Input is
  copied to the function result.

  NOTE: Y is only tested after a pass has already run, so Y = 0 on entry does
  not mean zero passes - DEC would wrap the counter around.  Callers must pass
  Y >= 1 (Benchmark passes 1000).

  NOTE(review): the parameter names are used directly as instruction operands
  and Input/Y are modified in place; presumably they arrive in registers under
  the target calling convention - confirm before porting. }
function Checksum_LEA(const Input, X, Y: LongWord): LongWord; assembler; nostackframe;
asm
@Loop2:
  // Base + index + displacement form: one instruction does both additions
  LEA Input, [Input + X + $87654321]
  XOR Input, Y
  // DEC sets the zero flag that JNZ tests
  DEC Y
  JNZ @Loop2
  MOV Result, Input
end;

{ Times repeated calls of proc and prints the average time per inner pass.

  name - label written (followed by ': ') before the measurement starts.
  proc - routine under test.  It is called as proc(Result, X, internal_reps),
         so the output of each call becomes the input of the next one.
  Z    - value fed to the first call.
  X    - value passed unchanged as the second argument of every call.

  The printed figure is the elapsed time divided by
  outer_reps * internal_reps, expressed in nanoseconds.

  Returns the value produced by the last call, so that different
  implementations can be compared against each other. }
function Benchmark(const name: string; proc: TBenchmarkProc; Z, X: LongWord): LongWord;
const
  internal_reps = 1000;   { third argument of every proc call }
  outer_reps = 100000;    { number of proc calls that are timed }
var
  start, elapsed_ms: QWord;
  time: double;
  reps: cardinal;
begin
  Result := Z;
  reps := 0;
  Write(name, ': ');
  { GetTickCount64 is used instead of Now for the interval: Now is local
    wall-clock time, so a DST switch or clock adjustment during the run would
    corrupt (or even negate) the measured duration. }
  start := GetTickCount64;
  repeat
    inc(reps);
    Result := proc(Result, X, internal_reps);
  until (reps >= outer_reps);
  elapsed_ms := GetTickCount64 - start;
  { milliseconds -> nanoseconds, spread over every inner pass }
  time := elapsed_ms * 1e6 / reps / internal_reps;
  { Two decimals for values below 10, none otherwise }
  WriteLn(time:0:(2 * ord(time < 10)), ' ns/call');
end;

var
  { Checksums[0] = Pascal control, [1] = LEA variant, [2] = ADD variant }
  Checksums: array[0..2] of LongWord;
  ExitStatus, Idx: Integer;
begin
  { Print the CPU name, if available, underlined by a row of dashes of the
    same width.  The name buffer is overwritten in the process. }
  if FillBrandName then
    begin
      WriteLn('CPU = ', CpuName);
      Idx := 0;
      while not (CpuName[Idx] = #0) do
        begin
          CpuName[Idx] := '-';
          Idx := Idx + 1;
        end;
      WriteLn('------', CpuName);
    end;

  { Run every variant with identical arguments }
  Checksums[0] := Benchmark('   Pascal control case', @Checksum_PAS, 5000000, 1000);
  Checksums[1] := Benchmark(' Using LEA instruction', @Checksum_LEA, 5000000, 1000);
  Checksums[2] := Benchmark('Using ADD instructions', @Checksum_ADD, 5000000, 1000);

  { Bit 0 flags an LEA mismatch, bit 1 an ADD mismatch }
  ExitStatus := 0;

  if (Checksums[1] <> Checksums[0]) then
    begin
      WriteLn('ERROR: Checksum_LEA doesn''t match control case');
      ExitStatus := ExitStatus or 1;
    end;

  if (Checksums[2] <> Checksums[0]) then
    begin
      WriteLn('ERROR: Checksum_ADD doesn''t match control case');
      ExitStatus := ExitStatus or 2;
    end;

  { A non-zero status becomes the exit code of the process }
  if ExitStatus <> 0 then
    Halt(ExitStatus);
end.


More information about the fpc-devel mailing list