[fpc-pascal] Floating Point Performance on Intel

Wed Mar 30 06:42:50 CEST 2005

On Mar 28, 2005, at 10:19, Raw Magick DOT COM wrote:

>
>      A := 0;
>      B := 0.9;
>      For X := 0 to 10000000 do
>      begin
>           A := A + X;
>           A := A * B;
>      end;

> # Var A located at ebp-4
> # Var B located at ebp-8
> # Var X located at ebp-12
>
> //A and B are set up before here - it's the loop that's interesting
>
> # [44] For X := 0 to 10000000 do
>         movl    $0,-12(%ebp)
>         decl    -12(%ebp)
>         .balign 4
> .L31:
>         incl    -12(%ebp)
> # [46] A := A + X;
>         flds    -4(%ebp)
>         fildl   -12(%ebp)
>         faddp   %st,%st(1)
>         fstps   -4(%ebp)
> # [47] A := A * B;
>         flds    -8(%ebp)
>         fmuls   -4(%ebp)
>         fstps   -4(%ebp)
>         cmpl    $10000000,-12(%ebp)
>         jl      .L31
>

> The above code takes about 210 ms to run on my machine. Below is my
> own assembler, which takes about 100 ms (apologies, it is in a slightly
> different format).
>
> asm
>    mov eax, 0; //Set up loop counter
>    @StartOfLoop:
>    mov dword ptr[x], eax; // Move its value into X ( on stack )
>    FILD dword ptr[x]; //Load into floating point
>    FADD dword ptr[A]; // Add A ( on Stack ) to it
>    FMUL dword ptr[B]; //Multiply by B ( on Stack )
>    FSTP dword ptr[A]; // Pop into A
>    add eax, 1; //Inc loop counter
>    cmp eax, 10000000; // Test Jump condition
>    jl @StartOfLoop;
> end;
>
> My question is: what needs to be done to the compiler to make it
> optimise as well as C compilers? Or perhaps I am missing some compiler
> switches?
>

I expect you are using the x86 family (maybe even the IA-32 architecture).
What about writing your code as:

      A := 0;
      B := 0.9;
      For X := 10000000 downto 0 do
      begin
           A := A + X;
           A := A * B;
      end;

asm
    mov ecx, 10000000; //Set up loop counter
    @StartOfLoop:
    mov dword ptr[x], ecx; // Move its value into X ( on stack )
    FILD dword ptr[x]; //Load into floating point
    FADD dword ptr[A]; // Add A ( on Stack ) to it
    FMUL dword ptr[B]; //Multiply by B ( on Stack )
    FSTP dword ptr[A]; // Pop into A
    loop @StartOfLoop; //Dec loop counter
end;

Maybe
    sub ecx, 1; //Dec loop counter
    jnz @StartOfLoop;
instead of
    loop  @StartOfLoop;
would be even better, since the loop instruction is slow on many Intel
processors.

What are your results?

Variable alignment should also have a big influence.
For details see:
IA-32 Intel® Architecture Optimization Reference Manual
ftp://download.intel.com/design/Pentium4/manuals/24896611.pdf

Jan Ruzicka