<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body bgcolor="#FFFFFF" text="#000000">
    <p><br>
    </p>
    <br>
    <div class="moz-cite-prefix">On 14/05/2018 04:30, David Pethes
      wrote:<br>
    </div>
    <blockquote type="cite"
      cite="mid:39dbca64-290e-580e-adec-f31d0a84fd67@satd.sk">
      <pre wrap="">Hi,
I would welcome inlining of (simple) asm routines. </pre>
      <br>
    </blockquote>
    I do not know what you consider to be the existing obstacles to
    inlining assembler routines. What I do know is that in the attached
    program, inlining does work. It summarises my (current)
    understanding of how to measure time with nanosecond reliability.<br>
    (Asking for the time via the Linux function "if
    clock_gettime(CLOCK_MONOTONIC, @ts)=0 then" does indeed return
    nanoseconds, but the call takes some 270 ns (or about 1000 clock ticks) to
    execute and thus cannot provide nanosecond reliability.)<br>
    However, repeated measurements do not produce the same output, and
    therefore my little program does not yet have the reliability I want.
    Statistical processing improves the situation somewhat, but
    not as much as I want.<br>
    <br>
    What I can say about inlining assembler routines is this: if the
    variables into which registers are to be saved are on the stack,
    the routines can be inlined. Never mind the hints in Lazarus' message pane.
    Take the following function as an example:<br>
    <font size="-1"><i>function GetProcessorUsed: longint;    inline;</i><i><br>
      </i><i>var</i><i><br>
      </i><i>  ProcUsed: longint;</i><i><br>
      </i><i>begin</i><i><br>
      </i><i>  asm</i><i><br>
      </i><i>    CPUID</i><i><br>
      </i><i>    .byte 0x0F, 0x01, 0xF9      // read the Time-Stamp
        Counter rdtscp (as op-code format),</i><i><br>
      </i><i>    movl %ecx, ProcUsed      // This is the processor on
        which measurements take place. Measurements on other processors
        are discarded.</i><i><br>
      </i><i>  end  ['eax','ebx','ecx','edx'];</i><i><br>
      </i><i>  GetProcessorUsed:=ProcUsed;</i><i><br>
      </i><i>end;</i></font><br>
    Because <font size="-1"><i>ProcUsed</i></font> is on the stack, I
    can move %ecx into it. But I cannot get %ecx directly into <i><font
        size="-1">GetProcessorUsed</font></i>. That requires a separate
    line of code. <br>
    <br>
    wolf<br>
    <br>
    Here is the full code, as promised. If anybody has a suggestion on
    how to improve it, please let me know, in a separate thread.<br>
    <br>
    <i><font size="-1">program Speed_Test;<br>
        {$ASMMODE att}<br>
        <br>
        uses sysutils, Linux, math;<br>
        type<br>
          // One occupied histogram bucket, as built by AccumulateValidMeasurements.<br>
          TtscCount = record<br>
              Group: longint;     // index into Range, i.e. the measured Stop-Start tick difference<br>
              Count: longint;     // number of measurements that fell into this bucket<br>
              CumFreq: Int64;     // running total of Count up to and including this bucket<br>
              end;<br>
        type<br>
          // One point of the normalised cumulative frequency curve (see EvaluateMeasurements).<br>
          TCumFreq = record<br>
              Group: longint;     // copied from TtscCount.Group<br>
              CumFreq: real;      // TtscCount.CumFreq divided by ValidMeasurements<br>
              end;<br>
          TCumFrequency= array of TCumFreq;<br>
          // Record handed to clock_gettime in MeasureCode.<br>
          // NOTE(review): presumably mirrors the 64-bit Linux timespec layout; confirm on other targets.<br>
          TTimeSpec = record<br>
            tv_sec: int64;  //time_t;    //Seconds<br>
            tv_nsec: int64; //clong;     //Nanoseconds<br>
          end;<br>
        var<br>
          TscCount: array of TtscCount;      // occupied buckets of Range, filled by AccumulateValidMeasurements<br>
          Measured: TCumFrequency;           // TscCount normalised by ValidMeasurements, filled by EvaluateMeasurements<br>
          MeasurementsToDo: int64=1000000;   // number of iterations of the timing loop in MeasureCode<br>
          ProcessorUsed: LongInt;            // value returned by GetProcessorUsed before the timing loop starts<br>
          Range: array[0..9999] of longint;  // histogram indexed by Stop-Start; the last element collects everything larger<br>
          ValidMeasurements: Int64;          // sum of the histogram counts, set by AccumulateValidMeasurements<br>
        <br>
        function Get_ClockFreq(CPU: Char): real;<br>
        {Since there is no way I can find to extract actual clock
        frequency, I read it from /proc/cpuinfo }<br>
        var<br>
          FileHandle: LongInt;<br>
          i: integer;<br>
          Data: ansistring;<br>
          rc:real;<br>
          NumRead: int64;<br>
          Buffer : packed array[0..4095] of char;<br>
          SourceFile: AnsiString= '/proc/cpuinfo';<br>
        begin<br>
          if not FileExists(SourceFile) then<br>
          begin<br>
            writeln('Error: Input file "',SourceFile,'" has not been
        found');<br>
            halt;<br>
          end;<br>
          FileHandle:=FileOpen('/proc/cpuinfo',fmOpenRead);<br>
          NumRead:=FileRead(FileHandle, Buffer,SizeOf(Buffer));<br>
          Data:=Buffer[0..NumRead];<br>
          i:=0;<br>
          while i<=NumRead do<br>
          begin<br>
            inc(i);<br>
            if CompareText(Data[i..i+8],'Processor')=0 then<br>
            begin<br>
              if char(Data[i+12])=CPU then<br>
              begin<br>
                i:=i+12;<br>
                repeat inc(i); until CompareText(Data[i..i+6],'cpu
        MHz')=0 ;<br>
                try<br>
                  rc:=StrToFloat(Data[i+11..i+18]);<br>
                except<br>
                on E : exception do<br>
                  begin<br>
                    writeln('Data read error: cannot convert
        ',Data[i+11..i+18],' into number');<br>
                    writeln('Program aborted');<br>
                    halt;<br>
                  end;<br>
                end;<br>
                break;<br>
              end;<br>
            end;<br>
          end;<br>
          FileClose(FileHandle);<br>
          Get_ClockFreq:=rc;<br>
        end;<br>
        <br>
        procedure ReadProcessorFrequencyInformationLeaf;  inline;<br>
        var<br>
          CPUID_16H_AX: Word;      // Processor Base Frequency (in MHz)<br>
          CPUID_16H_BX: Word;      // Maximum Frequency (in MHz)<br>
          CPUID_16H_CX: Word;      // Bus (Reference) frequency (in MHz)<br>
          CPUID_16H_DX: Word;      // Reserved = 0<br>
        begin<br>
          CPUID_16H_AX:=0;<br>
          CPUID_16H_BX:=0;<br>
          CPUID_16H_CX:=0;<br>
          asm<br>
            mov $0x16, %eax               // select Processor Frequency
        Information Leaf 0x16<br>
            cpuid                         // access it<br>
            mov %ax, CPUID_16H_AX         // Processor Base Frequency
        (in MHz)<br>
            mov %bx, CPUID_16H_BX         // Maximum Frequency (in MHz)<br>
            mov %cx, CPUID_16H_CX         // Bus (Reference) frequency
        (in MHz)<br>
            mov %dx, CPUID_16H_DX      // Reserved = 0<br>
          end  ['ax','bx','cx','dx'];<br>
        end;<br>
        <br>
        function GetProcessorUsed: longint;    inline;<br>
        { Returns the value that rdtscp leaves in ecx.<br>
          The program treats this value as the number of the processor the code is running on.<br>
          The result is first stored in the local ProcUsed and then copied to the<br>
          function result in a separate Pascal statement. }<br>
        var<br>
          ProcUsed: longint;<br>
        begin<br>
          asm<br>
            CPUID                       // used elsewhere in this program to force serialization<br>
            .byte 0x0F, 0x01, 0xF9      // rdtscp, written as raw op-code bytes: reads the Time-Stamp Counter<br>
            movl %ecx, ProcUsed    // This is the processor on which measurements take place. Measurements on other processors are discarded.<br>
          end  ['eax','ebx','ecx','edx'];<br>
          GetProcessorUsed:=ProcUsed;<br>
        end;<br>
        <br>
        procedure MeasureCode;<br>
        { Fills the global histogram Range with the duration, in Time-Stamp Counter<br>
          ticks, of the statement placed between the two asm blocks.<br>
          Steps: clear Range, run a short warm-up, remember the current processor in<br>
          ProcessorUsed, then repeat the timed section MeasurementsToDo times.<br>
          A measurement is counted only when both rdtscp reads report ProcessorUsed.<br>
          Tick differences of High(Range) or more are counted in the last element of Range.<br>
          NOTE(review): a negative Stop-Start would index Range out of bounds; the code<br>
          presumably relies on the counter never running backwards on one processor - confirm. }<br>
        var<br>
          ts: TTimeSpec;<br>
          MilliSecondTime: extended;<br>
          AX, BX, CX: Word;          // not referenced in this procedure<br>
          Start,Stop,i,k,l: int64;   // Start/Stop: Time Stamp Counter before/after the tested code; i,k: loop counters; l is not referenced<br>
          Hi: int64;                 // not referenced in this procedure<br>
          x:real;                    // not referenced in this procedure<br>
          y: real=2;                 // not referenced in this procedure<br>
          ProcessorUsed_Start, ProcessorUsed_Stop, ProcUsed: longint;   // ecx after each rdtscp; ProcUsed is not referenced<br>
          IA32_TSC_AUX_Base,IA32_TSC_AUX_Core: longint;     // content of IA32_TSC_AUX MSR register = which CPU? (not referenced in this procedure)<br>
          Clock_denominator: Cardinal;     // not referenced in this procedure<br>
          Clock_numerator: Cardinal;       // not referenced in this procedure<br>
          CPUID_15H_ECX, CPUID_15H_EDX: Cardinal;     // reserved = 0 (not referenced in this procedure)<br>
          ProcessorBaseFrequency: Word;    // not referenced in this procedure<br>
        begin<br>
          for i:=0 to High(Range) do Range[i]:=0;<br>
          Start:=0;   Stop:=0;<br>
            for k:=0 to 4 do ReadProcessorFrequencyInformationLeaf;   // this loop is just for warm-up<br>
          ProcessorUsed:=GetProcessorUsed;<br>
          for i:=1 to MeasurementsToDo do<br>
          begin<br>
            Start:=0;   Stop:=0;<br>
            asm<br>
              cpuid                  // force serialization<br>
              .byte 0x0F, 0x01, 0xF9 // rdtscp, written as raw op-code bytes: reads the Time-Stamp Counter<br>
              movl %eax, Start+0     // save least-significant longword<br>
              movl %edx, Start+4     // save most-significant longword<br>
              movl %ecx, ProcessorUsed_Start   // processor reported by this rdtscp<br>
            end  ['eax','ebx','ecx','edx'];<br>
            // insert instruction to be tested below this line<br>
        <br>
        if clock_gettime(CLOCK_MONOTONIC, @ts)=0 then    // return time in milliseconds, rounded to 1 nanosecond<br>
        MilliSecondTime:=RoundTo(1e3*ts.tv_sec+1e-6*ts.tv_nsec,-6);<br>
        <br>
        // insert instruction to be tested above this line<br>
            asm<br>
              .byte 0x0F, 0x01, 0xF9 // rdtscp, written as raw op-code bytes: reads the Time-Stamp Counter<br>
              movl %eax, Stop+0      // save least-significant longword<br>
              movl %edx, Stop+4      // save most-significant longword<br>
              movl %ecx, ProcessorUsed_Stop    // processor reported by this rdtscp<br>
              cpuid                  // executed after the counter has been saved<br>
            end  ['eax','ebx','ecx','edx'];<br>
            if (ProcessorUsed_Start=ProcessorUsed) and
        (ProcessorUsed_Stop=ProcessorUsed) then   // ignore measurements that were not done on ProcessorUsed<br>
              if (Stop-Start<High(Range)) then inc(Range[Stop-Start])
        else inc(Range[High(Range)]);  // count this tick difference in the histogram; the last element collects the overflow<br>
          end;<br>
        end;<br>
        <br>
        function AccumulateValidMeasurements: Int64;<br>
        var<br>
          i: int64;<br>
          Hi: int64;<br>
        begin<br>
          ValidMeasurements:=0;<br>
          for i:=1 to High(Range) do
        ValidMeasurements:=ValidMeasurements+Range[i];<br>
          AccumulateValidMeasurements:=ValidMeasurements;<br>
        <br>
          SetLength(TscCount,1);<br>
          TscCount[0].Group:=0;<br>
          TscCount[0].CumFreq:=Range[0];<br>
          for i:=0 to High(Range)-1 do<br>
          begin<br>
            if Range[i]>0 then<br>
            begin<br>
              Hi:=High(TscCount)+1;<br>
              SetLength(TscCount,Hi+1);<br>
              TscCount[Hi].Group:=i;<br>
              TscCount[Hi].Count:=Range[i];<br>
              TscCount[Hi].CumFreq:=Range[i]+TscCount[Hi-1].CumFreq;<br>
            end;<br>
          end;<br>
        end;<br>
        <br>
        procedure ShowMeasurements;<br>
        var<br>
          i: int64;<br>
        begin<br>
          for i:=1 to High(TscCount) do<br>
          begin<br>
          writeln(TscCount[i].Group,'   ',TscCount[i].Count,'  
        ',TscCount[i].CumFreq,'   ',100*Measured[i].CumFreq:6:3);<br>
          if Measured[i].Cumfreq>0.999 then exit;<br>
          end;<br>
        end;<br>
        <br>
        procedure EvaluateMeasurements;<br>
        var<br>
          i: int64;<br>
        begin<br>
          SetLength(Measured,Length(TscCount));<br>
          Measured[0].Group:=0;<br>
          Measured[0].CumFreq:=TscCount[0].CumFreq/ValidMeasurements;<br>
          for i:=0 to High(TscCount) do<br>
          begin<br>
            Measured[i].Group:=TscCount[i].Group;<br>
            Measured[i].CumFreq:=TscCount[i].Cumfreq/ValidMeasurements;<br>
          end;<br>
        end;<br>
        <br>
        function Limit(CF: TCumFrequency; Frequency: real):real;  // do
        linear interpolation between two points of cumulative frequency
        curve<br>
        var<br>
          i: int64;<br>
          Slope,Intercept: real;<br>
        begin<br>
          i:=0;<br>
          while (CF[i].CumFreq<= Frequency) do inc(i);<br>
         
        Intercept:=(CF[i-1].Group*CF[i].CumFreq-CF[i].Group*CF[i-1].CumFreq)
        /(CF[i].CumFreq-CF[i-1].CumFreq);<br>
         
        Slope:=(CF[i].Group-CF[i-1].Group)/(CF[i].CumFreq-CF[i-1].CumFreq);<br>
          Limit:=Frequency*Slope+Intercept;<br>
        end;<br>
        <br>
        procedure Difference;<br>
        begin<br>
          writeln;<br>
          writeln('Clock ticks used:  ');<br>
          writeln('1% Limit=',Limit(Measured,0.01):6:2);<br>
          writeln('5% Limit=',Limit(Measured,0.05):6:2);<br>
          writeln('20% Limit=',Limit(Measured,0.20):6:2);<br>
          writeln('Median=',Limit(Measured,0.50):6:2);<br>
          writeln('80% Limit=',Limit(Measured,0.80):6:2);<br>
          writeln('95% Limit=',Limit(Measured,0.95):6:2);<br>
          writeln('99% Limit=',Limit(Measured,0.99):6:2);<br>
        end;<br>
        <br>
        begin<br>
         // taskset -c 1;               // taskset allows to change processor, if used from the command line<br>
          // Sequence: measure, total the histogram, print the summary line, normalise, list the buckets, print the percentiles.<br>
          MeasureCode;<br>
          ValidMeasurements:=AccumulateValidMeasurements;      // the function also sets this global itself and builds TscCount<br>
          writeln('Tests done on processor ',ProcessorUsed,', running at
        ',Get_ClockFreq(IntToStr(ProcessorUsed)[1]):7:3,'MHz',' doing
        ',ValidMeasurements,' valid measurements');      // NOTE: only the first digit of the processor number is passed to Get_ClockFreq<br>
          EvaluateMeasurements;<br>
          ShowMeasurements;<br>
          Difference;<br>
        end.</font></i><br>
    <br>
    <blockquote type="cite"
      cite="mid:39dbca64-290e-580e-adec-f31d0a84fd67@satd.sk">
      <pre wrap="">_______________________________________________
fpc-devel maillist  -  <a class="moz-txt-link-abbreviated" href="mailto:fpc-devel@lists.freepascal.org">fpc-devel@lists.freepascal.org</a>
<a class="moz-txt-link-freetext" href="http://lists.freepascal.org/cgi-bin/mailman/listinfo/fpc-devel">http://lists.freepascal.org/cgi-bin/mailman/listinfo/fpc-devel</a>
</pre>
    </blockquote>
    <br>
  </body>
</html>