[fpc-devel] Attn: J. Gareth // 3.3.1 opt = slower // Fwd: [Lazarus] Faster than popcnt
Martin Frb
lazarus at mfriebe.de
Mon Jan 3 12:54:38 CET 2022
Hi Gareth,
not sure if this is of interest to you, but I see you do a lot on the
optimizer....
While testing the attached, I found that one of the functions was
notable slower when compiled with 3.3.1 (compared to 3.2.3).
So maybe something you are interested in looking at?
The Code in "Utf8LengthFash" (fst) went from around 600ms to 700ms.
3.3.1 from Dec 10th
3.2.3 from Dec 9th
Core I7 8700K
-O4 -Cpcoreavx2
fpc 3.2.3 / fpc 3.3.1
fst 594 fst 688
fst 578 fst 703
fst 578 fst 687
fst 562 fst 688
pop 485 pop 485
pop 500 pop 500
pop 500 pop 484
pop 484 pop 500
add 594 add 422
add 578 add 438
add 578 add 437
add 594 add 453
-------------- next part --------------
//
// (C) 2021 Martin Friebe and Marco van de Voort.
// attempt to accelerate utf8lengthfast which is a length(s) in utf8 codepoints without integrity checking
//
// 4 versions.
// - Original,
// - with popcount and
// - the "add" variant that accumulates 127 iterations of ptrints and only adds
// the intermeidates outside that loop
// - a SSE2 version loosely inspired by the add variant combined with
// the core of an existing (branchless) binarization routine for the main loop.
{$mode objfpc}{$H+}
{$asmmode intel}
{$coperators on}
{define asmdebug}
uses SysUtils,StrUtils;
const
mask3 : array[0..15] of byte = ( $C0,$C0,$C0,$C0,
$C0,$C0,$C0,$C0,
$C0,$C0,$C0,$C0,
$C0,$C0,$C0,$C0);
mask4 : array[0..15] of byte = ( $80,$80,$80,$80,
$80,$80,$80,$80,
$80,$80,$80,$80,
$80,$80,$80,$80);
mask2 : array[0..15] of byte = ( $1,$1,$1,$1,
$1,$1,$1,$1,
$1,$1,$1,$1,
$1,$1,$1,$1);
// Integer arguments are passed in registers RCX, RDX, R8, and R9.
// Floating point arguments are passed in XMM0L, XMM1L, XMM2L, and XMM3L.
// volatile: RAX, RCX, RDX, R8, R9, R10, R11
// nonvolatile RBX, RBP, RDI, RSI, RSP, R12, R13, R14, and R15 are considered nonvolatile
// volatile xmm0-xmm3 (params) en xmm4,5
// https://msdn.microsoft.com/en-us/library/ms235286.aspx
{$ifdef asmdebug}
function asmutf8length(const s : pchar;len:integer,res:pbyte):int64;
{$else}
function asmutf8length(const s : pchar;len:integer):int64;{$ifndef Windows}assembler; nostackframe;{$endif}
{$endif}
{$ifdef Windows}
begin
{$endif}
asm
// tuning for short strings:
// ------
{$ifndef Windows}
// we can't use [s] as an alias for the pointer parameter, because the non assembler procedure on Windows
// changes that into a stack reference. FPC doesn't support non volatile frame management for assembler procs like Delphi does.
mov rcx,s // rdi
mov edx,len // rsi
{$endif}
test rax,rax
je @theend
cmp rdx,128 // threshold between long and short.
jl @restbytes
mov rax,rdx
mov r10,rcx
and r10,15
mov r9,16
sub r9,r10
and r9,15
test r9,r9
je @nopreloop
sub rdx,r9
@preloop: // roughly 2 cycles per iteration on ivy bridge
movzx r11d, byte [rcx] // unaligned bytes after sse loop
mov r10,r11
shr r10,7
not r11
shr r11,6
and r10,r11
sub rax,r10
inc rcx
dec r9
jne @preloop
@nopreloop:
mov r9,rdx
and r9,15
shr rdx,4
pxor xmm5,xmm5 // always zero
pxor xmm6,xmm6 // dword counts
// using broadcast etc raises requirements? -> use constant loads.
movdqu xmm1,[rip+mask3]
movdqu xmm2,[rip+mask4]
movdqu xmm3,[rip+mask2]
test rdx,rdx
je @restbytes
@outer:
mov r10,127 // max iterations per inner loop
cmp r10,rdx // more or less left?
jl @last // more
mov r10,rdx // less
@last:
sub rdx,r10 // iterations left - iterations to do
pxor xmm4,xmm4
// process 127 iterations (limit of signed int8)
@inner: // +/- 2.2 cycles per iteration for 16 bytes on ivy bridge
movdqu xmm0, [rcx]
pand xmm0,xmm1 // mask out top 2 bits
pcmpeqb xmm0,xmm2 // compare with $80.
pand xmm0,xmm3 // change to $1 per byte.
paddb xmm4,xmm0 // add to cumulative
add rcx,16
dec r10
jne @inner
// SSSE3 vertical adds might help this, but increase CPU reqs.
movdqa xmm0,xmm4
PUNPCKLBW xmm0,xmm5 // zero extend to words
PUNPCKHBW xmm4,xmm5
paddw xmm0,xmm4 // add, now 8 16-bit words.
movdqa xmm4,xmm0
PUNPCKLWD xmm0,xmm5 // zero extend to dwords
paddd xmm6,xmm0
PUNPCKHWD xmm4,xmm5
paddd xmm6,xmm4 // add both L and H to cumulative 4x dword xmm6 reg
test rdx,rdx
jne @outer
MOVHLPS xmm4,xmm6 // move high 8 bytes to low (no float->int penalty for move only?)
paddd xmm6,xmm4 // add both 2*dwords (high doesn't matter)
pshufd xmm4,xmm6,1 // mov 2nd dword in xmm6 to first in xmm4
paddd xmm6,xmm4 // add
movd edx,xmm6 // to int alu reg
sub rax,rdx // subtract from length in bytes.
@restbytes:
test r9,r9
je @theend // Done!
@restloop:
movzx edx, byte [rcx] // unaligned bytes after sse loop
mov r10,rdx
shr r10,7
not rdx
shr rdx,6
and r10,rdx
sub rax,r10
inc rcx
dec r9
jne @restloop
@theend:
{$ifdef Windows}
end['xmm6']; // volatile registers used.
{$endif}
end;
function countmask(nx:int64):integer;
begin
nx := (nx and $00FF00FF00FF00FF) + ((nx >> 8) and $00FF00FF00FF00FF);
nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
result := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
end;
function UTF8LengthFast(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
ONEMASK =$01010101;
EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
ONEMASK =$0101010101010101;
EIGHTYMASK=$8080808080808080;
{$endif}
var
pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x refers to 32 or 64 bits
pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final loops
ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
nx: PtrInt; // values processed in block loop
i,cnt,e: PtrInt;
begin
Result := 0;
e := ix+ByteCount; // End marker
// Handle any initial misaligned bytes.
cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
if cnt>ByteCount then
cnt := ByteCount;
for i := 1 to cnt do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
// Handle complete blocks
for i := 1 to (ByteCount-cnt) div sizeof(PtrInt) do
begin
// Count bytes which are NOT the first byte of a character.
nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
{$push}{$overflowchecks off} // "nx * ONEMASK" causes an arithmetic overflow.
Result += (nx * ONEMASK) >> ((sizeof(PtrInt) - 1) * 8);
{$pop}
inc(pnx);
end;
// Take care of any left-over bytes.
while ix<e do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
Result := ByteCount - Result;
end;
function UTF8LengthPop(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
ONEMASK =$01010101;
EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
ONEMASK =$0101010101010101;
EIGHTYMASK=$8080808080808080;
{$endif}
var
pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x refers to 32 or 64 bits
pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final loops
ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
nx: PtrInt; // values processed in block loop
i,cnt,e: PtrInt;
begin
Result := 0;
e := ix+ByteCount; // End marker
// Handle any initial misaligned bytes.
cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
if cnt>ByteCount then
cnt := ByteCount;
for i := 1 to cnt do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
// Handle complete blocks
for i := 1 to (ByteCount-cnt) div sizeof(PtrInt) do
begin
// Count bytes which are NOT the first byte of a character.
nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
{$push}{$overflowchecks off} // "nx * ONEMASK" causes an arithmetic overflow.
//Result += (nx * ONEMASK) >> ((sizeof(PtrInt) - 1) * 8);
Result += PopCnt(qword(nx));
{$pop}
inc(pnx);
end;
// Take care of any left-over bytes.
while ix<e do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
Result := ByteCount - Result;
end;
function UTF8LengthAdd(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
ONEMASK =$01010101;
EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
ONEMASK =$0101010101010101;
EIGHTYMASK=$8080808080808080;
{$endif}
var
pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x refers to 32 or 64 bits
pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final loops
ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
nx: PtrInt; // values processed in block loop
i,j,cnt,e, bc: PtrInt;
begin
Result := 0;
e := ix+ByteCount; // End marker
// Handle any initial misaligned bytes.
cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
if cnt>ByteCount then
cnt := ByteCount;
for i := 1 to cnt do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
// Handle complete blocks
bc := (ByteCount-cnt) div sizeof(PtrInt);
for j := 1 to bc >> 7 do begin
nx := 0;
for i := 0 to 127 do
begin
// Count bytes which are NOT the first byte of a character.
nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
inc(pnx);
end;
nx := (nx and $00FF00FF00FF00FF) + ((nx >> 8) and $00FF00FF00FF00FF);
nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
Result := Result + nx;
end;
if (bc and 127) > 0 then begin
nx := 0;
for i := 1 to bc and 127 do
begin
// Count bytes which are NOT the first byte of a character.
nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
inc(pnx);
end;
nx := (nx and $00FF00FF00FF00FF) + ((nx >> 8) and $00FF00FF00FF00FF);
nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
Result := Result + nx;
end;
// Take care of any left-over bytes.
while ix<e do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
Result := ByteCount - Result;
end;
// one of each pattern.
const pattern : array[0..3] of char = (chr(%11001001),chr(%10001001),
chr(%00001001),chr(%01001001));
function pseudorandomutf8string(len:integer;var cnt:integer):string;
// random string but keep a count of bytes with high value %10
var lcnt:integer;
i,j:integer;
begin
setlength(result,len);
lcnt:=0;
for i:=1 to length(result) do
begin
j:=random(4);
//j:=i and 3;
if j=1 then inc(lcnt);
result[i]:=pattern[j];
end;
cnt:=length(result)-lcnt;
end;
var r : array[0..10000] of byte; // FPC "registers" dialog is poor, we use this for writeln like dumping
procedure testasmutf8length;
const testlen = 64*100;
var s : string;
cnt : integer;
rx : int64;
begin
randomize;
s:=pseudorandomutf8string(testlen+Random(50),cnt);
rx:=asmutf8length(pchar(s),length(s){$ifdef asmdebug}, at r[0]{$endif});
writeln(inttohex(cnt,2),' = ',inttohex(rx,2),' ',inttohex(length(s)-cnt,2),' = ',inttohex(length(s)-rx,2)); // hex because most register dumps are easier in hex.
{$ifdef asmdebug}
for i:=0 to 6 do
begin
write(i:2,' ');
for j:=0 to 3 do
write(inttohex(pdword(@r[i*16+j*4])^,8), ' ');
writeln;
end;
{$endif}
end;
var
a: ansistring;
t: QWord;
i, j, ii: Integer;
begin
{$ifdef asmdebug}
testasmutf8length;
{$else}
a := 'اربك تكست هو اول موقع يسمح لزواره الكرام بتحويل الكتابة العربي الى كتابة مفهومة من قبل اغلب برامج التصميم مثل الفوتوشوب و';
a := a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
a := a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
a := a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
a := a+a+A+a+a+A+a+a;
writeln(Length(a));
writeln('fst:',UTF8LengthFast(@a[1], Length(a)));
writeln('pop:',UTF8LengthPop(@a[1], Length(a)));
writeln('add:',UTF8LengthAdd(@a[1], Length(a)));
writeln('asm:',asmUTF8Length(@a[1], Length(a){$ifdef asmdebug}, at r[0]{$endif}));
WriteLn();
writeln(Length(a) div 8);
WriteLn();
for ii := 0 to 1 do begin
for i := 0 to 3 do begin
t := GetTickCount64;
for j := 0 to 19 do
UTF8LengthFast(@a[1], Length(a));
t := GetTickCount64 - t;
writeln('fst ',t);
end;
for i := 0 to 3 do begin
t := GetTickCount64;
for j := 0 to 19 do
UTF8LengthPop(@a[1], Length(a));
t := GetTickCount64 - t;
writeln('pop ',t);
end;
for i := 0 to 3 do begin
t := GetTickCount64;
for j := 0 to 19 do
UTF8LengthAdd(@a[1], Length(a));
t := GetTickCount64 - t;
writeln('add ',t);
end;
for i := 0 to 3 do begin
t := GetTickCount64;
for j := 0 to 19 do
asmUTF8Length(@a[1], {$ifdef asmdebug}@r[0],{$endif} Length(a));
t := GetTickCount64 - t;
writeln('asm ',t);
end;
end;
{$endif}
{$ifndef FPC}
if debughook<>nil then // runtime debugger detection
{$endif}
readln;
end.
-------------- next part --------------
--
_______________________________________________
lazarus mailing list
lazarus at lists.lazarus-ide.org
https://lists.lazarus-ide.org/listinfo/lazarus
More information about the fpc-devel
mailing list