[fpc-pascal] Re: fast text processing
S. Fisher
expandafter at yahoo.com
Thu Nov 1 19:34:32 CET 2007
--- L <L at z505.com> wrote:
> > > No more strlen:
> > > http://www.hu.freepascal.org/fpcircbot/cgipastebin?msgid=1432
> >
>
> This doesn't work if you have spaces in front of the < tags >
>
> <sometag>
> <sometag>
>
> I'm not sure if the Perl one fails too though.
> I don't have perl installed and can't test it ;-)
>
> A real parser doesn't care about whitespace in front.
> And will be a bit slower.. because of that check.
{$MODE OBJFPC} {$H+}
uses sysutils, strings, contnrs;
const
chars : set of char = ['a'..'z','0'..'9'];
var
f: text;
line : ansistring;
p, pword : pchar;
saved: char;
wc : longint;
counting, good : boolean;
unique: TFPStringHashTable;
textbuf: array[1..4096] of byte;
when : tDateTime;
function do_tag( var s: ansistring; var p: pchar):boolean;
var
pword: pchar;
begin
pword := p;
while p^ <> '>' do
inc(p);
p^ := #0;
result := ('<title'=pword) or ('<text'=pword);
end;
begin
when := time;
assign(f, 'Koleksi.dat');
reset(f);
SetTextBuf(f, textbuf, sizeof(textbuf));
wc := 0; counting := false;
unique := TFPStringHashTable.Create;
while not eof(f) do
begin
readln(f, line );
if '' = line then continue;
line := lowercase( line );
p := pchar( line );
repeat
// Skip junk.
while (p^ <> #0) and (not (p^ in chars)) do
begin
if '<' = p^ then
counting := do_tag( line, p );
inc(p);
end;
// Build word.
pword := p;
good := true;
while p^ in chars do
begin
if not (p^ in ['a'..'z']) then good := false;
inc(p);
end;
if counting and good then
if pword <> p then
begin
saved := p^;
p^ := #0;
inc( wc );
if unique.Find( pword) = nil then
unique.Add( pword,'');
p^ := saved;
end
until #0 = p^;
end;
close(f);
writeln( ((time-when)*secsPerDay):0:3 );
WriteLn('Word count:',wc, #10'Unique word count:', unique.Count);
end.
{
Word count: 126944
Unique word count: 11793
}
__________________________________________________
Do You Yahoo!?
Tired of spam? Yahoo! Mail has the best spam protection around
http://mail.yahoo.com
More information about the fpc-pascal
mailing list