[fpc-pascal] Re: fast text processing

S. Fisher expandafter at yahoo.com
Thu Nov 1 19:34:32 CET 2007


--- L <L at z505.com> wrote:
> > > No more strlen:
> > > http://www.hu.freepascal.org/fpcircbot/cgipastebin?msgid=1432
> > 
> 
> This doesn't work if you have spaces in front of the < tags >
> 
>   <sometag>
>       <sometag>
> 
> I'm not sure if the Perl one fails too though.
> I don't have perl installed and can't test it ;-)
> 
> A real parser doesn't care  about whitespace in front.
> And will be a bit slower.. because of that check.

{$MODE OBJFPC} {$H+}

uses sysutils, strings, contnrs;


const
  chars : set of char = ['a'..'z','0'..'9'];

var
  f: text;
  line : ansistring;
  p, pword : pchar;
  saved: char;
  wc : longint;
  counting, good : boolean;
  unique: TFPStringHashTable;
  textbuf: array[1..4096] of byte;
  when : tDateTime;

function do_tag( var s: ansistring; var p: pchar):boolean;
var
  pword: pchar;
begin
  pword := p;
  while p^ <> '>' do
    inc(p);
  p^ := #0;
  result := ('<title'=pword) or ('<text'=pword);
end;

  
begin
 when := time;

  assign(f, 'Koleksi.dat');
  reset(f);
  SetTextBuf(f, textbuf, sizeof(textbuf));
  wc := 0;  counting := false;
  unique := TFPStringHashTable.Create;
  while not eof(f) do
  begin
    readln(f, line );
    if '' = line then continue;
    line := lowercase( line );
    p := pchar( line );
    repeat
      // Skip junk.
      while (p^ <> #0) and (not (p^ in chars)) do
      begin
        if '<' = p^ then
          counting := do_tag( line, p );
        inc(p);
      end;
      // Build word.
      pword := p;
      good := true;
      while p^ in chars do
      begin
        if not (p^ in ['a'..'z']) then good := false;
        inc(p);
      end;
      if counting and good then
        if pword <> p then
        begin
          saved := p^;
          p^ := #0;
          inc( wc );
          if unique.Find( pword) = nil then
            unique.Add( pword,'');
          p^ := saved;
        end
    until #0 = p^;
  end;

  close(f);
  writeln( ((time-when)*secsPerDay):0:3 );
  WriteLn('Word count:',wc, #10'Unique word count:', unique.Count);
end.
{
Word count: 126944
Unique word count: 11793
}


__________________________________________________
Do You Yahoo!?
Tired of spam?  Yahoo! Mail has the best spam protection around 
http://mail.yahoo.com 



More information about the fpc-pascal mailing list