[fpc-pascal] Example: regular expressions and "hash-tables"
S. Fisher
expandafter at yahoo.com
Fri Mar 22 09:31:42 CET 2013
--- On Wed, 3/20/13, S. Fisher <expandafter at yahoo.com> wrote:
> The program reads a text file and counts the number of
> unique words,
> and also displays the number of times the most common word
> was found.
>
Added an iterator of sorts for regular expressions. This allows
if re.exec( line ) then begin
si_table_inc( table, re.match[0] );
while re.execNext do
si_table_inc( table, re.match[0] );
end;
to be replaced by
re_each_match( re, line, @count_the_word );
with no loss of speed.
{$mode objfpc}
uses
AvgLvlTree,
regexpr,
classes,
dateutils,
Sysutils;
const
file_to_process = 'Bartlett--Quotations.txt';
type
// string-integer table
t_si_table = TStringToPointerTree;
t_si_item = PStringToPointerItem;
procedure si_table_put( table: t_si_table; k: string; v: longint );
begin
table[ k ] := pointer( v )
end;
function si_table_get( table : t_si_table; key : string ): longint;
begin
result := longint( table[ key ] )
end;
procedure si_table_inc( table: t_si_table; k: string );
begin
table[ k ] := pointer( si_table_get( table, k ) + 1 ) ;
end;
function si_table_key( item : t_si_item ): string;
begin
result := item^.name
end;
function si_table_val( item : t_si_item ): longint;
begin
result := longint( item^.value )
end;
function si_table_new( sensitive : boolean ): t_si_table;
begin
result := t_si_table.create( sensitive )
end;
type
t_regex_proc = procedure( const text : string );
procedure re_each_match( re : TRegExpr;
const text: string;
proc: t_regex_proc ); overload;
begin
if re.exec( text ) then begin
proc( re.match[0] );
while re.execNext do
proc( re.match[0] );
end;
end;
procedure re_each_match( const re_str, text: string;
proc: t_regex_proc ); overload;
var
re : TRegExpr;
begin
re := TRegExpr.Create;
re.Expression := re_str;
re_each_match( re, text, proc );
re.free
end;
var
table : t_si_table ;
table_item : t_si_item ;
line, word : string;
re : TRegExpr;
max_count, count : int32;
lines : TStringList;
time_1 : TDateTime;
procedure count_the_word( const s : string );
begin
si_table_inc( table, s )
end;
begin
time_1 := time;
re := TRegExpr.Create;
re.Expression := '(?i)[a-z]+' ; // case-insensitive
table := si_table_new( true ) ; // true to make it case-sensitive.
lines := TStringList.Create;
lines.LoadFromFile( file_to_process );
for line in lines do
re_each_match( re, line, @count_the_word );
lines.free;
writeln( 'Unique word count: ', table.count );
max_count := 0;
for table_item in table do begin
count := si_table_val( table_item );
if count > max_count then begin
word := si_table_key( table_item );
max_count := count
end;
end;
writeln( 'Commonest word: ', word, ' (', max_count, ')' );
writeln( 'Elapsed time: ',
milliSecondsBetween( time_1, time ),
' milliseconds.' );
table.free;
re.free
end.
More information about the fpc-pascal
mailing list