[fpc-pascal] regex-dna: finally fast enough?
S. Fisher
expandafter at yahoo.com
Sun Nov 11 05:08:22 CET 2007
I don't think so, although it's over twice as fast as the
last incarnation.
One speedup I stole from the Perl program:
instead of counting matches for /foo|bar/, count matches
for /foo/ and for /bar/.
The other speedup is lowercasing the string that is searched
instead of requiring the regex engine to do a case-insensitive
search.
I don't think this should be submitted to the shootout site
unless it will improve Free Pascal's standing; i.e., unless
it is not more than 1.4 times as slow as the Perl program.
{ The Computer Language Benchmarks Game
http://shootout.alioth.debian.org
contributed by Steve Fisher
modified by Peter Vreman
modified by Steve Fisher
}
uses regexpr;
const
patterns : array[1..9] of string[255] =
(
'agggtaaa|tttaccct',
'[cgt]gggtaaa|tttaccc[acg]',
'a[act]ggtaaa|tttacc[agt]t',
'ag[act]gtaaa|tttac[agt]ct',
'agg[act]taaa|ttta[agt]cct',
'aggg[acg]aaa|ttt[cgt]ccct',
'agggt[cgt]aa|tt[acg]accct',
'agggta[cgt]a|t[acg]taccct',
'agggtaa[cgt]|[acg]ttaccct'
);
replacements : array[0..10,0..1] of string[15] =
(
('B', '(c|g|t)'), ('D', '(a|g|t)'), ('H', '(a|c|t)'), ('K', '(g|t)'),
('M', '(a|c)'), ('N', '(a|c|g|t)'), ('R', '(a|g)'), ('S', '(c|t)'),
('V', '(a|c|g)'), ('W', '(a|t)'), ('Y', '(c|t)')
);
// Append 2 strings to an ansistring rapidly. Note: the ansistring's
// length will be increased by a more than sufficient amount.
function append2( var dest: ansistring; len0: longint;
s1: pchar; len1: longint;
s2: pchar; len2: longint): longint; inline;
const quantum = 599000;
var newlength: longint;
begin
newlength := len0 + len1 + len2;
// Since setlength() is somewhat costly, we'll do it less
// often than you would think.
if length( dest ) < newlength then
setlength( dest, newlength + quantum );
move( s1^, dest[len0 + 1], len1 );
move( s2^, dest[len0 + 1 + len1], len2 );
exit( newlength );
end;
procedure replace_matches( const str: ansistring; var dest: ansistring );
var
engine : tRegexprEngine;
starti, index, size, truelength, i : longint;
pstart : pchar;
target, repl: string[255];
begin
target := '[';
for i := 0 to high(replacements) do
target += replacements[i,0];
target += ']' + #0;
GenerateRegExprEngine( @target[1], [], engine);
dest := ''; truelength := 0;
starti := 1;
pstart := pchar(str);
while starti <= length(str) do
if RegExprPos(engine, pstart, index, size ) then
begin
repl := replacements[ pos( (pstart+index)^ , target)-2, 1 ];
truelength := append2(
dest, truelength, pstart, index, @repl[1], length(repl) );
inc(pstart, index+size);
inc(starti, index+size);
end
else
break;
DestroyRegExprEngine( engine );
setlength( dest, truelength );
dest := dest + Copy( str, starti, length(str)-starti+1);
end;
function count_matches_simple( pattern: pchar; const str: ansistring ):
longint;
var
engine : tRegexprEngine;
p_start, p_end : pchar;
count, index, size : longint;
begin
GenerateRegExprEngine( pattern, [], engine);
count := 0;
p_start := pchar(str);
p_end := @str[ length(str) ];
while p_start <= p_end do
if RegExprPos(engine, p_start, index, size ) then
begin
inc(count);
inc(p_start, index+size);
end
else
break;
DestroyRegExprEngine( engine );
exit(count)
end;
function count_matches( pattern: string[255]; const str: ansistring ):
longint;
var
count, p: longint;
begin
pattern += #0;
p := pos( '|', pattern );
pattern[p] := #0;
count := count_matches_simple( @pattern[1], str );
count += count_matches_simple( @pattern[p+1], str );
exit( count )
end;
var
sequence, new_seq, lowered : ansiString;
line: string[255];
i, count, init_length, clean_length : longint;
inbuf : array[0..64*1024] of char;
begin
settextbuf(input,inbuf);
sequence := '';
init_length := 0;
while not eof do
begin
readln( line );
init_length += length( line ) + 1;
if line[1] <> '>' then
sequence := sequence + line;
end;
clean_length := length(sequence);
// Count pattern-matches.
lowered := lowercase( sequence );
for i := low(patterns) to high(patterns) do
begin
count := count_matches( patterns[i], lowered );
writeln( patterns[i], ' ', count);
end;
// Replace.
replace_matches(sequence, new_seq);
writeln;
writeln( init_length );
writeln( clean_length );
writeln( length(new_seq) );
end.
__________________________________________________
Do You Yahoo!?
Tired of spam? Yahoo! Mail has the best spam protection around
http://mail.yahoo.com
More information about the fpc-pascal
mailing list