[fpc-pascal] Faster regex-dna

S. Fisher expandafter at yahoo.com
Wed Nov 7 18:07:55 CET 2007


Instead of looking for 'B' and replacing with '(c|g|t)'),
then looking for 'D' and replacing with '(a|g|t)', etc.,
the program now looks for '[BDH...Y]' and replaces with
the corresponding string.

{ The Computer Language Benchmarks Game
  http://shootout.alioth.debian.org

  contributed by Steve Fisher
  modified by Peter Vreman
  modified by Steve Fisher

  compile with
  fpc -O3 regex-dna.pp
}

uses regexpr, strutils;

const
  patterns : array[1..9] of pchar =
    (
      '(agggtaaa)|(tttaccct)',
      '([cgt]gggtaaa)|(tttaccc[acg])',
      '(a[act]ggtaaa)|(tttacc[agt]t)',
      '(ag[act]gtaaa)|(tttac[agt]ct)',
      '(agg[act]taaa)|(ttta[agt]cct)',
      '(aggg[acg]aaa)|(ttt[cgt]ccct)',
      '(agggt[cgt]aa)|(tt[acg]accct)',
      '(agggta[cgt]a)|(t[acg]taccct)',
      '(agggtaa[cgt])|([acg]ttaccct)'
    );
  replacements : array[0..10,0..1] of string[15] =
  (
    ('B', '(c|g|t)'), ('D', '(a|g|t)'), ('H', '(a|c|t)'), ('K', '(g|t)'),
    ('M', '(a|c)'), ('N', '(a|c|g|t)'), ('R', '(a|g)'), ('S', '(c|t)'),
    ('V', '(a|c|g)'), ('W', '(a|t)'), ('Y', '(c|t)')
  );


// Append 2 strings to an ansistring rapidly.  Note: the ansistring's
// length will be increased by a more than sufficient amount.
function append2( var dest: ansistring; len0: longint;
                  s1: pchar; len1: longint;
                  s2: pchar; len2: longint): longint; inline;
const  quantum = 599000;
var  newlength: longint;
begin
  newlength := len0 + len1 + len2;
  // Since setlength() is somewhat costly, we'll do it less
  // often than you would think.
  if length( dest ) < newlength then
    setlength( dest, newlength + quantum );
  move( s1^, dest[len0 + 1], len1 );
  move( s2^, dest[len0 + 1 + len1], len2 );
  exit( newlength );
end;

procedure replace_matches( const str: ansistring;  var dest: ansistring );
var
  engine : tRegexprEngine;
  starti, index, size, truelength, i : longint;
  pstart : pchar;
  target, repl, found: string[255];
begin
  target := '[';
  for i := 0 to high(replacements) do
    target += replacements[i,0];
  target += ']' + #0;
  GenerateRegExprEngine( @target[1], [], engine);
  dest := '';   truelength := 0;
  starti := 1;
  pstart := pchar(str);
  while starti <= length(str) do
    if RegExprPos(engine, pstart, index, size ) then
    begin
      found :=  str[ starti+index ];
      repl := replacements[ pos(found,target)-2, 1 ];
      truelength := append2(
        dest, truelength, @str[starti], index, @repl[1], length(repl) );
      inc(pstart, index+size);
      inc(starti, index+size);
    end
    else
      break;
  DestroyRegExprEngine( engine );
  setlength( dest, truelength );
  dest := dest + Copy( str, starti, length(str)-starti+1);
end;


function count_matches( target: pchar; const str: ansistring ): longint;
var
  engine : tRegexprEngine;
  pstart : pchar;
  starti,
  count, index, size : longint;
begin
  GenerateRegExprEngine( target, [ref_caseinsensitive], engine);
  count := 0;
  pstart := pchar(str);
  starti := 1;
  while starti <= length(str) do
    if RegExprPos(engine, pstart, index, size ) then
    begin
      inc(count);
      inc(pstart,index+size);
      inc(starti,index+size);
    end
    else
      break;
  DestroyRegExprEngine( engine );
  exit(count)
end;


var
  pattern : pchar;
  sequence, new_seq : ansiString;
  line, tmp: string[255];
  i, count, init_length, clean_length : longint;
  inbuf : array[0..64*1024] of char;
begin
  settextbuf(input,inbuf);
  sequence := '';
  init_length := 0;
  while not eof do
  begin
    readln( line );
    init_length += length( line ) + 1;
    if line[1] <> '>' then
      sequence := sequence + line;
  end;
  clean_length := length(sequence);

  for i := low(patterns) to high(patterns) do
  begin
    pattern := patterns[i];
    count := count_matches( pattern, sequence );
    tmp := delChars( delChars(pattern,'('), ')' );
    writeln( tmp, ' ', count);
  end;


  //  Replace.
  replace_matches(sequence,new_seq);


  writeln;
  writeln( init_length );
  writeln( clean_length );
  writeln( length(new_seq) );
end.


__________________________________________________
Do You Yahoo!?
Tired of spam?  Yahoo! Mail has the best spam protection around 
http://mail.yahoo.com 



More information about the fpc-pascal mailing list