[fpc-pascal] Example: regular expressions and "hash-tables"

S. Fisher expandafter at yahoo.com
Fri Mar 29 11:11:54 CET 2013


--- On Wed, 3/20/13, S. Fisher <expandafter at yahoo.com> wrote:

> The program reads a text file and counts the number of
> unique words,
> and also displays the number of times the most common word
> was found.
> 

For comparison, here's a C++ program.  Unlike the Pascal version,
it shows the 20 most common words.  Also, it doesn't use a
regular expression engine, since I didn't have one.

An associative array that maps strings to integers was simply
obtained by:

 unordered_map<string,int> w_table;


// c++  -std=c++11 -o map.exe  map.cpp

#include <unordered_map>
#include <iostream>
#include <fstream>
#include <list>
#include <utility>  // for pair
#include <vector>
#include <algorithm>  // for sort
using namespace std;

// Expand range shorthand in a character-set spec, e.g. "a-f" -> "abcdef".
// A '-' that is the first or last character of the spec is kept as a
// literal dash.  Any other '-' is replaced by the characters that lie
// strictly between its two neighbours; the neighbours themselves are
// emitted as ordinary characters when the scan reaches them.
string gen_cset( string s )
{
  string expanded;
  const int end_idx = s.size() - 1;
  for (int pos = 0; pos <= end_idx; ++pos)
  {
    const char cur = s[pos];

    // Ordinary character: copy through unchanged.
    if (cur != '-')
    {
      expanded.push_back( cur );
      continue;
    }

    // Dash at either edge has no range to expand; keep it literally.
    if (pos == 0 || pos == end_idx)
    {
      expanded.push_back( cur );
      continue;
    }

    // Interior dash: fill in everything between the two endpoints.
    for (int code = s[pos-1] + 1; code < s[pos+1]; ++code)
      expanded.push_back( code );
  }
  return expanded;
}

// Namespace-scope constant holding the expansion of the range spec
// "a-zA-Z" as produced by gen_cset (lower-case letters, then upper-case).
const string letters = gen_cset( "a-zA-Z" );

// Split `s` into tokens, where a token is a maximal run of characters
// that all belong to `chars`.  Everything else acts as a separator.
// Tokens are returned in the order they appear; the list is empty when
// `s` contains no character from `chars`.
list<string> scan_string( string s, string chars )
{
  list<string> tokens;

  size_t start = s.find_first_of( chars, 0 );
  while ( start != string::npos )
  {
    // `stop` is one past the end of the token, or npos if the token
    // runs to the end of the string (substr clamps the length then).
    const size_t stop = s.find_first_not_of( chars, start );
    tokens.push_back( s.substr( start, stop - start ) );

    if ( stop == string::npos )
      break;
    start = s.find_first_of( chars, stop );
  }

  return tokens;
}

// A word together with the number of times it was seen.
typedef  pair<string,int>  word_and_count;

// Ordering predicate for sort(): returns true when `a` has a strictly
// larger count than `b`, so a sorted range runs from most to least
// frequent.  Entries with equal counts compare as equivalent.
// Arguments are taken by const reference so that each comparison does
// not copy the two strings held in the pairs.
bool compare_pair( const word_and_count& a, const word_and_count& b )
{
  return ( a.second > b.second );
}


int main() {
  unordered_map<string,int> w_table;

  ifstream f_stream ( "Bartlett--Quotations.txt" );
  string line;
  list<string> word_list;
  if ( f_stream.is_open() )
  { while ( f_stream.good() )
    { getline( f_stream, line);
      word_list = scan_string( line, gen_cset("a-zA-Z" ));
      for ( string x: word_list )
        w_table[ x ] += 1;
    }
    f_stream.close();
  }

  cout << "Number of unique words: " << w_table.size() << endl;
  cout << "Most common words:" << endl;

  vector<word_and_count>  pair_vec;
  for ( auto x: w_table )
    pair_vec.push_back( x );

  sort( pair_vec.begin(), pair_vec.end(), compare_pair  );
  word_and_count pr;
  for ( int i = 0; i<20; i++ )
  { pr = pair_vec[i];
    cout << pr.first << " " << pr.second << endl;
  }

  return 0;
}

// Number of unique words: 8911
// Most common words:
// the 2200
// of 1406
// a 1118
// to 1011
// and 975
// in 740
// is 581
// s 510
// Act 484
// Sc 483
// that 454
// The 403
// I 369
// Line 345
// not 333
// his 323
// with 306
// And 303
// be 277
// i 268




More information about the fpc-pascal mailing list