[fpc-pascal] Example: regular expressions and "hash-tables"
S. Fisher
expandafter at yahoo.com
Fri Mar 29 11:11:54 CET 2013
--- On Wed, 3/20/13, S. Fisher <expandafter at yahoo.com> wrote:
> The program reads a text file and counts the number of
> unique words,
> and also displays the number of times the most common word
> was found.
>
For comparison, here's a C++ program. Unlike the Pascal version,
it shows the 20 most common words. Also, it doesn't use a
regular expression engine, since I didn't have one.
An associative array that maps strings to integers was simply
obtained by:
unordered_map<string,int> w_table;
// c++ -std=c++11 -o map.exe map.cpp
#include <unordered_map>
#include <iostream>
#include <fstream>
#include <list>
#include <utility> // for pair
#include <vector>
#include <algorithm> // for sort
using namespace std;
// Expand range shorthand in a character-set spec, e.g. "a-f" -> "abcdef".
// A '-' that is the first or last character of s is kept as a literal dash.
// Any other '-' is replaced by the characters strictly between its two
// neighbours; the neighbours themselves are copied as ordinary characters.
string gen_cset( string s )
{
    string expanded;
    const int final_idx = static_cast<int>( s.size() ) - 1;
    int pos = 0;
    while ( pos <= final_idx )
    {
        const char current = s[pos];
        const bool at_edge = ( pos == 0 ) || ( pos == final_idx );
        if ( current != '-' || at_edge )
        {
            // Ordinary character, or a dash with no neighbour on one side.
            expanded.push_back( current );
        }
        else
        {
            // Interior dash: emit everything between the two endpoints,
            // exclusive on both sides.
            const int stop = s[pos + 1];
            int code = s[pos - 1] + 1;
            while ( code < stop )
            {
                expanded.push_back( static_cast<char>( code ) );
                ++code;
            }
        }
        ++pos;
    }
    return expanded;
}
// Character set holding the lower-case letters a-z followed by the
// upper-case letters A-Z, expanded from the range spec by gen_cset().
const string letters = gen_cset( "a-zA-Z" );
// Split s into its maximal runs of characters drawn from chars.
// Every character not in chars acts as a separator. The runs are returned
// in order of appearance; the list is empty when s contains none.
list<string> scan_string( string s, string chars )
{
    list<string> tokens;
    size_t start = s.find_first_of( chars, 0 );
    while ( start != string::npos )
    {
        // stop is npos when the run reaches the end of s; substr() then
        // clamps the length, so the tail is taken as-is.
        const size_t stop = s.find_first_not_of( chars, start );
        tokens.push_back( s.substr( start, stop - start ) );
        if ( stop == string::npos )
            break;
        start = s.find_first_of( chars, stop );
    }
    return tokens;
}
// A word paired with the number of times it was seen.
using word_and_count = pair<string,int>;

// Ordering predicate for sort(): returns true when a has a strictly larger
// count than b, so the most frequent words come first. Entries with equal
// counts compare as equivalent.
// The arguments are taken by const reference so that sorting does not copy
// two strings for every comparison.
bool compare_pair( const word_and_count& a, const word_and_count& b )
{
    return a.second > b.second;
}
int main() {
unordered_map<string,int> w_table;
ifstream f_stream ( "Bartlett--Quotations.txt" );
string line;
list<string> word_list;
if ( f_stream.is_open() )
{ while ( f_stream.good() )
{ getline( f_stream, line);
word_list = scan_string( line, gen_cset("a-zA-Z" ));
for ( string x: word_list )
w_table[ x ] += 1;
}
f_stream.close();
}
cout << "Number of unique words: " << w_table.size() << endl;
cout << "Most common words:" << endl;
vector<word_and_count> pair_vec;
for ( auto x: w_table )
pair_vec.push_back( x );
sort( pair_vec.begin(), pair_vec.end(), compare_pair );
word_and_count pr;
for ( int i = 0; i<20; i++ )
{ pr = pair_vec[i];
cout << pr.first << " " << pr.second << endl;
}
return 0;
}
// Number of unique words: 8911
// Most common words:
// the 2200
// of 1406
// a 1118
// to 1011
// and 975
// in 740
// is 581
// s 510
// Act 484
// Sc 483
// that 454
// The 403
// I 369
// Line 345
// not 333
// his 323
// with 306
// And 303
// be 277
// i 268
More information about the fpc-pascal
mailing list