build_probabilities.pl
#************************************************************
=head1 NAME
build_probabilities.pl - Turn two token files into a probability file
=head1 DESCRIPTION
The following options are supported
=head2 Bad (SPAM) Token File (-b or --bad)
Specify the token file containing the bad tokens.
In other words, the tokens from the SPAM messages.
example: perl -w build_probabilities.pl -p prob.dat -b bad.dat -g good.dat
=head2 Good Token File (-g or --good)
Specify the token file containing the good tokens.
In other words, the tokens from the non-SPAM messages.
example: perl -w build_probabilities.pl -p prob.dat -b bad.dat -g good.dat
=head2 Help (-h or -?)
Print usage instructions.
example: perl -w build_probabilities.pl -h
=head2 Log File Name (-l or --log)
If a logfile is specified, then this is used as the logfile name.
=head2 Log Configuration Files (--log_cfg)
You can create a configuration file for your logger and then configure
your log object by simply telling it to read the specified configuration file.
To create an initial configuration file, write a perl script that
creates a logger, configures the logger, and then calls the write_to_file('log_cfg.dat')
method.
This provides complete control over how the logger is configured.
You can set screen and file output levels, for example.
example: perl -w build_probabilities.pl -p prob.dat -b bad.dat -g good.dat --log_cfg ~andy/logs/default_log.dat
=head2 Log File Directory (--log_dir)
This allows you to specify the directory in which the log files are saved.
example: perl -w build_probabilities.pl -p prob.dat -b bad.dat -g good.dat --log_dir ~andy/logs
=head2 Token file (-p or --prob)
This provides a method of specifying the name of the output probability token data file.
example: perl -w build_probabilities.pl -p prob.dat -b bad.dat -g good.dat
=cut
#************************************************************
use Carp;
use IO::File;
use File::Basename;
use strict;
use Getopt::Long;
use Pitonyak::SmallLogger;
use Pitonyak::SafeGlob qw(glob_spec_from_path);
use Pitonyak::BayesianTokenCounter;
# Print program usage
# Print the command-line usage summary to STDERR.
#
# Arguments: an optional program name to show in the message. When the
#            sub is called with no arguments, $0 is used instead.
# Returns:   the result of the final print statement.
#
# The synopsis lists only the switches that the GetOptions call in this
# script actually defines; -h is the sole argument-less flag.
sub usage {
    my $name = @_ ? $_[0] : $0;

    # The heredoc terminator must stay in column 0.
    print STDERR <<"EOF";
Usage: $name [-h] [-l file] -b bad_tokens_file -g good_tokens_file -p file [--log_cfg file] [--log_dir path]
Build the probability file
-b, --bad=FILE : file containing the bad tokens
-g, --good=FILE : file containing the good tokens
-h, --help : print this help message
-l, --log=FILE : base name for the log file
--log_cfg=FILE : log configuration file
--log_dir=PATH : path where the logs should be saved
-p, --prob=FILE : output token probability file
example: $name -g good_files.dat -b bad_files.dat -p probability.dat
EOF
}
#************************************************************
#** **
#** Input: bad and good token files (-b, -g) **
#** output probability file name (-p) **
#** **
#************************************************************
# Main program: parse the command line, configure the logger, read the
# good and bad token files, and write the probability file.

# Only the directory part of $0 is needed; it is the initial log directory.
my ( undef, $program_path ) = fileparse($0);

# Option values. An empty string means the option was not supplied.
my $help            = 0;
my $logfile         = '';
my $bad_token_file  = '';
my $good_token_file = '';
my $outfile         = '';
my $log_cfg         = '';
my $log_dir         = '';

Getopt::Long::Configure("bundling");
my $goodOptions = GetOptions(
    "bad|b=s"   => \$bad_token_file,
    "good|g=s"  => \$good_token_file,
    "help|?|h"  => \$help,
    "log|l=s"   => \$logfile,
    "log_cfg=s" => \$log_cfg,
    "log_dir=s" => \$log_dir,
    "prob|p=s"  => \$outfile,
);

# An explicit request for help is not an error.
if ($help) {
    usage();
    exit 0;
}

# GetOptions returns false when the command line could not be parsed.
# All three token/probability file names are mandatory. Either problem
# is a usage error and is reported with a non-zero exit status.
if (  !$goodOptions
    || $bad_token_file eq ''
    || $good_token_file eq ''
    || $outfile eq '' )
{
    usage();
    exit 1;
}

# Configure the logger. The configuration file is read after the
# built-in settings, and --log_dir / --log are applied last
# (presumably so later calls take precedence -- confirm against
# Pitonyak::SmallLogger).
my $log = Pitonyak::SmallLogger->new;
$log->log_name_date('');
$log->message_loc_format('(sub):(line):');
$log->open_append(1);
$log->log_path($program_path);
$log->read_from_file($log_cfg)    if $log_cfg ne '';
$log->log_path($log_dir)          if $log_dir ne '';
$log->log_primary_name($logfile)  if $logfile ne '';

# Read each token set from its own file. Previously the good tokens
# were also read from the bad token file, so -g was ignored.
my $good_tokens =
  Pitonyak::BayesianTokenCounter::read_from_file($good_token_file);
defined $good_tokens
  or die "Unable to read good token file '$good_token_file'\n";

my $bad_tokens =
  Pitonyak::BayesianTokenCounter::read_from_file($bad_token_file);
defined $bad_tokens
  or die "Unable to read bad token file '$bad_token_file'\n";

# Attach the logger to every counter, then build and save the result.
my $token_list = Pitonyak::BayesianTokenCounter->new;
$token_list->set_log($log);
$bad_tokens->set_log($log);
$good_tokens->set_log($log);
$token_list->build_probabilities( $good_tokens, $bad_tokens );
$token_list->write_to_file($outfile);
#************************************************************
=pod
=head1 COPYRIGHT
Copyright 1998-2002, Andrew Pitonyak (perlboy@pitonyak.org)
This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
=head1 Modification History
=head2 September 10, 2002
Version 1.00 First release
=cut
#************************************************************