| 
#!/usr/bin/perl -w
# Example 10-2: search_mailfile.pl
# Excerpted from 'Internet Forensics' by Robert Jones
# Published 2005 by O'Reilly Media (ISBN 0-596-10006-X)
my $minLength = 5;
if(@ARGV < 2 or @ARGV > 3) {
   die "Usage: $0 <message> <mail file> [<cutoff score>]\n";
}
my $cutoff = -1;
my $mode = 'score';
if(@ARGV == 3) {
   $cutoff = $ARGV[2];
   $mode = 'select';
}
my %msg0 = ();
my %histogram = ();
open INPUT, "< $ARGV[0]" or die "$0: Unable to open file $ARGV[0]\n";
while(<INPUT>) {
   my $block = loadBlock(\%msg0);
}
close INPUT;
open INPUT, "< $ARGV[1]" or die "$0: Unable to open file $ARGV[1]\n";
while(<INPUT>) {
   my %msg1 = ();
   my $block = loadBlock(\%msg1);
   my $score = compareWordSets(\%msg0, \%msg1);
   if($mode eq 'score') {
      $histogram{$score}++;
   } else {
      if($score >= $cutoff) {
         print "# Score: $score\n";
         print "$block\n";
      } 
   }
}
close INPUT;
if($mode eq 'score') {
   foreach my $score (sort {$a <=> $b} keys %histogram) {
      printf "%-5d  %d\n", $score, $histogram{$score};
   }
}
sub loadBlock {
   my $words = shift;
   my $block = '';
   my $body = 0;
   while(<INPUT>) {
      if($body == 0 and /^\s*$/) {
          $body = 1;
      } elsif($body == 1 and /^From\s/) {
          last;
      } elsif($body == 1) {
          my $line = lc $_;
          # fix any quoted-printable encoding
          $line =~ s/\=([0-7][0-9a-f])/chr hex $1/ge;
          # convert any punctuation to whitespace
          $line =~ s/[^a-zA-Z0-9]/ /g;
          foreach $word (split /\s+/, $line) {
             if(length $word >= $minLength) {
                $words->{$word}++;
             }
          }
      }
      $block .= $_;
   }
   $block;
}
sub compareWordSets {
   my $msg0 = shift;
   my $msg1 = shift;
   my $score = 0;
   foreach my $word (keys %$msg0) {
      if(exists $msg1->{$word}) {
         $score++;
      }
   }
   $score;
}
 |