#!/usr/bin/perl -w
# Example 10-2: search_mailfile.pl
# Excerpted from 'Internet Forensics' by Robert Jones
# Published 2005 by O'Reilly Media (ISBN 0-596-10006-X)
my $minLength = 5;
if(@ARGV < 2 or @ARGV > 3) {
die "Usage: $0 <message> <mail file> [<cutoff score>]\n";
}
my $cutoff = -1;
my $mode = 'score';
if(@ARGV == 3) {
$cutoff = $ARGV[2];
$mode = 'select';
}
my %msg0 = ();
my %histogram = ();
open INPUT, "< $ARGV[0]" or die "$0: Unable to open file $ARGV[0]\n";
while(<INPUT>) {
my $block = loadBlock(\%msg0);
}
close INPUT;
open INPUT, "< $ARGV[1]" or die "$0: Unable to open file $ARGV[1]\n";
while(<INPUT>) {
my %msg1 = ();
my $block = loadBlock(\%msg1);
my $score = compareWordSets(\%msg0, \%msg1);
if($mode eq 'score') {
$histogram{$score}++;
} else {
if($score >= $cutoff) {
print "# Score: $score\n";
print "$block\n";
}
}
}
close INPUT;
if($mode eq 'score') {
foreach my $score (sort {$a <=> $b} keys %histogram) {
printf "%-5d %d\n", $score, $histogram{$score};
}
}
sub loadBlock {
my $words = shift;
my $block = '';
my $body = 0;
while(<INPUT>) {
if($body == 0 and /^\s*$/) {
$body = 1;
} elsif($body == 1 and /^From\s/) {
last;
} elsif($body == 1) {
my $line = lc $_;
# fix any quoted-printable encoding
$line =~ s/\=([0-7][0-9a-f])/chr hex $1/ge;
# convert any punctuation to whitespace
$line =~ s/[^a-zA-Z0-9]/ /g;
foreach $word (split /\s+/, $line) {
if(length $word >= $minLength) {
$words->{$word}++;
}
}
}
$block .= $_;
}
$block;
}
sub compareWordSets {
my $msg0 = shift;
my $msg1 = shift;
my $score = 0;
foreach my $word (keys %$msg0) {
if(exists $msg1->{$word}) {
$score++;
}
}
$score;
}
|