#!/usr/bin/perl -w
#
# parp -- Perl Anti-spam Replacement for Procmail
#
# Copyright (c) 1999--2000 Adam Spiers <adam@spiers.net>. All rights
# reserved. This program is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
# $Id: parp,v 1.91 2001/05/07 14:33:06 adam Exp $
#

use strict;

my $VERSION = '0.44';

use Carp qw(:DEFAULT confess);
use DB_File;
use Data::Dumper;
use Fcntl qw(:DEFAULT :flock);
use Getopt::Std;
use Mail::Internet;
use Mail::Address;
use Mail::Filter;
use Mail::Folder;
use Mail::Folder::Mbox;

use lib "$ENV{HOME}/.parp";
use MyFilter qw(%CONFIG %RE);

local $SIG{__DIE__} = \&die_handler;

# Process options
my %opts = ();
getopts('edfhmprs:tvw', \%opts);
usage() if $opts{h};
my ($no_dups, $extract_friends, $filter_files, $Mail_Folder, $dry_pipes,
    $do_RBL, $sample, $test_run, $verbose, $wrong_class)
  = @opts{qw/d e f m p r s t v w/};
$extract_friends ||= 0;
$test_run        ||= 0;
$wrong_class     ||= 0;
usage() if ($extract_friends + $test_run + $wrong_class) > 1;
$test_run++ if $extract_friends;
my $Net_DNS_loaded = 0;
if ($do_RBL) {
  eval {
    require Net::DNS;
  } && $Net_DNS_loaded++;
}

# Share some stuff between diffrent packages
*Mail::Filterable::LOG          = \*LOG;
*Mail::Filterable::vprint       = \*vprint;
*MyFilter::log_to_file          = \*log_to_file;
*fatal          = \*Mail::Filterable::fatal;
*check_file_dir = \*Mail::Filterable::check_file_dir;

# Prepare the filter
my $filter = new Mail::Filter(\&filter);

# Prepare for output
$| = 1;
unless ($test_run) {
  die "No log_file specified in %CONFIG\n" unless $CONFIG{log_file};
  check_file_dir($CONFIG{log_file});
  open(LOG, ">>$CONFIG{log_file}")
    or die "Couldn't open log file `$CONFIG{log_file}' for writing: $!\n";
}
sub vprint (@);
sub log_to_file (@);
my $WIDTH = 78;
my %out_folders = ();

# Get a lock as soon as we mean business
global_lock();

# Prepare friends database
use vars qw(%friends);
%friends = ();
if (exists $CONFIG{friends_db}) {
  check_file_dir($CONFIG{friends_db});
  tie %friends, 'DB_File', $CONFIG{friends_db}, O_RDWR | O_CREAT, 0600;
}

use vars qw(@dup_ids);
@dup_ids = ();
if (exists $CONFIG{id_cache}) {
  tie @dup_ids, 'DB_File', $CONFIG{id_cache}, O_CREAT | O_RDWR, 0600,
      $DB_RECNO;
}


##
## This is the core of the program.
##

if (! $filter_files) {
  #
  # Behave like a filter; take one e-mail from STDIN
  #

  usage() if -t;
  usage() if @ARGV;

  my $mail = new Mail::Internet( [<>] );
  $filter->filter($mail);
  log_to_file '-' x $WIDTH, "\n";
}
else {
  #
  # Filter all folders given in @ARGV
  #

  usage() unless @ARGV;

  log_to_file "###\n### Run started at ", localtime(), "\n###\n\n";

  my ($total, $parsed, $spam, $dups, $main, $aux, $special)
    = (0, 0, 0, 0, 0, 0, 0);

  my %inodes_seen = ();

  foreach my $file (@ARGV) {
    unless (-f $file) {
      # TODO: allow symlinks
      vprint "Skipping non-file $file.\n";
      next;
    }

    my ($inode, $size) = (stat $file)[1, 7];

    if ($size == 0) {
      vprint "Skipping empty file $file.\n";
      next;
    }

    if ($inodes_seen{$inode}) {
      vprint "Skipping $file\n" .
             "  (already seen file $inodes_seen{$inode} with inode $inode\n";
      next;
    }
    $inodes_seen{$inode} = $file;

    vprint "Reading $file ... ";
    my $folder = new Mail::Folder('AUTODETECT', $file);
    vprint "done.\nSetting $file read-only ... ";
    $folder->set_readonly();
    vprint "done.\n";

    my $msg_num = $folder->first_message();
    my ($file_total, $friends) = (0, 0);

    do {
      $total++;
      my $mail = $folder->get_message($msg_num);
      $mail->{parp_foldername} = $file;
      if (! $mail) {
        fatal('Mail::Folder::get_message failed',
              "\$mail:\n", Dumper $mail,
             );
        next;
      }
      my $rv = $filter->filter($mail);
      $parsed++  if $rv;
      $dups++    if $rv =~ /IS_DUPLICATE/;
      $spam++    if $rv =~ /IS_SPAM/;
      $main++    if $rv =~ /TO_MAIN/;
      $aux++     if $rv =~ /TO_AUX/;
      $special++ if $rv =~ /IS_SPECIAL/;
      $friends++ if $rv eq 'EXTRACTED_FRIEND';
      $file_total++;
      log_to_file '-' x $WIDTH, "\n";
    }
    while ($msg_num = $folder->next_message($msg_num)) &&
          (
           ((! $sample)          || ($file_total < $sample)) ||
           ((! $extract_friends) || ($friends == 0))
          );
  }

  log_to_file <<EOF;
Parsed $parsed of $total messages:
  delivered $main to $CONFIG{main_folder}
  delivered $aux to auxiliary folders
  tagged $spam as spam
  discarded $dups as duplicate
  tagged $special as special
EOF
  log_to_file "Run ended at ", localtime(), "\n";
  log_to_file '=' x $WIDTH, "\n";
}

##
## Clean up and exit.
##

END {
  close(LOG) unless $test_run;

  # Avoid `1 inner references still exist' warnings
  my $tied = (tied %friends) ? 1 : 0;
  $tied and untie %friends;

  foreach my $name (keys %out_folders) {
    my $folder = $out_folders{$name};
    if (ref($folder) eq 'Mail::Folder') {
      vprint "Syncing $name ... ";
      $folder->sync();
      vprint "done.\n";

      vprint "Closing $name ... ";
      $folder->close();
      vprint "done.\n";
    }
    else {
      close($folder);
    }
  }

  global_unlock();
}

exit 0;


##############################################################################
#
# The main filtering logic.
#

sub filter {
  my ($filter, $mail) = @_;

  my $folder;
  $folder = $mail->{parp_foldername} if $mail->{parp_foldername};

  if (! $mail) {
    fatal('message parsing failed',
          "\$folder:\n", Dumper($folder),
          "\n",
          "\$mail:\n", Dumper($mail),          
         );
    return 0;
  }

  my $m = new Mail::Filterable $mail;

  vprint "Parp-ID: $m->{parp_id}\n";

  log_to_file <<EOF;
From: $m->{from}
To: $m->{to}
EOF

  log_to_file "Cc: $m->{cc}\n" if $m->{cc};
  log_to_file "Subject: $m->{subject}\n";

  if ($m->{parp_id}) {
  }
  elsif (1) {
    fatal("Parp-ID not defined",
          "\$m:\n", Dumper($m),
         );
  }
  # the following cases should never happen
  elsif ($m->{id} && $m->{id} ne '<>') {
    vprint "Message-ID: $m->{id}\n";
  }
  elsif ($m->{date}) {
    vprint "Date: $m->{date}\n";
  }
  elsif ($m->{subject}) {
    vprint "Subject: $m->{subject}\n";
  }
  elsif ($m->{from}) {
    vprint "From: $m->{from}\n";
  }
  else {
    vprint "From $m->{env_from}\n";
  }
  log_to_file "\n";

  return $m->extract_friends($folder) if $extract_friends;

  if ($no_dups && ! $wrong_class && $m->is_duplicate()) {
    $m->reject_mail('was duplicate by message id');
#   $m->deliver_to_inbox('duplicates');
    $m->{backup} = 0;
    return 'IS_DUPLICATE';
  }

  $m->check_for_old_addresses();

  # FIXME: There could be more than one X-Loop header.
  if (($m->{header}->get('X-Loop') || '') eq $CONFIG{loop_value} and
     ! $wrong_class) {
    $m->accept_mail('looped');
    return 'LOOPED';
  }

  $m->{filter_category} = $m->categorize();

  if (! $wrong_class) {
    $m->parse_received_headers();

    if ($m->{recvd_parses_failed}) {
      if ($m->{filter_category} eq 'IS_SPAM') {
        $m->deliver_to('spam_recvds');
      }
      else {
        vprint $m->{recvd_parses_out};
        $m->deliver_to('bad_recvds');
      }
    }

    if ($m->{filter_category} eq 'TO_MAIN') {
      $m->deliver_mail();
    }
    elsif ($m->{filter_category} eq 'IS_SPAM') {
      if ($m->{complain}) {
        # TODO: write and send a rude letter
        log_to_file "Would complain\n";
      }
    }
    elsif ($m->{filter_category} eq 'TO_AUX') {
      # list mail; already delivered to primary target
      #  - maybe back up though
      $m->maybe_backup();
    }
    elsif ($m->{filter_category} eq 'IS_SPECIAL') {
      # special case mail; already delivered to primary target
      #  - maybe back up though
      $m->maybe_backup();
    }
    else {
      die "Oh dear.";
    }
  }
  else {
    # The user's telling us that the filter_category we've just
    # calculated is wrong.
    if ($m->{filter_category} eq 'IS_SPAM') {
      vprint "Reclassification: was incorrectly identified as spam\n";
      $m->{filter_category} = 'UNKNOWN_NOT_SPAM';
    }
    elsif ($m->{filter_category} ne 'IS_SPAM') {
      vprint "Reclassification: was incorrectly identified as bona-fide\n";
      $m->{filter_category} = 'IS_SPAM';
    }
  }

  return $m->{filter_category};
}


##############################################################################
#
# Miscellaneous routines.
#

sub log_to_file (@) {
  Mail::Filterable::log_to_file(@_);
}

sub vprint (@) {
  # Deal with messages to be printed/logged when user specifies -v
  my (@msgs) = @_;
  if ($verbose) {
    print @msgs unless $test_run;
  }
  log_to_file @msgs;
}

sub global_lock {
  die "No lock_file specified in %CONFIG\n" unless $CONFIG{lock_file};
  check_file_dir($CONFIG{lock_file});

  if (! -e $CONFIG{lock_file}) {
    unless (open(LOCK, ">$CONFIG{lock_file}")) {
      fatal("Couldn't create lock file $CONFIG{lock_file}: $!");
      exit 3;
    }
  }
  else {
    unless (open(LOCK, $CONFIG{lock_file})) {
      fatal("Couldn't open lock file $CONFIG{lock_file}: $!");
      exit 4;
    }
  }

  my $wait = 0;
  until (flock LOCK, LOCK_EX | LOCK_NB) {
    vprint "\n" if $wait;
    vprint "Waiting for exclusive lock on $CONFIG{lock_file} ... ";
    $wait++;
    sleep 3;
  }
  vprint "got it!\n" if $wait;
}

sub global_unlock {
  # Don't do anything; LOCK_UN introduces races.
}

sub usage {
  warn <<EOF;
parp $VERSION -- Perl Anti-spam Replacement for Procmail
(c) 1999--2000 Adam Spiers <adam\@spiers.net>

Usage:
  parp [ options ] < email
  parp [ options ] -f folder1 [ folder2 ... ]

Options:
  -d       enable discarding of duplicates (by Message-Id header)
  -e       only extract e-mail addresses and add them to friends database
  -f       operate on given files rather than as a filter
  -m       use Mail::Folder rather than >> for appending (much slower)
  -p       dry run for pipes - don't actually execute pipe commands
  -r       enable RBL checking
  -s<num>  with -f, only sample a maximum of <num> messages per folder
  -t       test run - just show what would have been done
  -v       increase verbosity
  -w       only log that filter's spam detection heuristics failed on
           the supplied mails; don't do anything else
EOF

  exit 2;
}

sub die_handler {
  my ($error) = @_;

  fatal($error, "Called via DIE handler\n");
  exit 255;
}


##############################################################################
#
# Routines for parsing the e-mail being filtered and calculating various
# bits of data which will be used in the filtering process.
#

package Mail::Filterable;

use Data::Dumper;
use POSIX qw(tmpnam);
use File::Path;
use Time::Local;
use Digest::MD5 qw(md5_base64);
use Socket;

use Mail::Field::Received;
use MyFilter qw(%CONFIG %RE %lists
                &is_special &is_list_mail &is_from_daemon
                &deliver_mail);

use subs qw(vprint log_to_file);

local $SIG{__DIE__} = \&::die_handler;

sub new {
  my $this = shift;
  my $class = ref($this) || $this;

  my ($mail, $props_hashref) = @_;

  my %m = ();

  my $header = $mail->head();
  my $body = $mail->body;
  $m{content_type} = $header->get('Content-Type') || '';

  # Deal with MIME multipart messages without using a very slow
  # parser from CPAN ...
  if ($m{content_type} =~ m!^multipart/.*boundary=(.*)\n!s) {
    my $boundary = $1;
    $boundary =~ s/^"(.*)"$/$1/;
    $boundary = quotemeta $boundary;
    log_to_file qq{Message is multipart; splitting on boundary "$1".\n};

    my @parts = split /--$boundary(?:--)?\n?/m, join('', @$body);

    log_to_file "Deleting non-text parts ... \n";
    my @body_lines = ();
    foreach my $part (@parts) {
      my @lines = split /(?<=\n)/, $part;
      my $part_mail = new Mail::Internet(\@lines);
      next unless @lines;
      my $content_type = $part_mail->get('Content-Type');
      if ($content_type) {
        chomp $content_type;
        log_to_file "Content-Type: $content_type";

        if ($content_type !~ m!^text/\b!) {
          log_to_file "; skipping ...\n";
          next;
        }
        else {
          log_to_file "\n";
        }
      }
      else {
        $content_type = '_unspecified_';
        log_to_file "Warning: Content-Type was unspecified; assuming plain text.\n";
      }

      push @body_lines, @{ $part_mail->body() };
    }

    $body = \@body_lines;
  }

  $m{mail}   = $mail;
  $m{header} = $header;
  $m{body}   = $body;
  $m{body_scalar} = join '', map { s/^>From/From/; $_ } @{$m{body}};

  # envelope from
  $header->mail_from('KEEP');
  $m{mail_from}       = $header->get('From ')          ||
                        $header->get('Mail-From')      || '';

  $m{parp_id}         = $header->get('X-Parp-Id')      || '';

  # Don't fold `From ' header, as we want that to match a date regexp below
  $header->fold(79);

  $m{from}            = $header->get('From')           || '';
  $m{to}              = $header->get('To')             || '';
  $m{cc}              = $header->get('Cc')             || '';
  $m{subject}         = $header->get('Subject')        || '';
  $m{return_path}     = $header->get('Return-Path')    || '';
  $m{reply_to}        = $header->get('Reply-To')       || '';
  $m{list}            = $header->get('X-Mailing-List') || '';
  $m{sender}          = $header->get('Sender')         || '';
  $m{in_reply_to}     = $header->get('In-Reply-To')    || '';
  $m{references}      = $header->get('References')     || '';
  $m{id}              = $header->get('Message-ID')     || '';
  $m{date}            = $header->get('Date')           || '';
  $m{status}          = $header->get('Status')         || '';
  $m{a_to}            = $header->get('Apparently-To')  || '';
  $m{mailer}          = $header->get('User-Agent')     ||
                        $header->get('X-Mailer')       || '';

  # From RFC822:
  #
  # --------- 8< --------- 8< --------- 8< --------- 8< --------- 8< ---------
  #    4.2.  FORWARDING
  #  
  #         Some systems permit mail recipients to  forward  a  message,
  #    retaining  the original headers, by adding some new fields.  This
  #    standard supports such a service, through the "Resent-" prefix to
  #    field names.
  #  
  #         Whenever the string "Resent-" begins a field name, the field
  #    has  the  same  semantics as a field whose name does not have the
  #    prefix.  However, the message is assumed to have  been  forwarded
  #    by  an original recipient who attached the "Resent-" field.  This
  #    new field is treated as being more recent  than  the  equivalent,
  #    original  field.   For  example, the "Resent-From", indicates the
  #    person that forwarded the message, whereas the "From" field indi-
  #    cates the original author.
  #  
  #         Use of such precedence  information  depends  upon  partici-
  #    pants'  communication needs.  For example, this standard does not
  #    dictate when a "Resent-From:" address should receive replies,  in
  #    lieu of sending them to the "From:" address.
  #  
  #    Note:  In general, the "Resent-" fields should be treated as con-
  #           taining  a  set  of information that is independent of the
  #           set of original fields.  Information for  one  set  should
  #           not  automatically be taken from the other.  The interpre-
  #           tation of multiple "Resent-" fields, of the same type,  is
  #           undefined.
  # --------- 8< --------- 8< --------- 8< --------- 8< --------- 8< ---------
  #
  # So we only take values from Resent- headers when we can't get them
  # any other way but we really would prefer to have them.

  my %resent_headers = ( id => 'Message-ID' );
  foreach my $header_key (keys %resent_headers) {
    my $header_name = $resent_headers{$header_key};
    $m{$header_key} ||= $header->get("Resent-$header_name") || '';
  }

  foreach my $prop (qw/mail_from parp_id from to cc subject
                       return_path reply_to list
                       in_reply_to references id date status a_to mailer/)
  {
    chomp $m{$prop} if $m{$prop};
  }

  if (! $m{mail_from}) {
    fatal(<<QMAIL);
Envelope From header missing.  If you are using qmail as your MTA,
make sure your .qmail setup passes the mail through the preline filter
before being passed to parp, e.g.

  | preline /path/to/parp -dr

Otherwise, parp may not be compatible with your MTA.  Does it deliver
in mbox format?
QMAIL
  }

  if ($m{mail_from} !~
      /(.*?)\s*(\w{3}) (\w{3}) ([\d ]\d) (\d\d):(\d\d):(\d\d) (\d{4})$/) {
    fatal("Envelope From header had weird date format:\n$m{mail_from}",
          "\%m:\n", Dumper(\%m),
         );
    exit 6;
  }
  $m{env_from} = $1;
  my ($dow, $month, $mday, $hour, $min, $sec, $year) =
     ($2, $3, $4, $5, $6, $7, $8);
  my $i = 0;
  my %months = map { $_ => $i++ }
                   qw/Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec/;
  $m{env_from_time}    = timelocal($sec, $min, $hour,
                                   $mday, $months{$month}, $year);

  @{$m{ftc}}           = extract_addrs_to_array(@m{qw/env_from from
                                                     to cc list sender/});
  $m{from_addr}        = extract_addr_to_scalar($m{from});
  $m{env_from_addr}    = extract_addr_to_scalar($m{env_from});
  @{$m{from_addrs}}    = extract_addrs_to_array(@m{qw/env_from from/});
  @{$m{all_from_addrs}}
                       = extract_addrs_to_array($m{env_from}, $m{from},
                                                $m{reply_to}, $m{return_path},
                                                $m{id});

  $m{recvds_array}    = [ $header->get('Received') ];
  $m{recvds}          = join '', @{$m{recvds_array}};

  my @comments        = $header->get('Comments');
  $m{comments}        = \@comments;

  $m{auth_sender}     = '';
  foreach my $comment (@comments) {
    if ($comment =~ /^Authenticated sender is (.*)/i) {
      $m{auth_sender} = $1;
      last;
    }
  }

  # Remove all previously existing parp headers except X-Parp-Id
  my @parp_headers = grep /^X-Parp-(?!Id)/, $m{header}->tags();
  foreach my $parp_header (@parp_headers) {
    $m{header}->delete($parp_header);
  }

  # Add a header for the process id to try to chase down obscure bugs.
  $m{header}->add('X-Parp-pid', $$);
  system("/bin/date >> ~/mail/.parp.pstree");
  system("pstree >> ~/mail/.parp.pstree");

  # Calculate a unique id which parp can always refer to.  We
  # calculate the MD5 digest of enough of the whole mail to ensure a
  # unique id, but without any bits which might change in some way
  # during the e-mail's life-span, so as to guarantee that during any
  # subsequent reclassification of the e-mail as a false
  # positive/negative (in the spam detection sense) this unique id
  # will match with the original, so that the statistics calculation
  # program will work.
  
  my $immutable_header =
    join '', map "$_: $m{$_}\n", (qw/mail_from from to cc subject
                                     return_path reply_to in_reply_to
                                     references id date mailer recvds/);
  
  my $immutable = $immutable_header .
                  "\n" .
                  $m{body_scalar};
  $immutable =~ s/\n+$/\n/;

  my $parp_id = $m{env_from_time} . "/" . md5_base64($immutable);

  # This one was a PAIN to get right.  I don't feel safe enough yet to
  # remove the debugging.
  my $immutables_dir = "$ENV{HOME}/mail/.immutables";
  if (-d $immutables_dir) {
    my $id_file = $parp_id;
    $id_file =~ s!/!_!g;
    $id_file = "$immutables_dir/$id_file";
    
    while (-e $id_file) {
      # Generate a unique suffix
      $id_file =~ s/(?:\.(\d+))?$/"." . (($1 || 0) + 1)/e;
    }
    
    if (open(FOO, ">$id_file")) {
      print FOO $immutable;
      close(FOO);
    }
    else {
      fatal("Couldn't open $id_file for writing: $!");
    }
  }
  
  if ($m{parp_id}) {
    # This e-mail has already been run through parp, so it already has
    # an X-Parp-Id header.

    if ($m{parp_id} ne $parp_id) {
      # Better check that the id we've just calculated is the same,
      # otherwise our calculation algorithm is in trouble.
      fatal("Message already had a parp id of: $m{parp_id}\n" .
            "       but recalculation yielded: $parp_id",
            #"\%m:\n", Dumper(\%m),
           );
    }
  }
  else {
    # This e-mail hasn't been touched by parp before, so stamp it with
    # a parp id.
    $m{header}->add('X-Parp-Id', $parp_id);
    $m{parp_id} = $parp_id;
  }

  $m{backup}   = 1;     # back up by default
  $m{complain} = 1;     # allow complaining by default

  my $self = \%m;
  bless $self, $class;
}

sub extract_addrs_to_array {
  my (@lines) = @_;

  my @addrs = ();
  foreach my $line (@lines) {
    my @new_addrs = Mail::Address->parse($line);
    push @addrs, map { $_->address() } @new_addrs;
  }

  return @addrs;
}

sub extract_addr_to_scalar {
  my ($line) = @_;

  my @addrs = ();
  my @new_addrs = Mail::Address->parse($line);
  push @addrs, map { $_->address() } @new_addrs;

  fatal("header passed to extract_addr_to_scalar had " .
        "more than one address\n",
        map("  $_\n", @addrs),
        "line: $line\n",
       )
    if @addrs > 1;

  return @addrs ? $addrs[0] : undef;
}


##############################################################################
#
# Routines for performing tests on the e-mail being filtered, and
# categorizing it accordingly.
#

use DB_File;
use Fcntl qw(:DEFAULT);

sub matches {
  my $m = shift;
  my ($category, $re, $debug) = @_;

  foreach my $addr (@{$m->{$category}}) {
    print "Testing $addr =~ /$re/\n" if $debug;
    my $matches = $addr =~ $re;
    print "  -- matched!\n" if $debug and $matches;
    return ($matches, $1, $2, $3, $4, $5, $6, $7, $8, $9) if $matches;
  }

  return;
}


# This subroutine is the heart of the filtering strategy.  It places
# the mail currently being filtered into one of the following
# categories:
# 
#   IS_SPAM    -- spam
#   TO_MAIN    -- good mail destined for main inbox
#   TO_AUX     -- good mail destined for auxiliary inboxes
#   SPECIAL    -- leave subroutines to do their own thang

sub categorize {
  my $m = shift;

  # Ignore real lusers.
  return 'IS_SPAM' if $m->has_spam_from_addresses();

  # Allow config file to deal with special cases in its own way.
  my $rv = $m->is_special();
  return $rv if $rv;

  return 'TO_MAIN' if $m->is_from_daemon();

  # Any good signs which indicate that the mail should definitely
  # NOT be treated as junk?
  my $grace = $m->is_passworded()        ||
              $m->is_from_good_person()  ||
              $m->is_from_good_domain()  ||
              $m->has_good_headers();

  return 'IS_SPAM'
    if ! $grace &&
       ($m->has_spam_headers()          ||
        $m->has_spam_domains_anywhere() ||
        $m->has_spam_content());

  if ($m->is_list_mail()) {
    $m->{complain} = 0;
    return 'TO_AUX';
  }

  return 'TO_MAIN' if $grace;

  # We put this one after the check for list mail, because on average,
  # mail from lists tends to be lower grade than personal mail.
  return 'IS_SPAM' if $m->has_suspicious_headers();

  return 'IS_SPAM' if ! $m->for_me();

  $m->accept_mail('passed all tests');
  return 'TO_MAIN';
}

sub extract_friends {
  my $m = shift;
  my ($folder) = @_;

  my $folder_name = $folder ? qq.`$folder'. : 'unknown';

  my %addrs = (
#              env_from    => { descr => 'envelope From' },
               from        => { descr => 'From'          },
               to          => { descr => 'to'            },
               cc          => { descr => 'cc'            },
               reply_to    => { descr => 'Reply-To'      },
#              return_path => { descr => 'Return-Path'   },
              );

  foreach my $addr_type (keys %addrs) {
    $addrs{$addr_type}{$_} ||= 0 foreach qw/me not_me total/;
    next unless $m->{$addr_type};
    my @addrs = Mail::Address->parse($m->{$addr_type});
    foreach my $parsed (@addrs) {
      my $paddr = $parsed->address();
      if ($paddr =~ $RE{me} || $paddr =~ $RE{old_me}) {
        $addrs{$addr_type}{me}++;
      } else {
        $addrs{$addr_type}{not_me}++;
      }
      $addrs{$addr_type}{total}++;
      push @{$addrs{$addr_type}{addrs}}, $paddr;
    }
  }

# for my $type (qw/from to cc reply_to/) {
#   vprint "type $type: ";
#   for my $count (qw/me not_me total/) {
#     vprint "[$count $addrs{$type}{$count}]";
#   }
#   vprint "\n";
# }

  my @maybe_new_friends = ();

  if ($addrs{from}{me}  == 1 &&
      $addrs{to}{total} == 1 &&
      $addrs{cc}{total} == 0)
  {
    push @maybe_new_friends, { addr => $addrs{to}{addrs}[0],
                               header => 'to' };
    log_to_file "Found friend in `To:' header.\n";
  }
  elsif ( # $addrs{to}{me} + $addrs{cc}{me} >= 1 && # could be on a list
         $addrs{from}{not_me} == 1)
  {
    push @maybe_new_friends, { addr => $addrs{from}{addrs}[0],
                               header => 'From' };
    push @maybe_new_friends, { addr => $addrs{reply_to}{addrs}[0],
                               header => 'Reply-To' }
      if $addrs{reply_to}{total} == 1;
    log_to_file "Found friend in `From' and `Reply-To:' headers.\n";
  }

  my $added = 0;
  foreach my $maybe_new_friend (@maybe_new_friends) {
    next if $::friends{$maybe_new_friend->{addr}};
    vprint "Adding `$maybe_new_friend->{addr}' to friends database ... \n";
    my $source = "friend extracted from `$maybe_new_friend->{header}' " .
                 "header of message";
    if ($m->{parp_id}) {
      $source .= " parp id $m->{parp_id}";
    }
    elsif ($m->{id}) {
      $source .= " id `$m->{id}'";
    }
    elsif ($m->{date}) {
      $source .= " dated $m->{date}";
    }
    $source .= " in $folder_name folder";
    $m->make_friend($maybe_new_friend->{addr}, $source);
    $added++;
  }
  return $added ? q[EXTRACTED_FRIEND] : q[DIDN'T_EXTRACT_FRIEND];
}

sub is_duplicate {
  my $m = shift;

  my $found = 0;

  # Ugh.  Wish there were tie-able dual array/hash data structures.
  # TODO: Maybe there are.  Find out.
  foreach my $cached_id (@::dup_ids) {
    if ($m->{id} eq $cached_id) {
      $found++;
      last;
    }
  }

  return 1 if $found;

  push @::dup_ids, $m->{id};
  shift @::dup_ids if @::dup_ids > $CONFIG{max_cache_ids};
  log_to_file "Added id to duplicates cache.\n";
  return 0;
}

sub check_for_old_addresses {
  my $m = shift;

  my $found = 0;

  if ($m->{to} =~ $RE{old_me}) {
    log_to_file "*** Old address found:\n  ", $m->{to}, "\n";
    $found++;
  }

  if ($m->{cc} =~ $RE{old_me}) {
    log_to_file "*** Old address found:\n  ", $m->{cc}, "\n";
    $found++;
  }

  $m->deliver_to_inbox('old_addresses') if $found;
}

sub is_passworded {
  my $m = shift;
  if ($m->{subject} =~ /$CONFIG{password}/ or
      ($m->{header}->get($CONFIG{password_header}) || '')
        =~ /$CONFIG{password}/)
  {
    $m->accept_mail('contains good password');
    $m->make_friend($m->{from_addr}, 'gave password');
    return 1;
  }

  return 0;
}

sub make_friend {
  my $m = shift;
  my ($address, $reason) = @_;

  $::friends{$address} = $reason;
}

sub is_from_good_domain {
  my $m = shift;

  if ($m->{from}     =~ $RE{good_domains} &&
      ($m->{env_from} =~ $RE{good_domains} ||
       $m->{id}       =~ $RE{good_domains} ||
       $m->{sender}   =~ $RE{good_domains}))
  {
    my $good_domain = $1;
    $m->accept_mail('good domain', $good_domain);
    return 1;
  }

  return 0;
}

sub has_good_headers {
  my $m = shift;

  # Could cross-check In-Reply-To: with good domains, but
  # no spammers seem to be setting this header yet, which
  # makes it an even more powerful test.
  if ($m->{in_reply_to}) {
    $m->accept_mail('had In-Reply-To: header');
    return 1;
  }

  if ($m->{references} =~ $RE{good_domains}) {
    $m->accept_mail('References: had good domain', $1);
    return 1;
  }

  if ($m->{subject} =~ $RE{subject_buzzwords}) {
    $m->accept_mail('subject had buzzword', $1);
    return 1;
  }

  if ($m->{mailer} =~ /(mozilla.*linux)/i) {
    $m->accept_mail('good X-Mailer header', $1);
    return 1;
  }

  return 0;
}

sub is_from_good_person {
  my $m = shift;

  if (tied %::friends) {
    foreach my $addr (@$m{qw/from_addr env_from_addr/}) {
      if (exists $::friends{$addr}) {
        $m->accept_mail('from friend', "`$addr' -- $::friends{$addr}");
        return 1;
      }
    }
  }

  return 0;
}

sub has_spam_headers {
  my $m = shift;

  # Many thanks to Mark-Jason Dominus and to the authors of junkfilter
  # and the NAGS filter for some of the ideas contained herein.

  my $octet_RE = '([12]?\d\d|\d\d|\d)';
  my $ipv4_RE  = ("$octet_RE\\." x 3) . $octet_RE;
  my $foo_RE   = qr![\w.%\#\$+-/]+\*?!;
  if ($m->{id} !~ m/^<
                      ($foo_RE|"$foo_RE")
                      \@
                      (
                       [\w-]+ (\. [\w-]+){0,6}         |
                       \[ $ipv4_RE \]
                      )
                     >/x) {
    $m->reject_junk_mail('invalid Message-ID: header', "`$m->{id}'");
    return 1;
  }

  if (my @m = $m->matches('ftc', $RE{decoys})) {
    $m->{complain} = 0; # don't let them wise up to me subscribing to
                        # stuff using a dud address
    $m->reject_junk_mail('not sent to a proper address', $m[1] || undef);
    return 1;
  }

  foreach my $bad_header (qw/PMFLAGS Advertisement X-Advertisement X-Shock/) {
    if ($m->{header}->get($bad_header)) {
      $m->reject_junk_mail('found bad header', $bad_header);
      return 1;
    }
  }

  my $uidl = $m->{header}->get('X-UIDL') || '';
  chomp $uidl;
  if ($uidl and $uidl !~ /^([0-9a-f]{32}|.{20})$/i) {
    $m->reject_junk_mail('invalid X-UIDL: header', "`$uidl'");
    return 1;
  }

  if ($m->{status} =~ /MC/i) {
    $m->reject_junk_mail('MaxAnnon! mailer');
    return 1;
  }

  if (($m->{header}->get('X-Distribution') || '') =~ /mass/i) {
    $m->reject_junk_mail('bulk mail sent with Pegasus');
    return 1;
  }

  if ($m->{from} =~ /^(<(_?\@_)?>)$/) {
    $m->reject_junk_mail("bad From: header", "`$1'");
    return 1;
  }

  if ($m->{return_path} =~ /^(<(_?\@_)?>)$/) {
    $m->reject_junk_mail("bad Return-Path: header", "contained `$1'");
    return 1;
  }

  if ($m->{from} eq '') {
    $m->reject_junk_mail('From: header is blank or missing');
    return 1;
  }

  if (($m->{subject} =~ tr/\x80-\xff//) > 3) {
    $m->reject_junk_mail('Subject: header had too many 8-bit characters');
    return 1;
  }

  if ($m->{date} =~ m![^\w:,()+/ \t-]!) {
    $m->reject_junk_mail('bad Date: header', "`$m->{date}'");
    return 1;
  }

  if ($m->{recvds} =~ /(-0600 \(EST\)|-0[57]00 \(EDT\))/) {
    $m->reject_junk_mail('bad Received: header date', "`$1'");
    return 1;
  }

  if ($m->{mailer} =~ $RE{bad_words}) {
    $m->reject_junk_mail("bad X-Mailer: header", "contained `$1'");
    return 1;
  }

  if ($m->{recvds} =~ $RE{bad_words}) {
    $m->reject_junk_mail("bad Received: header", "contained `$1'");
    return 1;
  }

  my $organisation = $m->{header}->get('Organisation') ||
                     $m->{header}->get('Organization') ||
                     '';
  if ($organisation =~ $RE{bad_words}) {
    $m->reject_junk_mail("bad organisation header", "contained `$1'");
    return 1;
  }

  if (@{$m->{ftc}} > $CONFIG{max_recipients}) {
    $m->reject_junk_mail('too_many_recipients');
    return 1;
  }

  if ($m->{from_addr} =~ /(\@{2,})/) {
    $m->reject_junk_mail('bad From: address', "contained `$1'");
    return 1;
  }
}

sub has_suspicious_headers {
  my $m = shift;

  if ($m->{to} eq '' and $m->{cc} eq '') {
    $m->reject_junk_mail('To: and Cc: headers both blank or missing');
    return 1;
  }

  if ($m->{to} eq '') {
    $m->reject_junk_mail('To: header blank or missing');
    return 1;
  }

  if ($m->{to} =~ $RE{bad_to}) {
    $m->reject_junk_mail("bad To: header", "contained `$1'");
    return 1;
  }

  if ($m->{subject} =~ $RE{bad_subjects}) {
    $m->reject_junk_mail("bad Subject: header", "contained `$1'");
    return 1;
  }

  if (($m->{subject} =~ tr/!/!/) >= 5 ||
      $m->{subject} =~ /!!!!/) {
    $m->reject_junk_mail("Subject: header contained too many exclamation marks");
    return 1;
  }

  if ((my @words = $m->{subject} =~ /\b[A-Z]+\b/g) >= 6) {
    $m->reject_junk_mail('Subject: header had too many all-caps words');
    return 1;
  }

# This one is a bit extreme ...
#  if ($m->{subject} eq '') {
#    $m->reject_junk_mail('Subject: header is blank or missing');
#    return 1;
#  }

# This one is a bit extreme too ...
#  if ($m->{from} =~ /^(\d+)\@/ ||
#      $m->{from} =~ /^(\d+)\@/)
#  {
#    $m->reject_junk_mail('username is all digits', "`$1'");
#    return 1;
#  }

  return 0;
}

sub has_spam_from_addresses {
  my $m = shift;

  if (my @m = $m->matches('from_addrs', $RE{bad_from})) {
    $m->reject_junk_mail('bad from address', "contained `$m[1]'");
    return 1;
  }

  return 0;
}

sub has_spam_domains_anywhere {
  my $m = shift;

  if (my @m = $m->matches('all_from_addrs', $RE{bad_origins})) {
    $m->reject_junk_mail('bad from/return address', "`$m[1]'");
    return 1;
  }

  $m->parse_received_headers();

  my $debug = 0;

  my %ips = ();

  if ($do_RBL) {
    foreach my $recv (@{$m->{recvds_array}}) {
      # Avoid various false positives
      $recv =~ s/JetMail \d\.\d\.\d\.\d\b//g;
      
      my @ips = $recv =~ m@(?<!/)\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b(?!\.)@g;
      @ips = grep { ! exists $ips{$_} } @ips;
      vprint "RBL check on $recv ...\n" if $debug && @ips;
      foreach my $ip (@ips) {
        $ips{$ip}++;
        vprint "  Checking IP $ip ... " if $debug;
        my $rbl = rbl_lookup($ip);
        if ($rbl) {
          my $reason = $rbl eq '1' ? $ip : $rbl;
          vprint ''. ($rbl || 'blacklisted') . "\n" if $debug;
          my @reject = ('blacklisted on RBL');
          push @reject, $reason if $reason;
          $m->reject_junk_mail(@reject);
          return 1;
        }
        else {
          vprint "not found\n" if $debug;
        }
      }
    }
  }

  return 0;
}

sub rbl_lookup {
  my ($ip) = @_;
  
  my @octets = split /\./, $ip;
  my $name = (join '.', reverse @octets) . '.rbl.maps.vix.com';

  my $naddr = gethostbyname($name) or return 0;
  my $A_RR = inet_ntoa($naddr);
  return 0 unless $A_RR eq '127.0.0.2';

  my $TXT;

  if ($Net_DNS_loaded) {
    my $res = Net::DNS::Resolver->new();
    my $query = $res->query($name, "TXT");
    if ($query) {
      foreach my $rr ($query->answer) {
        next unless $rr->type eq "TXT";
        $TXT = $rr->txtdata;
      }
    } else {
      print "failed: ", $res->errorstring, "\n";
    }

    $TXT =~ s/Blackholed - //;
  }
  else {
#   vprint "Net::DNS not loaded; won't find out TXT RR\n";
  }

  return $TXT || 1;
}

sub parse_received_headers {
  my $m = shift;

  return if $m->{recvd_parses_done};

  my $failed_parses_output = '';

  foreach my $recv (@{$m->{recvds_array}}) {
    $recv =~ s/\s*\n\s*/ /gm;

    my $obj = Mail::Field->new('Received', $recv);
    $obj->debug(5);

    if (! $obj->parsed_ok()) {
      # Output follows in order ...

      # First, preamble before parser errors
      $failed_parses_output .= <<EOF;
--
Error parsing Received: `$recv'

EOF

      # Then, a reminder of the message details
      $failed_parses_output .= <<EOF;
From: $m->{from}
To: $m->{to}
EOF
      $failed_parses_output .= "Cc: $m->{cc}\n" if $m->{cc};

      $failed_parses_output .= <<EOF;
Subject: $m->{subject}
Message-ID: $m->{id}

EOF

      # Finally, the incomplete parse tree
      $failed_parses_output .= Dumper($obj->parse_tree()) . "\n";

      $m->{recvd_parses_failed}++;

      $failed_parses_output .= $obj->diagnostics();
    }

    $m->{recvd_parse_trees}{$recv} = $obj->parse_tree();
  }

  $m->{recvd_parses_out} = $failed_parses_output;
  $m->{recvd_parses_done} = 1;
}

sub for_me {
  my $m = shift;
  if ($m->{to} =~ $RE{me} or $m->{cc} =~ $RE{me}) {
    return 1;
  }
  $m->reject_junk_mail('not addressed to me');
  return 0;
}

sub has_spam_content {
  my $m = shift;

  # Copy body to a single scalar
  my $all = $m->{body_scalar};

  # Strip blank and quoted lines
  my @body_lines = grep ! /^\s*$|^> /, @{$m->{body}};
  my $not_quoted = join '', @body_lines;

  my $max = $RE{max_forwards};
  my @matches = ($all =~ /^\s*(>\s*){$max,}/mg);
#  log_to_file "Lines exceeding max_forwards: ", scalar(@matches), "\n";
  if (@matches > $RE{max_forwards_lines}) {
    $m->reject_junk_mail("forwarded more than $RE{max_forwards} times");
    return 1;
  }

  # Check first few for spam
  my $first_how_many = 4;
  my ($start, $end) = (0, $first_how_many);
  $end = $#body_lines if $#body_lines < $first_how_many;
  my $first_few = join '', @body_lines[$start .. $end];
  if ($first_few =~ /^\s*
                      (Dear\ (
                        friend          |
                        .* surfer       |
                        $RE{me}
                      ))
                    /imx)
  {
    $m->reject_junk_mail('Suspicious method of address', "`$1'");
    return 1;
  }

  # Check last few for spam
  my $last_how_many = 12;

  ($start, $end) = (-$last_how_many, -1);
  if (@body_lines < $last_how_many) {
    ($start, $end) = (0, $#body_lines);
  }
  my $last_few = join '', @body_lines[$start .. $end];

  if ($last_few =~ /\bremoved?\b/i &&
      $last_few =~ /respond|notify|reply|send|forward|click|software|
                    mailto|type/ix &&
      $last_few =~ /subjec?t|process|automatically/i) {
    $m->reject_junk_mail('body confessed it was junk');
    return 1;
  }

  if ($last_few =~ /group.mail/i) {
    $m->reject_junk_mail('body suggested that a group mailer was used');
    return 1;
  }

  if ($not_quoted =~ $RE{very_bad_words}) {
    $m->reject_junk_mail('body contained a very bad word', "`$1'");
    return 1;
  }

  @matches = ($not_quoted =~ /$RE{quite_bad_words}/g);
  if (@matches > $RE{max_quite_bad_words}) {
    my %uniques = map { lc $_ => $_ } @matches;
    log_to_file "Quite bad words found in body: ",
                scalar(@matches), " (", scalar(keys %uniques), " unique)\n";
    if (scalar(keys %uniques) > $RE{max_unique_quite_bad_words}) {
      $m->reject_junk_mail('body contained too many bad words',
                      join ', ', map { "`$_'" } values %uniques);
      return 1;
    }
  }

  return 0;
}


##############################################################################
#
# Routines providing actions to taken on the e-mail being filtered.
#

sub ditch_mail {
  my $m = shift;
  log_to_file "Delivered to /dev/null",
              @_ ? " (@_)" : '',
              "\n";
}

sub deliver_to_inbox {
  my $m = shift;
  my $inbox = shift;
  $m->deliver_to("$CONFIG{inbox_dir}/$inbox", @_);
}

sub maybe_backup {
  my $m = shift;

  $m->deliver_to($CONFIG{backup_folder}) if $m->{backup};
}

sub deliver_to {
  return if $wrong_class;

  my $m = shift;
  my ($folder) = @_;

  my $file = ($folder =~ m!^/!) ? $folder : "$CONFIG{mail_dir}/$folder";

  if ($test_run) {
    log_to_file "Would deliver to $file\n";
    return;
  }

  if (! exists $out_folders{$file}) {
    check_file_dir($file);

    if ($Mail_Folder) {
      # Use Mail::Folder
      if (-e $file) {
        vprint "Opening $file for appending ... ";
        $out_folders{$file} = new Mail::Folder('AUTODETECT', $file);
        vprint "done.\n";
      }
      else {
        $out_folders{$file} = new Mail::Folder('mbox', $file, Create => 1);
      }
    }
    else {
      # Don't use Mail::Folder
      local *FH;
      unless (open(FH, ">>$file")) {
        fatal("Couldn't open $file for delivery: $!");
        exit 5;
      }
      vprint "Opened $file for appending.\n";
      $out_folders{$file} = *FH;
    }
  }

  my $out = $out_folders{$file};
  if (ref($out) eq 'Mail::Folder') {
    $out->append_message($m->{mail});
  }
  else {
    my $text = $m->{mail}->as_mbox_string();
    $text =~ s/^Mail-From: /From /;
    $text =~ s/\n+$/\n\n/;
    print $out $text;
  }

  log_to_file "Delivered to $file\n";
}

sub accept_mail {
  return if $wrong_class;

  my $m = shift;
  my ($reason_ident, @details) = @_;

  $m->{accepted} = [ $reason_ident, @details ];

  my $text = "$reason_ident" .
             (@details ? " (@details)" : '');

  $m->{header}->add('X-Parp-Accepted', $text);
  log_to_file "Accepted: $text\n";
}

sub reject_junk_mail {
  return if $wrong_class;
  my $m = shift;
  $m->reject_mail(@_);
#  $m->{backup} = 0;
  $m->deliver_to_inbox('junk-mail');
}

sub reject_mail {
  return if $wrong_class;

  my $m = shift;
  my ($reason_ident, @details) = @_;

  $m->{rejected} = [ $reason_ident, @details ];

  my $text = "$reason_ident" .
             (@details ? " (@details)" : '') .
             "\n";

  $m->{header}->add('X-Parp-Rejected', $text);
  log_to_file "REJECTED: $text";
}

sub pipe_forward {
  return if $wrong_class;

  my $m = shift;
  my ($pipe_command) = @_;

  if ($dry_pipes || $test_run) {
    log_to_file "Would pipe | $pipe_command\n";
  }
  else {
    log_to_file "Piping | $pipe_command ... ";
    if (! open(PIPE, "| $pipe_command")) {
      fatal("Couldn't open pipe command $pipe_command: $!");
    }
    else {
      print PIPE $m->{mail}->as_mbox_string();
      close(PIPE);
      log_to_file "done.\n";
    }
  }
}


##############################################################################
#
# Miscellaneous routines
#

sub check_file_dir {
  my ($file) = @_;

  my $umask = 0700;

  my ($dir) = $file =~ m!(.*)/!;
  return unless $dir;

  unless (-d $dir) {
    mkpath([$dir], 0, $umask);
    vprint sprintf "Created directory $dir with umask %04o.\n", $umask
      if fileno(LOG);
  }
}

sub fatal {
  my ($error, @context) = @_;

  my $long_message = <<EOF;

*************************************************************************

A fatal error occurred:

$error
EOF

  if (@context) {
    $long_message .= <<EOF;
Context for the error follows below:

-------- 8< -------- 8< --------
EOF

  $long_message .= join '', @context;

  $long_message .= <<EOF;
-------- 8< -------- 8< --------
EOF
  }
  else {
    $long_message .= <<EOF;

There was no context given for the error.
EOF
  }

  $long_message .= <<EOF;

*************************************************************************
EOF

  log_to_file $long_message if fileno LOG;
  print $long_message if $verbose;

  warn "parp: $error (fatal)" .
       ((fileno LOG) ? "; see log file $CONFIG{log_file} for context.\n"
                    : ".\n");

  if ($CONFIG{fatals_folder}) {
    my $date = localtime();
    my ($username, $realname) = (getpwuid $<)[0, 6];
    my $report = <<"End of report";
From $username\@localhost $date
From: "parp e-mail filter" <$username\@localhost>
To: "$realname" <$username\@localhost>
Date: $date
Subject: parp experienced a fatal error

$long_message

End of report

    if (open FATALS, ">>$CONFIG{fatals_folder}") {
      print FATALS $report;
      close FATALS;
    }
  }
}

sub log_to_file {
  my (@msgs) = @_;
  if ($test_run) {
    print @msgs;
  }
  else {
    if (fileno LOG) {
      print LOG @msgs;
    }
    else {
      open(BROKEN, ">>/home/adam/mail/.parp.broken") or die "argh!!! $!";
      print BROKEN "fileno(LOG) undef:\n@msgs\n";
      close(BROKEN);
    }
  }
}
