#!/usr/bin/perl -w

# settings are located in $HOME/.corpus

use strict;
use Getopt::Long;
use File::Path;
use File::Copy;
use Time::ParseDate;
use POSIX qw(nice strftime);
use Cwd;

our ( $opt_dir, $opt_override, $opt_tag );
GetOptions(
    "tag=s" => \$opt_tag,
    "dir=s" => \$opt_dir,
    "override=s" => \$opt_override,
);

$opt_override ||= '';
$opt_tag ||= 'n';       # nightly is the default

nice(15);

# daterevs -- e.g. "20060429/r239832-r" -- are aligned to 0800 UTC, just before
# the time of day when the mass-check tagging occurs; see
# http://wiki.apache.org/spamassassin/DateRev for more details.
use constant DATEREV_ADJ => - (8 * 60 * 60);

# ---------------------------------------------------------------------------

my $configuration = "$ENV{HOME}/.corpus";
my %cf;

configure();
init();

if (!$opt_dir) {
  $opt_dir = $cf{corpus};
  update_rsync();
}

chdir $opt_dir;
print "reading logs from '$opt_dir'\n";

my $linkdir = "$cf{html}/logs";
(-d $linkdir) or mkdir $linkdir;

locate_and_link();
exit;

sub configure {
  # does rough equivalent of source
  open(C, $configuration) || die "open failed: $configuration: $!\n";
  my $pwd = Cwd::getcwd;

  # add 'override' options
  my @lines = (<C>, split(/\|/, $opt_override));

  foreach $_ (@lines) {
    chomp;
    s/#.*//;
    if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) {
      my ($key, $val) = ($1, $2);
      $val =~ s/\$PWD/$pwd/gs;
      $cf{$key} = $val;
    }
  }
  close(C);
}

sub init {
  $ENV{RSYNC_PASSWORD} = $cf{password};
  $ENV{TIME} = '%e,%U,%S';
  $ENV{TZ} = 'UTC';
}

sub update_rsync {
  chdir $opt_dir;
  if (!$cf{rsync_command}) { die "no 'rsync_command' set"; }
  system $cf{rsync_command};
  system "chmod +r *.log > /dev/null 2>&1";
}

sub locate_and_link {
  opendir(CORPUS, $opt_dir);
  my @files = sort readdir(CORPUS);
  closedir(CORPUS);

  print "Found ", $#files + 1, " files in $opt_dir\n";

  @files = grep {
    /^(?:spam|ham)-(?:net-)?[-\w.]+\.log$/ && -f "$opt_dir/$_" && -M _ < 30 
  } @files;

  print "Kept ", $#files + 1, " files\n";

  foreach my $file (@files) {
    my $ftime;
    my $frevision;

    print "Processing $opt_dir/$file\n";
    open(FILE, "$opt_dir/$file") or warn "cannot read $opt_dir/$file";
    while (my $line = <FILE>) {
      last if $line !~ /^#/;
      if ($line =~ /, on (... ... .. )(..)(:..:.. ... ....)/) {
        my ($datepre, $hh, $datepost) = ($1,$2,$3);
        
        $ftime = Time::ParseDate::parsedate($datepre.$hh.$datepost,
                    GMT => 1, PREFER_PAST => 1);
      }
      elsif ($line =~ m/^# Date:\s*(\S+)/) {
        # a better way to do the above.  TODO: parse it instead
      }
      elsif ($line =~ m/^# SVN revision:\s*(\S+)/) {
        $frevision = $1;
      }
    }
    close(FILE);

    if (!defined $ftime) {
      warn "$opt_dir/$file: no time found, ignored\n"; next;
    }
    if (!defined $frevision) {
      warn "$opt_dir/$file: no revision found, ignored\n"; next;
    }
    if ($frevision eq 'unknown') {
      warn "$opt_dir/$file: not tagged with a revision, ignored\n"; next;
    }

    my $daterev = mk_daterev($ftime, $frevision, $opt_tag);
    link_file($file, $daterev);
  }
}

sub mk_daterev {
  my ($timet, $rev, $tag) = @_;
  return strftime("%Y%m%d", gmtime($timet + DATEREV_ADJ)) . "/r$rev-$tag";
}

sub link_file {
  my ($file, $daterev) = @_;

  my $f = "$opt_dir/$file";

  # /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/
  my $linkfile = $file;
  my $dr = $daterev; $dr =~ s/\//-/gs; $linkfile =~ s/\.log$/.$dr.log/i;
  my $t = "$linkdir/$linkfile";

  print "ln $f $t\n";
  (-f $t) and unlink $t;
  # cannot hardlink unless we have ownership or RW perms on the file
  symlink $f, $t or die "cannot ln";
  system "/usr/bin/touch -h -r '$f' '$t'";       # preserve modtimes
}

