#!/usr/bin/perl -w
#Syn-3 spam learning cronjob - (C) 2005 DatuX 
#Edwin Eefting

use strict;
#Locations used by this job: the mail directory tree that is scanned and
#the two state files that are kept between runs.
my $maildir="/home/system/cyrus-imap/maildir/user";
my $mailcachefile="/home/system/cyrus-dspam/mails.cache";
my $learnedfile="/home/system/cyrus-dspam/learn.cache";

#A true first command line argument selects test mode: decisions are only
#printed, nothing is learned or deleted and the caches are not rewritten.
my $test=$ARGV[0] ? 1 : 0;
print "TEST MODE - Not changing anything!\n" if ($test);


#Read the mail cache (one mail path per line), so that we know what has
#been processed already. Values kept in %mailcache:
#  1=processed before, not checked this time after processing
#  2=processed before, and still there
#A missing cache file is normal on the first run and means nothing has been
#processed yet. A cache that exists but cannot be read is fatal: going on
#with an empty cache would treat every mail as unprocessed.
my %mailcache;
if (open (my $cachefh,'<',$mailcachefile))
{
	while (my $line=<$cachefh>)
	{
		chomp($line);
		$mailcache{$line}=1;
	}
	close ($cachefh);
}
else
{
	#remember the reason before the file test below can change $!
	my $err=$!;
	die "Cannot read mail cache $mailcachefile: $err\n" if (-e $mailcachefile);
}

#Read the learning list (lines of the form signature=timestamp), so that we
#don't learn something twice and can unlearn stuff as well.
#A missing file is normal on the first run. A file that exists but cannot be
#read is fatal: the list is rewritten from %learned at the end of the run,
#so going on with an empty list would throw the stored history away.
my %learned;
if (open (my $learnfh,'<',$learnedfile))
{
	while (my $line=<$learnfh>)
	{
		chomp($line);
		my ($signature,$time)=split(/=/,$line,2);
		#skip blank or damaged lines instead of storing undefined entries
		next if (!defined($signature) || $signature eq '' || !defined($time));
		$learned{$signature}=$time;
	}
	close ($learnfh);
}
else
{
	#remember the reason before the file test below can change $!
	my $err=$!;
	die "Cannot read learn cache $learnedfile: $err\n" if (-e $learnedfile);
}


#Learn by correcting a message with dspam, and remember the signature in
#%learned together with the current time.
#
#Parameters:
#  $class     - value for dspam's --class option (this script passes
#               "spam" or "innocent")
#  $user      - value for dspam's --user option
#  $signature - the mail's X-DSPAM-Signature value
#
#Returns 1 when the signature is recorded in %learned afterwards, 0 when
#there was no signature or dspam reported a failure.
#
#A signature that is already in %learned is not sent to dspam again; only
#its timestamp is refreshed.
sub Learn
{
	my ($class,$user,$signature)=@_;

	#without a signature there is nothing dspam could correct
	if (!defined($signature) || $signature eq '')
	{
		print "(Skipping, no signature)\n";
		return 0;
	}

	if (exists($learned{$signature}))
	{
		print "(Skipping, already learned)\n";
	}
	else
	{
		#list form of system: no shell is involved, so the values need no
		#quoting or escaping
		my $status=system("dspam","--mode=teft","--source=error","--user",$user,"--signature=$signature","--class=$class");
		if ($status!=0)
		{
			#do not record it as learned when dspam did not succeed
			warn "dspam failed for user $user, signature $signature (status $status)\n";
			return 0;
		}
	}
	$learned{$signature}=time();
	return 1;
}


#Scan the mail folders of every user that has a Spam folder and compare
#where each not yet processed mail is stored with dspam's own verdict (the
#X-DSPAM-Result header):
#  - stored below the user's Spam folder but not classified as Spam:
#    learn it as spam, delete the mail and reconstruct the Spam folder
#  - classified as Spam but stored outside the Spam folder:
#    learn it as innocent
#Every mail that was looked at is marked with 2 in %mailcache. In test mode
#the decisions are only printed.
#find is started without a shell (list form of open), so user and folder
#names need no quoting.
open (my $usersfh,'-|','find',$maildir,'-mindepth','1','-maxdepth','1','-printf','%f\n')
	or die "Cannot run find on $maildir: $!\n";
while (my $user=<$usersfh>)
{
	chomp ($user);
	my $userdir="$maildir/$user";
	#a mail counts as "in the Spam folder" only when its path starts with
	#this prefix; looking for the word Spam anywhere in the path would also
	#hit users or folders that merely contain it in their name
	my $spamprefix="$userdir/Spam/";
	if (-e "$userdir/Spam")
	{
		print "Processing user $user\n";
	}
	else
	{
		print "Skipping user $user (no Spam folder)\n";
		next;
	}

	#find all mail files (regular files whose name ends in a dot)
	my $mailsfh;
	if (!open ($mailsfh,'-|','find',$userdir,'-type','f','-name','*.'))
	{
		warn "Cannot run find on $userdir: $!\n";
		next;
	}
	my $reconstruct=0;
	while (my $mail=<$mailsfh>)
	{
		chomp($mail);
		#skip everything below a Trash folder
		next if ($mail=~m{/Trash/});
		#not yet processed?
		#TODO: only process if the mail is in the folder for more than 24 hours?
		#(in case the user made a mistake and can move it back within 24 hours)
		if (! $mailcache{$mail})
		{
			#read first 5k of mail
			my $mailfh;
			if (!open ($mailfh,'<',$mail))
			{
				#not marked as processed, so the next run tries it again
				warn "Cannot read $mail: $!\n";
				next;
			}
			my $maildata='';
			read($mailfh,$maildata,5000);
			close($mailfh);
			$maildata='' if (!defined($maildata));

			#only trust the header part: everything up to the first empty
			#line, keeping the CRLF that ends the last header line. Header
			#lines quoted in the body must not influence the decision.
			my $headerend=index($maildata,"\r\n\r\n");
			my $header=($headerend>=0) ? substr($maildata,0,$headerend+2) : $maildata;
			#the greedy .* selects the last occurrence within the header
			my ($spamresult)=($header=~ /.*\nX-DSPAM-Result: ([a-zA-Z]*)\r/s);
			my ($signature)=($header=~ /.*\nX-DSPAM-Signature: ([0-9a-zA-Z]*)\r/s);
			my $inspam=(index($mail,$spamprefix)==0);

			#both the verdict and the signature are needed to correct a mail
			if ($spamresult && defined($signature) && $signature ne '')
			{
				if ($test)
				{
					print "Test: Found $mail ($signature) result $spamresult\n";
				}
				#is it NOT spam, in the spamfolder?
				if ($spamresult ne 'Spam' && $inspam)
				{
					if ($test)
					{
						print "Test: Would have corrected to SPAM: $mail ($signature)\n";
					}
					else
					{
						#learn as spam
						print "Correcting to SPAM: $mail ($signature)\n";
						Learn("spam",$user,$signature);
						#delete it
						if (unlink($mail))
						{
							#mailbox needs reconstruction because of our action
							$reconstruct=1;
						}
						else
						{
							warn "Cannot delete $mail: $!\n";
						}
					}
				}
				#is it spam, but NOT in spamfolder?
				elsif ($spamresult eq 'Spam' && !$inspam)
				{
					if ($test)
					{
						print "Test: Would have corrected to INNOCENT: $mail ($signature)\n";
					}
					else
					{
						#learn as innocent
						print "Correcting to INNOCENT: $mail ($signature)\n";
						Learn("innocent",$user,$signature);
					}
				}
				#other combinations: folder and verdict agree, nothing to do
				#TODO: program could be extended to UNLEARN mails
			}
			else
			{
				#unknown, just ignore it to be safe
				if ($test)
				{
					print "Test: Cant find headers in $mail?\n";
				}
			}

		}
		#mark as found and processed
		$mailcache{$mail}=2;
	}
	close($mailsfh);

	if (!$test && $reconstruct)
	{
		print "Reconstructing spamfolder of $user...\n";
		#su hands the command string to a shell, so the mailbox name is
		#wrapped in single quotes with embedded quotes escaped
		(my $mailbox="user.$user.Spam")=~s/'/'\\''/g;
		if (system ('su','-','cyrus','-c',"/usr/cyrus/bin/reconstruct '$mailbox'")!=0)
		{
			warn "Reconstruct of user.$user.Spam failed (status $?)\n";
		}
	}

}
close($usersfh);


#A test run ends here, before either cache file is rewritten.
if ($test) {
	print "Test mode, not storing caches\n";
	exit;
}

#Store the new mail cache. Only mails seen during this run (value 2) are
#written, so entries for mails that were not found again drop out.
#The list goes to a temporary file that is renamed over the old cache once
#it is complete, so a failed or interrupted write cannot leave a truncated
#cache behind. Failures are reported but not fatal, so that the rest of the
#script still runs.
{
	my $tmpfile="$mailcachefile.new";
	if (open (my $out,'>',$tmpfile))
	{
		foreach my $mail (keys(%mailcache))
		{
			#2=mail processed this time
			if ($mailcache{$mail}==2)
			{
				print $out "$mail\n";
			}
		}
		#buffered write errors only show up when the file is closed
		if (!close ($out))
		{
			warn "Cannot write $tmpfile: $!\n";
			unlink($tmpfile);
		}
		elsif (!rename ($tmpfile,$mailcachefile))
		{
			warn "Cannot replace $mailcachefile: $!\n";
			unlink($tmpfile);
		}
	}
	else
	{
		warn "Cannot create $tmpfile: $!\n";
	}
}


#Store the new learned list as lines of the form signature=timestamp.
#The list goes to a temporary file that is renamed over the old one once it
#is complete, so a failed or interrupted write cannot leave a truncated
#list behind.
{
	my $tmpfile="$learnedfile.new";
	open (my $out,'>',$tmpfile) or die "Cannot create $tmpfile: $!\n";
	foreach my $signature (keys(%learned))
	{
		#TODO:remove old entries
		print $out "$signature=$learned{$signature}\n";
	}
	#buffered write errors only show up when the file is closed
	if (!close ($out))
	{
		my $err=$!;
		unlink($tmpfile);
		die "Cannot write $tmpfile: $err\n";
	}
	if (!rename ($tmpfile,$learnedfile))
	{
		my $err=$!;
		unlink($tmpfile);
		die "Cannot replace $learnedfile: $err\n";
	}
}


