X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bin%2Fdebbugs-spam;h=1c44290543420946f53db6942b0c4de3a50961a6;hb=b10a87938eb116b8871081643058052564e39603;hp=4e5d3538ba6ae8b76cc991a1c98bace45e000187;hpb=bbc81477dea7d070a1240a11a355bb0f4cd6c59c;p=debbugs.git diff --git a/bin/debbugs-spam b/bin/debbugs-spam index 4e5d353..1c44290 100755 --- a/bin/debbugs-spam +++ b/bin/debbugs-spam @@ -62,6 +62,12 @@ ham. Output the score of all of the messages in a bug +=over + +=item B<--skip-seen> Skip messages which have previously been classified + +=back + =item B Mark messages as spam if there is a regex match to subject or message @@ -72,11 +78,25 @@ id Mark messages as ham if there is a regex match to subject or message id +=item B + +Learn from messages which are ham/spam + =back =head1 EXAMPLES +Start spamd: + + /usr/sbin/spamd --socketpath=/home/debbugs/spamd_socket \ + --nouser-config --cf='include /home/debbugs/.spamassassin/user_prefs' \ + --cf='allow_user_rules 1' --allow-tell; + +Then score bugs: + + debbugs-spam --spamc-opts '-U' --spamc-opts '/home/debbugs/spamd_socket' \ + score 859123; =cut @@ -112,15 +132,20 @@ handle_main_arguments(\%options, my %subcommands = ('auto-scan' => {function => \&auto_spamscan, - arguments => {'ham_threshold=s' => -5, + arguments => {'ham_threshold|ham-threshold=s' => 0, }, + defaults => {ham_threshold => -5}, }, 'score' => {function => \&score_bug, + arguments => {'skip_seen|skip-seen!' => 0 + }, }, 'mark-spam' => {function => \&mark_spam, }, 'mark-ham' => {function => \&mark_ham, }, + 'learn' => {function => \&learn, + }, 'help' => {function => sub {pod2usage({verbose => 2});}} ); @@ -151,7 +176,10 @@ if ($subcommand ne 'help') { chdir($config{spool_dir}) or die "chdir $config{spool_dir} failed: $!"; } my $opts = - handle_subcommand_arguments(\@ARGV,$subcommands{$subcommand}{arguments}); + handle_subcommand_arguments(\@ARGV, + $subcommands{$subcommand}{arguments}, + $subcommands{$subcommand}{defaults}, + ); $subcommands{$subcommand}{function}->(\%options,$opts,\%config,\@ARGV); @@ -174,17 +202,55 @@ sub mark_it { my $body = $rec->{text}; my ($subject) = $body =~ /^Subject: *(.+)$/mi; my $is_match = 0; - if ($subject =~ /\Q$regex\E/) { + if ($subject =~ /$regex/) { $is_match = 1; } - if ($mid =~ /\Q$regex\E/) { + if ($mid =~ /$regex/) { $is_match = 1; } - if ($spam_ham eq 'spam') { - $spam->add_spam($mid); + if ($is_match) { + print STDERR "it's a match" if $DEBUG; + if ($spam_ham eq 'spam') { + $spam->add_spam($mid); + } else { + $spam->add_ham($mid); + } + } + }, + $bug_num + ); + $spam->save(); + } +} + +sub learn { + my ($options,$opts,$config,$argv) = @_; + for my $bug_num (@{$argv}) { + my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or + die "Unable to open bug log spam for $bug_num"; + foreachmsg(sub { + my ($bn,$rec,$mid) = @_; + my $score; + if ($spam->is_spam($mid)) { + $score //= + spam_score($rec,$options->{spamc}, + [@{$options->{spamc_opts}}, + '-L','spam' + ] + ); + print STDERR "learning spam" if $DEBUG; + } elsif ($spam->is_ham($mid)) { + $score //= + spam_score($rec,$options->{spamc}, + [@{$options->{spamc_opts}}, + '-L','ham' + ] + ); + print STDERR "learning ham" if $DEBUG; } else { - $spam->add_ham($mid); + print STDERR "not learning" if $DEBUG; } + print STDERR " from $mid" if $DEBUG; }, $bug_num ); @@ -198,7 +264,9 @@ sub score_bug { my @bug_score = spam_score_bug($bug_num, $options->{spamc}, - $options->{spamc_opts}); + $options->{spamc_opts}, + $opts->{skip_seen}, + ); print "$_->{score} $_->{message_id} $_->{subject}\n" foreach @bug_score; } @@ -228,7 +296,7 @@ sub auto_spamscan { if ($is_spam) { print STDERR "it's spam ($score)\n" if $DEBUG; $spam->add_spam($mid); - } elsif ($score < $options->{ham_threshold}) { + } elsif ($score < $opts->{ham_threshold}) { print STDERR "it's really ham ($score)\n" if $DEBUG; $spam->add_ham($mid); } @@ -243,12 +311,25 @@ sub auto_spamscan { } sub spam_score_bug { - my ($bug,$spamc,$spamc_opts) = @_; + my ($bug,$spamc,$spamc_opts,$skip_seen) = @_; + my $spam; + if ($skip_seen) { + $spam = Debbugs::Log::Spam->new(bug_num => $bug) or + die "Unable to open bug log spam for $bug"; + } my @records; foreachmsg(sub { my ($bn,$rec,$mid) = @_; - my $score = + my $score; + if ($skip_seen) { + if ($spam->is_spam($mid)) { + $score = 999; + } elsif ($spam->is_ham($mid)) { + $score = -999; + } + } + $score //= spam_score($rec,$spamc,$spamc_opts); my ($subject) = $rec->{text} =~ /^Subject: *(.+)$/mi; push @records, @@ -267,6 +348,9 @@ sub spam_score { my ($score,$threshold,$report); my $is_spam = 0; eval { + $report = ''; + $score = 0; + $threshold = 5; my ($spamc_in,$spamc_out); my $old_sig = $SIG{"PIPE"}; $SIG{"PIPE"} = sub { @@ -274,7 +358,7 @@ sub spam_score { }; my $childpid = open3($spamc_in,$spamc_out,0, - $spamc,'-E',@{$spamc_opts}) or + $spamc,'-E','--headers',@{$spamc_opts}) or die "Unable to fork spamc: $!"; if (not $childpid) { die "Unable to fork spamc"; @@ -282,19 +366,28 @@ sub spam_score { print {$spamc_in} $record->{text}; close($spamc_in) or die "Unable to close spamc_in: $!"; waitpid($childpid,0); - if ($? >> 8) { + my $exit_code = $? >> 8; + if ($exit_code) { $is_spam = 1; } - my ($first_line,@report) = <$spamc_out>; - if ($DEBUG) { - print STDERR "[$?;".($? >> 8)."] "; - print STDERR $first_line,@report; - print STDERR " "; + my $in_spam_header = 0; + while (<$spamc_out>) { + if (/^X-Spam/ or (/^\s+/ and $in_spam_header)) { + $in_spam_header = 1; + $report .= $_; + if (/^X-Spam-Status: (Yes|No), score=(-?[\d\.]+) required=(-?[\d\.]+)/) { + $threshold = $3; + $score = $2; + } + } else { + $in_spam_header = 0; + } + if (/^\s*$/) { + last; + } } - if (defined $first_line) { - chomp $first_line; - ($score,$threshold) = $first_line =~ m{^(-?[\d\.]+)/(-?[\d\.]+)$}; - $report = join('',@report); + if ($DEBUG) { + print STDERR "[$exit_code] [$score/$threshold]\n$report\n"; } close($spamc_out); $SIG{"PIPE"} = $old_sig; @@ -317,17 +410,18 @@ sub foreachmsg { my ($msg_id) = record_regex($record, qr/^Message-Id:\s+<(.+)>/mi); next unless defined $msg_id; + print STDERR "examining $msg_id: " if $DEBUG; if ($msg_id =~ /$config{email_domain}$/) { - print STDERR "skipping $msg_id\n" if $DEBUG; + print STDERR "skipping\n" if $DEBUG; next; } - print STDERR "examining $msg_id: " if $DEBUG; if ($seen_msgids{$msg_id}) { print STDERR "already seen\n" if $DEBUG; next; } $seen_msgids{$msg_id}=1; $sub->($bug_num,$record,$msg_id); + print STDERR "\n" if $DEBUG; } }