]> git.donarmstrong.com Git - debbugs.git/blobdiff - bin/debbugs-spam
strip of the envelope from
[debbugs.git] / bin / debbugs-spam
index b65c940728e7ff853346ec388df6baf64a4f7dc2..a0d41887a605b5bdd07743f540ab1319479748e4 100755 (executable)
@@ -48,15 +48,44 @@ Display this manual.
 
 =back
 
-=head1 EXAMPLES
+=head1 SUBCOMMANDS
+
+=over
+
+=item B<auto-scan>
+
+Automatically scan messages using spamassassin and mark messages as
+spam which hit the threshold, and those that are highly negative as
+ham.
+
+=item B<score>
+
+Output the score of all of the messages in a bug
+
+=over
+
+=item B<--skip-seen> Skip messages which have previously been classified
 
-Rebuild the index.db for db-h.
+=back
+
+=item B<mark-spam>
+
+Mark messages as spam if there is a regex match to subject or message
+id
+
+=item B<mark-ham>
+
+Mark messages as ham if there is a regex match to subject or message
+id
 
- debbugs-spam;
+=item B<learn>
 
-Rebuild the index.db for archive
+Learn from messages which have been marked as spam/ham
+
+=back
 
- debbugs-spam archive;
+
+=head1 EXAMPLES
 
 
 =cut
@@ -67,7 +96,10 @@ use vars qw($DEBUG);
 use Debbugs::Log qw(record_regex);
 use Debbugs::Log::Spam;
 use Debbugs::Config qw(:config);
+use Debbugs::Command qw(:all);
+use Debbugs::Common qw(getbuglocation);
 use IPC::Open3 qw(open3);
+use Carp;
 
 my %options =
     (debug   => 0,
@@ -75,20 +107,45 @@ my %options =
      man     => 0,
      verbose => 0,
      quiet   => 0,
-     quick   => 0,
      spamc   => 'spamc',
      spamc_opts => [],
     );
 
-
-GetOptions(\%options,
-           'quick|q',
-           'service|s',
-           'sysconfdir|c',
-           'spool_dir|spool-dir=s',
-           'spamc=s',
-           'spamc_opts|spamc-opts=s@',
-           'debug|d+','help|h|?','man|m');
+handle_main_arguments(\%options,
+                      'service|s',
+                      'sysconfdir|c',
+                      'spamc=s' => 0,
+                      'spamc_opts|spamc-opts=s@' => 0,
+                      'spool_dir|spool-dir=s',
+                      'quiet|q:+',
+                      'verbose|v:+',
+                      'debug|d+','help|h|?','man|m');
+
+my %subcommands =
+    ('auto-scan' => {function => \&auto_spamscan,
+                     arguments => {'ham_threshold|ham-threshold=s' => 0,
+                                   'skip_missing|skip-missing!' => 0,
+                                  },
+                     defaults => {ham_threshold => -5,
+                                  skip_missing => 0,
+                                 },
+                    },
+     'score' => {function => \&score_bug,
+                 arguments => {'skip_seen|skip-seen!' => 0
+                              },
+                },
+     'mark-spam' => {function => \&mark_spam,
+                    },
+     'mark-ham' => {function => \&mark_ham,
+                   },
+     'learn' => {function => \&learn,
+                 arguments => {'skip_missing|skip-missing!' => 0,
+                              },
+                 defaults => {skip_missing => 0,
+                             },
+                },
+     'help' => {function => sub {pod2usage({verbose => 2});}}
+    );
 
 pod2usage() if $options{help};
 pod2usage({verbose=>2}) if $options{man};
@@ -98,86 +155,290 @@ $DEBUG = $options{debug};
 my @USAGE_ERRORS;
 $options{verbose} = $options{verbose} - $options{quiet};
 
-if (not @ARGV) {
-    push @USAGE_ERRORS,
-        "You must provide a bug number to examine\n";
-}
-
 pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
 
+my ($subcommand) = shift @ARGV;
+if (not defined $subcommand) {
+    $subcommand = 'help';
+    print STDERR "You must provide a subcommand; displaying usage.\n";
+    pod2usage();
+} elsif (not exists $subcommands{$subcommand}) {
+    print STDERR "$subcommand is not a valid subcommand; displaying usage.\n";
+    pod2usage();
+}
+
 if (exists $options{spool_dir} and defined $options{spool_dir}) {
     $config{spool_dir} = $options{spool_dir};
 }
-chdir($config{spool_dir}) or die "chdir $config{spool_dir} failed: $!";
+if ($subcommand ne 'help') {
+    chdir($config{spool_dir}) or die "chdir $config{spool_dir} failed: $!";
+}
+my $opts =
+    handle_subcommand_arguments(\@ARGV,
+                                $subcommands{$subcommand}{arguments},
+                                $subcommands{$subcommand}{defaults},
+                               );
+$subcommands{$subcommand}{function}->(\%options,$opts,\%config,\@ARGV);
+
+
+sub mark_ham {
+    mark_it('ham',@_);
+}
+
+sub mark_spam {
+    mark_it('spam',@_);
+}
+
+sub mark_it {
+    my ($spam_ham,$options,$opts,$config,$argv) = @_;
+    my $regex = shift @{$argv};
+    for my $bug_num (@{$argv}) {
+        my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
+            die "Unable to open bug log spam for $bug_num";
+        foreachmsg(sub {
+                       my ($bn,$rec,$mid) = @_;
+                       my $body = $rec->{text};
+                       my ($subject) = $body =~ /^Subject: *(.+)$/mi;
+                       my $is_match = 0;
+                       if ($subject =~ /\Q$regex\E/) {
+                           $is_match = 1;
+                       }
+                       if ($mid =~ /\Q$regex\E/) {
+                           $is_match = 1;
+                       }
+                       if ($is_match) {
+                           print STDERR "it's a match" if $DEBUG;
+                           if ($spam_ham eq 'spam') {
+                               $spam->add_spam($mid);
+                           } else {
+                               $spam->add_ham($mid);
+                           }
+                       }
+                   },
+                   $bug_num
+                  );
+        $spam->save();
+    }
+}
+
+
+sub score_bug {
+    my ($options,$opts,$config,$argv) = @_;
+    for my $bug_num (@{$argv}) {
+        my @bug_score =
+            spam_score_bug($bug_num,
+                           $options->{spamc},
+                           $options->{spamc_opts},
+                           $opts->{skip_seen},
+                          );
+        print "$_->{score} $_->{message_id} $_->{subject}\n"
+            foreach @bug_score;
+    }
+}
+
+sub learn {
+    my ($options,$opts,$config,$argv) = @_;
+
+    for my $bug_num (@{$argv}) {
+        if ($opts->{skip_missing} and
+            not defined getbuglocation($bug_num,'log')) {
+            print STDERR "bug $bug_num does not exist\n" if $options->{verbose} > -1;
+            next;
+        }
+        my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
+            die "Unable to open bug log spam for $bug_num";
+        foreachmsg(sub {
+                       my ($bn,$rec,$mid) = @_;
+                       if ($spam->is_spam($mid)) {
+                           spamc_learn_spam($rec,$options->{spamc},$options->{spamc_opts});
+                           return;
+                       }
+                       if ($spam->is_ham($mid)) {
+                           spamc_learn_ham($rec,$options->{spamc},$options->{spamc_opts});
+                           return;
+                       }
+                   },
+                   $bug_num,
+                  );
+        $spam->save();
+    }
+}
+
+sub auto_spamscan {
+    my ($options,$opts,$config,$argv) = @_;
+
+    for my $bug_num (@{$argv}) {
+        if ($opts->{skip_missing} and
+            not defined getbuglocation($bug_num,'log')) {
+            print STDERR "bug $bug_num does not exist\n" if $options->{verbose} > -1;
+            next;
+        }
+        my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
+            die "Unable to open bug log spam for $bug_num";
+        foreachmsg(sub {
+                       my ($bn,$rec,$mid) = @_;
+                       if ($spam->is_spam($mid)) {
+                           print STDERR "already spam\n" if $DEBUG;
+                           return;
+                       }
+                       if ($spam->is_ham($mid)) {
+                           print STDERR "already ham\n" if $DEBUG;
+                           return;
+                       }
+                       my ($score,$is_spam,$report,$threshold) =
+                           spam_score($rec,
+                                      $options->{spamc},
+                                      $options->{spamc_opts},
+                                     );
+                       if ($is_spam) {
+                           print STDERR "it's spam ($score)\n" if $DEBUG;
+                           $spam->add_spam($mid);
+                       } elsif ($score < $opts->{ham_threshold}) {
+                           print STDERR "it's really ham ($score)\n" if $DEBUG;
+                           $spam->add_ham($mid);
+                       }
+                       else {
+                           print STDERR "it's ham ($score)\n" if $DEBUG;
+                       }
+                   },
+                   $bug_num,
+                  );
+        $spam->save();
+    }
+}
+
+sub spam_score_bug {
+    my ($bug,$spamc,$spamc_opts,$skip_seen) = @_;
+
+    my $spam;
+    if ($skip_seen) {
+        $spam = Debbugs::Log::Spam->new(bug_num => $bug) or
+            die "Unable to open bug log spam for $bug";
+    }
+    my @records;
+    foreachmsg(sub {
+                   my ($bn,$rec,$mid) = @_;
+                   my $score;
+                   if ($skip_seen) {
+                       if ($spam->is_spam($mid)) {
+                           $score = 999;
+                       } elsif ($spam->is_ham($mid)) {
+                           $score = -999;
+                       }
+                   }
+                   $score //=
+                       spam_score($rec,$spamc,$spamc_opts);
+                   my ($subject) = $rec->{text} =~ /^Subject: *(.+)$/mi;
+                   push @records,
+                      {message_id => $mid,
+                       score => $score,
+                       subject => $subject,
+                      };
+               },
+               $bug
+              );
+    return @records;
+}
+
+sub add_return_path {
+    my ($message) = @_;
+    ## debbugs is kind of odd, and puts "Received:" first, them "From
+    ## ", and doesn't have a Return-Path. Fix that up so spamassassin
+    ## is happy.
+    $message =~
+        s{^(Received: \(at \S+\) by \S+;[^\n]+\n)(From (\S+) [^\n]+\n)}
+        {Return-path: $3\n$1};
+    return $message;
+}
+
+sub spamc_bug {
+    my ($record,$spamc,$spamc_opts) = @_;
+    my $first_line = '';
+    my $report = '';
+    my $exit_code = 0;
+    eval {
+        my ($spamc_in,$spamc_out);
+        my $old_sig = $SIG{"PIPE"};
+        $SIG{"PIPE"} = sub {
+            die "SIGPIPE in child for some reason";
+        };
+        my $childpid =
+            open3($spamc_in,$spamc_out,0,
+                  $spamc,@{$spamc_opts}) or
+                      die "Unable to fork spamc: $!";
+        if (not $childpid) {
+            die "Unable to fork spamc";
+        }
+        print STDERR add_return_path($record->{text}) if $DEBUG > 1;
+        print {$spamc_in} add_return_path($record->{text});
+        close($spamc_in) or die "Unable to close spamc_in: $!";
+        waitpid($childpid,0);
+        if ($? >> 8) {
+            $exit_code = $? >> 8;
+        }
+        local $/;
+        $report = <$spamc_out>;
+        close($spamc_out);
+        $SIG{"PIPE"} = $old_sig;
+    };
+    if ($@) {
+        carp "processing of message failed [$@]\n";
+        return undef;
+    }
+    return ($exit_code,$report);
+}
+
+sub spam_score {
+    my ($record,$spamc,$spamc_opts) = @_;
+    my ($score,$threshold,$report,$exit_code);
+    ($exit_code,$report) =
+        spamc_bug($record,$spamc,[@{$spamc_opts},'-c']);
+    if (defined $report) {
+        ($score,$threshold) = $report =~ s{^(-?[\d\.]+)/(-?[\d\.]+)\n?}{};
+    }
+    return wantarray?($score,$exit_code,$report):$score;
+}
+
+sub spamc_learn_ham {
+    spamc_learn('ham',@_);
+}
+
+sub spamc_learn_forget {
+    spamc_learn('forget',@_);
+}
+
+sub spamc_learn_spam {
+    spamc_learn('spam',@_);
+}
+
+sub spamc_learn {
+    my ($type,$record,$spamc,$spamc_opts) = @_;
+    spamc_bug($record,$spamc,[@{$spamc_opts},'-L',$type])
+}
 
-for my $bug_num (@ARGV) {
+sub foreachmsg {
+    my ($sub,$bug_num) = @_;
     my $log = Debbugs::Log->new(bug_num => $bug_num) or
         die "Unable to open bug log for $bug_num";
-    my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
-        die "Unable to open bug log spam for $bug_num";
-
     my %seen_msgids;
     while (my $record = $log->read_record()) {
-        next if $record->{type} eq 'html';
-        next if $record->{type} eq 'autocheck';
+        next unless $record->{type} eq 'incoming-recv';
         my ($msg_id) = record_regex($record,
                                     qr/^Message-Id:\s+<(.+)>/mi);
         next unless defined $msg_id;
+        print STDERR "examining $msg_id: " if $DEBUG;
         if ($msg_id =~ /$config{email_domain}$/) {
-            print STDERR "skipping $msg_id\n" if $DEBUG;
+            print STDERR "skipping\n" if $DEBUG;
             next;
         }
-        print STDERR "examining $msg_id: " if $DEBUG;
         if ($seen_msgids{$msg_id}) {
             print STDERR "already seen\n" if $DEBUG;
             next;
         }
         $seen_msgids{$msg_id}=1;
-        if ($spam->is_spam($msg_id)) {
-            print STDERR "already spam\n" if $DEBUG;
-            next;
-        }
-        my $is_spam;
-        eval {
-            my ($spamc,$child_out);
-            my $old_sig = $SIG{"PIPE"};
-            $SIG{"PIPE"} = sub {
-                die "SIGPIPE in child for some reason";
-            };
-            my $childpid =
-                open3($spamc,$child_out,0,
-                      $options{spamc},'-E',@{$options{spamc_opts}}) or
-                          die "Unable to fork spamc: $!";
-            if (not $childpid) {
-                die "Unable to fork spamc";
-            }
-            print {$spamc} $record->{text};
-            close($spamc) or die "Unable to close spamc: $!";
-            waitpid($childpid,0);
-            if ($DEBUG) {
-                print STDERR "[$?;".($? >> 8)."] ";
-                print STDERR map {s/\n//; $_ } <$child_out>;
-                print STDERR " ";
-            }
-            close($child_out);
-            $SIG{"PIPE"} = $old_sig;
-            if ($? >> 8) {
-                $is_spam = 1;
-            }
-        };
-        if ($@) {
-            print STDERR "processing of $msg_id failed [$@]\n";
-        } else {
-            if ($is_spam) {
-                print STDERR "it's spam\n" if $DEBUG;
-                $spam->add_spam($msg_id);
-            }
-            else {
-                print STDERR "it's ham\n" if $DEBUG;
-            }
-        }
+        $sub->($bug_num,$record,$msg_id);
+        print STDERR "\n" if $DEBUG;
     }
-    $spam->save();
 }