X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bin%2Fdebbugs-spam;h=f416fb339386bf0a5c3e281ff59d025ce9f9930b;hb=0e67d0e2daba4319f2d2e6ff368a9ba3e39789ed;hp=b65c940728e7ff853346ec388df6baf64a4f7dc2;hpb=4c6b11d4632b6f9e5858757b4f687b8a712441f4;p=debbugs.git diff --git a/bin/debbugs-spam b/bin/debbugs-spam index b65c940..f416fb3 100755 --- a/bin/debbugs-spam +++ b/bin/debbugs-spam @@ -48,15 +48,40 @@ Display this manual. =back -=head1 EXAMPLES +=head1 SUBCOMMANDS + +=over + +=item B -Rebuild the index.db for db-h. +Automatically scan messages using spamassassin and mark messages as +spam which hit the threshold, and those that are highly negative as +ham. - debbugs-spam; +=item B -Rebuild the index.db for archive +Output the score of all of the messages in a bug - debbugs-spam archive; +=over + +=item B<--skip-seen> Skip messages which have previously been classified + +=back + +=item B + +Mark messages as spam if there is a regex match to subject or message +id + +=item B + +Mark messages as ham if there is a regex match to subject or message +id + +=back + + +=head1 EXAMPLES =cut @@ -67,7 +92,9 @@ use vars qw($DEBUG); use Debbugs::Log qw(record_regex); use Debbugs::Log::Spam; use Debbugs::Config qw(:config); +use Debbugs::Command qw(:all); use IPC::Open3 qw(open3); +use Carp; my %options = (debug => 0, @@ -80,15 +107,31 @@ my %options = spamc_opts => [], ); - -GetOptions(\%options, - 'quick|q', - 'service|s', - 'sysconfdir|c', - 'spool_dir|spool-dir=s', - 'spamc=s', - 'spamc_opts|spamc-opts=s@', - 'debug|d+','help|h|?','man|m'); +handle_main_arguments(\%options, + 'quick|q', + 'service|s', + 'sysconfdir|c', + 'spamc=s' => 0, + 'spamc_opts|spamc-opts=s@' => 0, + 'spool_dir|spool-dir=s', + 'debug|d+','help|h|?','man|m'); + +my %subcommands = + ('auto-scan' => {function => \&auto_spamscan, + arguments => {'ham_threshold|ham-threshold=s' => 0, + }, + defaults => {ham_threshold => -5}, + }, + 'score' => {function => \&score_bug, + arguments => {'skip_seen|skip-seen!' => 0 + }, + }, + 'mark-spam' => {function => \&mark_spam, + }, + 'mark-ham' => {function => \&mark_ham, + }, + 'help' => {function => sub {pod2usage({verbose => 2});}} + ); pod2usage() if $options{help}; pod2usage({verbose=>2}) if $options{man}; @@ -98,24 +141,202 @@ $DEBUG = $options{debug}; my @USAGE_ERRORS; $options{verbose} = $options{verbose} - $options{quiet}; -if (not @ARGV) { - push @USAGE_ERRORS, - "You must provide a bug number to examine\n"; -} - pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS; +my ($subcommand) = shift @ARGV; +if (not defined $subcommand) { + $subcommand = 'help'; + print STDERR "You must provide a subcommand; displaying usage.\n"; + pod2usage(); +} elsif (not exists $subcommands{$subcommand}) { + print STDERR "$subcommand is not a valid subcommand; displaying usage.\n"; + pod2usage(); +} + if (exists $options{spool_dir} and defined $options{spool_dir}) { $config{spool_dir} = $options{spool_dir}; } -chdir($config{spool_dir}) or die "chdir $config{spool_dir} failed: $!"; +if ($subcommand ne 'help') { + chdir($config{spool_dir}) or die "chdir $config{spool_dir} failed: $!"; +} +my $opts = + handle_subcommand_arguments(\@ARGV, + $subcommands{$subcommand}{arguments}, + $subcommands{$subcommand}{defaults}, + ); +$subcommands{$subcommand}{function}->(\%options,$opts,\%config,\@ARGV); + + +sub mark_ham { + mark_it('ham',@_); +} + +sub mark_spam { + mark_it('spam',@_); +} -for my $bug_num (@ARGV) { +sub mark_it { + my ($spam_ham,$options,$opts,$config,$argv) = @_; + my $regex = shift @{$argv}; + for my $bug_num (@{$argv}) { + my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or + die "Unable to open bug log spam for $bug_num"; + foreachmsg(sub { + my ($bn,$rec,$mid) = @_; + my $body = $rec->{text}; + my ($subject) = $body =~ /^Subject: *(.+)$/mi; + my $is_match = 0; + if ($subject =~ /\Q$regex\E/) { + $is_match = 1; + } + if ($mid =~ /\Q$regex\E/) { + $is_match = 1; + } + if ($spam_ham eq 'spam') { + $spam->add_spam($mid); + } else { + $spam->add_ham($mid); + } + }, + $bug_num + ); + } +} + + +sub score_bug { + my ($options,$opts,$config,$argv) = @_; + for my $bug_num (@{$argv}) { + my @bug_score = + spam_score_bug($bug_num, + $options->{spamc}, + $options->{spamc_opts}, + $opts->{skip_seen}, + ); + print "$_->{score} $_->{message_id} $_->{subject}\n" + foreach @bug_score; + } +} + +sub auto_spamscan { + my ($options,$opts,$config,$argv) = @_; + + for my $bug_num (@{$argv}) { + my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or + die "Unable to open bug log spam for $bug_num"; + foreachmsg(sub { + my ($bn,$rec,$mid) = @_; + if ($spam->is_spam($mid)) { + print STDERR "already spam\n" if $DEBUG; + return; + } + if ($spam->is_ham($mid)) { + print STDERR "already ham\n" if $DEBUG; + return; + } + my ($score,$is_spam,$report,$threshold) = + spam_score($rec, + $options->{spamc}, + $options->{spamc_opts}, + ); + if ($is_spam) { + print STDERR "it's spam ($score)\n" if $DEBUG; + $spam->add_spam($mid); + } elsif ($score < $opts->{ham_threshold}) { + print STDERR "it's really ham ($score)\n" if $DEBUG; + $spam->add_ham($mid); + } + else { + print STDERR "it's ham ($score)\n" if $DEBUG; + } + }, + $bug_num, + ); + $spam->save(); + } +} + +sub spam_score_bug { + my ($bug,$spamc,$spamc_opts,$skip_seen) = @_; + + my $spam; + if ($skip_seen) { + $spam = Debbugs::Log::Spam->new(bug_num => $bug) or + die "Unable to open bug log spam for $bug"; + } + my @records; + foreachmsg(sub { + my ($bn,$rec,$mid) = @_; + my $score; + if ($skip_seen) { + if ($spam->is_spam($mid)) { + $score = 999; + } elsif ($spam->is_ham($mid)) { + $score = -999; + } + } + $score //= + spam_score($rec,$spamc,$spamc_opts); + my ($subject) = $rec->{text} =~ /^Subject: *(.+)$/mi; + push @records, + {message_id => $mid, + score => $score, + subject => $subject, + }; + }, + $bug + ); + return @records; +} + +sub spam_score { + my ($record,$spamc,$spamc_opts) = @_; + my ($score,$threshold,$report); + my $is_spam = 0; + eval { + my ($spamc_in,$spamc_out); + my $old_sig = $SIG{"PIPE"}; + $SIG{"PIPE"} = sub { + die "SIGPIPE in child for some reason"; + }; + my $childpid = + open3($spamc_in,$spamc_out,0, + $spamc,'-E',@{$spamc_opts}) or + die "Unable to fork spamc: $!"; + if (not $childpid) { + die "Unable to fork spamc"; + } + print {$spamc_in} $record->{text}; + close($spamc_in) or die "Unable to close spamc_in: $!"; + waitpid($childpid,0); + if ($? >> 8) { + $is_spam = 1; + } + my ($first_line,@report) = <$spamc_out>; + if ($DEBUG) { + print STDERR "[$?;".($? >> 8)."] "; + print STDERR $first_line,@report; + print STDERR " "; + } + if (defined $first_line) { + chomp $first_line; + ($score,$threshold) = $first_line =~ m{^(-?[\d\.]+)/(-?[\d\.]+)$}; + $report = join('',@report); + } + close($spamc_out); + $SIG{"PIPE"} = $old_sig; + }; + if ($@) { + carp "processing of message failed [$@]\n"; + return undef; + } + return wantarray?($score,$is_spam,$report):$score; +} + +sub foreachmsg { + my ($sub,$bug_num) = @_; my $log = Debbugs::Log->new(bug_num => $bug_num) or die "Unable to open bug log for $bug_num"; - my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or - die "Unable to open bug log spam for $bug_num"; - my %seen_msgids; while (my $record = $log->read_record()) { next if $record->{type} eq 'html'; @@ -123,61 +344,19 @@ for my $bug_num (@ARGV) { my ($msg_id) = record_regex($record, qr/^Message-Id:\s+<(.+)>/mi); next unless defined $msg_id; + print STDERR "examining $msg_id: " if $DEBUG; if ($msg_id =~ /$config{email_domain}$/) { - print STDERR "skipping $msg_id\n" if $DEBUG; + print STDERR "skipping\n" if $DEBUG; next; } - print STDERR "examining $msg_id: " if $DEBUG; if ($seen_msgids{$msg_id}) { print STDERR "already seen\n" if $DEBUG; next; } $seen_msgids{$msg_id}=1; - if ($spam->is_spam($msg_id)) { - print STDERR "already spam\n" if $DEBUG; - next; - } - my $is_spam; - eval { - my ($spamc,$child_out); - my $old_sig = $SIG{"PIPE"}; - $SIG{"PIPE"} = sub { - die "SIGPIPE in child for some reason"; - }; - my $childpid = - open3($spamc,$child_out,0, - $options{spamc},'-E',@{$options{spamc_opts}}) or - die "Unable to fork spamc: $!"; - if (not $childpid) { - die "Unable to fork spamc"; - } - print {$spamc} $record->{text}; - close($spamc) or die "Unable to close spamc: $!"; - waitpid($childpid,0); - if ($DEBUG) { - print STDERR "[$?;".($? >> 8)."] "; - print STDERR map {s/\n//; $_ } <$child_out>; - print STDERR " "; - } - close($child_out); - $SIG{"PIPE"} = $old_sig; - if ($? >> 8) { - $is_spam = 1; - } - }; - if ($@) { - print STDERR "processing of $msg_id failed [$@]\n"; - } else { - if ($is_spam) { - print STDERR "it's spam\n" if $DEBUG; - $spam->add_spam($msg_id); - } - else { - print STDERR "it's ham\n" if $DEBUG; - } - } + $sub->($bug_num,$record,$msg_id); + print STDERR "\n" if $DEBUG; } - $spam->save(); }