]> git.donarmstrong.com Git - debbugs.git/blob - bin/debbugs-spam
remove quick option and add quiet option
[debbugs.git] / bin / debbugs-spam
1 #! /usr/bin/perl
2 # debbugs-spam is part of debbugs, and is released
3 # under the terms of the GPL version 2, or any later version, at your
4 # option. See the file README and COPYING for more information.
5 # Copyright 2012 by Don Armstrong <don@donarmstrong.com>.
6
7
8 use warnings;
9 use strict;
10
11 use Getopt::Long qw(:config no_ignore_case);
12 use Pod::Usage;
13
14 =head1 NAME
15
16 debbugs-spam -- Scan log files for spam and populate nnn.log.spam
17
18 =head1 SYNOPSIS
19
20 debbugs-spam [options] bugnumber [[bugnumber2]..]
21
22  Options:
23   --spool-dir debbugs spool directory
24   --debug, -d debugging level (Default 0)
25   --help, -h display this help
26   --man, -m display manual
27
28 =head1 OPTIONS
29
30 =over
31
32 =item B<--spool-dir>
33
34 Debbugs spool directory; defaults to the value configured in the
35 debbugs configuration file.
36
37 =item B<--debug, -d>
38
39 Debug verbosity.
40
41 =item B<--help, -h>
42
43 Display brief useage information.
44
45 =item B<--man, -m>
46
47 Display this manual.
48
49 =back
50
51 =head1 SUBCOMMANDS
52
53 =over
54
55 =item B<auto-scan>
56
57 Automatically scan messages using spamassassin and mark messages as
58 spam which hit the threshold, and those that are highly negative as
59 ham.
60
61 =item B<score>
62
63 Output the score of all of the messages in a bug
64
65 =over
66
67 =item B<--skip-seen> Skip messages which have previously been classified
68
69 =back
70
71 =item B<mark-spam>
72
73 Mark messages as spam if there is a regex match to subject or message
74 id
75
76 =item B<mark-ham>
77
78 Mark messages as ham if there is a regex match to subject or message
79 id
80
81 =back
82
83
84 =head1 EXAMPLES
85
86
87 =cut
88
89
90 use vars qw($DEBUG);
91
92 use Debbugs::Log qw(record_regex);
93 use Debbugs::Log::Spam;
94 use Debbugs::Config qw(:config);
95 use Debbugs::Command qw(:all);
96 use Debbugs::Common qw(getbuglocation);
97 use IPC::Open3 qw(open3);
98 use Carp;
99
100 my %options =
101     (debug   => 0,
102      help    => 0,
103      man     => 0,
104      verbose => 0,
105      quiet   => 0,
106      spamc   => 'spamc',
107      spamc_opts => [],
108     );
109
110 handle_main_arguments(\%options,
111                       'service|s',
112                       'sysconfdir|c',
113                       'spamc=s' => 0,
114                       'spamc_opts|spamc-opts=s@' => 0,
115                       'spool_dir|spool-dir=s',
116                       'quiet|q:+',
117                       'verbose|v:+',
118                       'debug|d+','help|h|?','man|m');
119
120 my %subcommands =
121     ('auto-scan' => {function => \&auto_spamscan,
122                      arguments => {'ham_threshold|ham-threshold=s' => 0,
123                                    'skip_missing|skip-missing!' => 0,
124                                   },
125                      defaults => {ham_threshold => -5,
126                                   skip_missing => 0,
127                                  },
128                     },
129      'score' => {function => \&score_bug,
130                  arguments => {'skip_seen|skip-seen!' => 0
131                               },
132                 },
133      'mark-spam' => {function => \&mark_spam,
134                     },
135      'mark-ham' => {function => \&mark_ham,
136                    },
137      'help' => {function => sub {pod2usage({verbose => 2});}}
138     );
139
140 pod2usage() if $options{help};
141 pod2usage({verbose=>2}) if $options{man};
142
143 $DEBUG = $options{debug};
144
145 my @USAGE_ERRORS;
146 $options{verbose} = $options{verbose} - $options{quiet};
147
148 pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
149
150 my ($subcommand) = shift @ARGV;
151 if (not defined $subcommand) {
152     $subcommand = 'help';
153     print STDERR "You must provide a subcommand; displaying usage.\n";
154     pod2usage();
155 } elsif (not exists $subcommands{$subcommand}) {
156     print STDERR "$subcommand is not a valid subcommand; displaying usage.\n";
157     pod2usage();
158 }
159
160 if (exists $options{spool_dir} and defined $options{spool_dir}) {
161     $config{spool_dir} = $options{spool_dir};
162 }
163 if ($subcommand ne 'help') {
164     chdir($config{spool_dir}) or die "chdir $config{spool_dir} failed: $!";
165 }
166 my $opts =
167     handle_subcommand_arguments(\@ARGV,
168                                 $subcommands{$subcommand}{arguments},
169                                 $subcommands{$subcommand}{defaults},
170                                );
171 $subcommands{$subcommand}{function}->(\%options,$opts,\%config,\@ARGV);
172
173
174 sub mark_ham {
175     mark_it('ham',@_);
176 }
177
178 sub mark_spam {
179     mark_it('spam',@_);
180 }
181
182 sub mark_it {
183     my ($spam_ham,$options,$opts,$config,$argv) = @_;
184     my $regex = shift @{$argv};
185     for my $bug_num (@{$argv}) {
186         my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
187             die "Unable to open bug log spam for $bug_num";
188         foreachmsg(sub {
189                        my ($bn,$rec,$mid) = @_;
190                        my $body = $rec->{text};
191                        my ($subject) = $body =~ /^Subject: *(.+)$/mi;
192                        my $is_match = 0;
193                        if ($subject =~ /\Q$regex\E/) {
194                            $is_match = 1;
195                        }
196                        if ($mid =~ /\Q$regex\E/) {
197                            $is_match = 1;
198                        }
199                        if ($is_match) {
200                            print STDERR "it's a match" if $DEBUG;
201                            if ($spam_ham eq 'spam') {
202                                $spam->add_spam($mid);
203                            } else {
204                                $spam->add_ham($mid);
205                            }
206                        }
207                    },
208                    $bug_num
209                   );
210         $spam->save();
211     }
212 }
213
214
215 sub score_bug {
216     my ($options,$opts,$config,$argv) = @_;
217     for my $bug_num (@{$argv}) {
218         my @bug_score =
219             spam_score_bug($bug_num,
220                            $options->{spamc},
221                            $options->{spamc_opts},
222                            $opts->{skip_seen},
223                           );
224         print "$_->{score} $_->{message_id} $_->{subject}\n"
225             foreach @bug_score;
226     }
227 }
228
229 sub auto_spamscan {
230     my ($options,$opts,$config,$argv) = @_;
231
232     for my $bug_num (@{$argv}) {
233         if ($opts->{skip_missing} and
234             not defined getbuglocation($bug_num,'log')) {
235             print STDERR "bug $bug_num does not exist\n" if $options->{verbose} > -1;
236             next;
237         }
238         my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
239             die "Unable to open bug log spam for $bug_num";
240         foreachmsg(sub {
241                        my ($bn,$rec,$mid) = @_;
242                        if ($spam->is_spam($mid)) {
243                            print STDERR "already spam\n" if $DEBUG;
244                            return;
245                        }
246                        if ($spam->is_ham($mid)) {
247                            print STDERR "already ham\n" if $DEBUG;
248                            return;
249                        }
250                        my ($score,$is_spam,$report,$threshold) =
251                            spam_score($rec,
252                                       $options->{spamc},
253                                       $options->{spamc_opts},
254                                      );
255                        if ($is_spam) {
256                            print STDERR "it's spam ($score)\n" if $DEBUG;
257                            $spam->add_spam($mid);
258                        } elsif ($score < $opts->{ham_threshold}) {
259                            print STDERR "it's really ham ($score)\n" if $DEBUG;
260                            $spam->add_ham($mid);
261                        }
262                        else {
263                            print STDERR "it's ham ($score)\n" if $DEBUG;
264                        }
265                    },
266                    $bug_num,
267                   );
268         $spam->save();
269     }
270 }
271
272 sub spam_score_bug {
273     my ($bug,$spamc,$spamc_opts,$skip_seen) = @_;
274
275     my $spam;
276     if ($skip_seen) {
277         $spam = Debbugs::Log::Spam->new(bug_num => $bug) or
278             die "Unable to open bug log spam for $bug";
279     }
280     my @records;
281     foreachmsg(sub {
282                    my ($bn,$rec,$mid) = @_;
283                    my $score;
284                    if ($skip_seen) {
285                        if ($spam->is_spam($mid)) {
286                            $score = 999;
287                        } elsif ($spam->is_ham($mid)) {
288                            $score = -999;
289                        }
290                    }
291                    $score //=
292                        spam_score($rec,$spamc,$spamc_opts);
293                    my ($subject) = $rec->{text} =~ /^Subject: *(.+)$/mi;
294                    push @records,
295                       {message_id => $mid,
296                        score => $score,
297                        subject => $subject,
298                       };
299                },
300                $bug
301               );
302     return @records;
303 }
304
305 sub spam_score {
306     my ($record,$spamc,$spamc_opts) = @_;
307     my ($score,$threshold,$report);
308     my $is_spam = 0;
309     eval {
310         my ($spamc_in,$spamc_out);
311         my $old_sig = $SIG{"PIPE"};
312         $SIG{"PIPE"} = sub {
313             die "SIGPIPE in child for some reason";
314         };
315         my $childpid =
316             open3($spamc_in,$spamc_out,0,
317                   $spamc,'-E',@{$spamc_opts}) or
318                       die "Unable to fork spamc: $!";
319         if (not $childpid) {
320             die "Unable to fork spamc";
321         }
322         print {$spamc_in} $record->{text};
323         close($spamc_in) or die "Unable to close spamc_in: $!";
324         waitpid($childpid,0);
325         if ($? >> 8) {
326             $is_spam = 1;
327         }
328         my ($first_line,@report) = <$spamc_out>;
329         if ($DEBUG) {
330             print STDERR "[$?;".($? >> 8)."] ";
331             print STDERR $first_line,@report;
332             print STDERR " ";
333         }
334         if (defined $first_line) {
335             chomp $first_line;
336             ($score,$threshold) = $first_line =~ m{^(-?[\d\.]+)/(-?[\d\.]+)$};
337             $report = join('',@report);
338         }
339         close($spamc_out);
340         $SIG{"PIPE"} = $old_sig;
341     };
342     if ($@) {
343         carp "processing of message failed [$@]\n";
344         return undef;
345     }
346     return wantarray?($score,$is_spam,$report):$score;
347 }
348
349 sub foreachmsg {
350     my ($sub,$bug_num) = @_;
351     my $log = Debbugs::Log->new(bug_num => $bug_num) or
352         die "Unable to open bug log for $bug_num";
353     my %seen_msgids;
354     while (my $record = $log->read_record()) {
355         next if $record->{type} eq 'html';
356         next if $record->{type} eq 'autocheck';
357         my ($msg_id) = record_regex($record,
358                                     qr/^Message-Id:\s+<(.+)>/mi);
359         next unless defined $msg_id;
360         print STDERR "examining $msg_id: " if $DEBUG;
361         if ($msg_id =~ /$config{email_domain}$/) {
362             print STDERR "skipping\n" if $DEBUG;
363             next;
364         }
365         if ($seen_msgids{$msg_id}) {
366             print STDERR "already seen\n" if $DEBUG;
367             next;
368         }
369         $seen_msgids{$msg_id}=1;
370         $sub->($bug_num,$record,$msg_id);
371         print STDERR "\n" if $DEBUG;
372     }
373 }
374
375
376 __END__
377
378 # Local Variables:
379 # cperl-indent-level: 4
380 # indent-tabs-mode: nil
381 # End: