]> git.donarmstrong.com Git - debbugs.git/blob - bin/debbugs-spam
bf130af1dfd92917ded5cfab4f08bd9068fe3210
[debbugs.git] / bin / debbugs-spam
1 #! /usr/bin/perl
2 # debbugs-spam is part of debbugs, and is released
3 # under the terms of the GPL version 2, or any later version, at your
4 # option. See the file README and COPYING for more information.
5 # Copyright 2012 by Don Armstrong <don@donarmstrong.com>.
6
7
8 use warnings;
9 use strict;
10
11 use Getopt::Long qw(:config no_ignore_case);
12 use Pod::Usage;
13
14 =head1 NAME
15
16 debbugs-spam -- Scan log files for spam and populate nnn.log.spam
17
18 =head1 SYNOPSIS
19
20 debbugs-spam [options] bugnumber [[bugnumber2]..]
21
22  Options:
23   --spool-dir debbugs spool directory
24   --debug, -d debugging level (Default 0)
25   --help, -h display this help
26   --man, -m display manual
27
28 =head1 OPTIONS
29
30 =over
31
32 =item B<--spool-dir>
33
34 Debbugs spool directory; defaults to the value configured in the
35 debbugs configuration file.
36
37 =item B<--debug, -d>
38
39 Debug verbosity.
40
41 =item B<--help, -h>
42
43 Display brief useage information.
44
45 =item B<--man, -m>
46
47 Display this manual.
48
49 =back
50
51 =head1 SUBCOMMANDS
52
53 =over
54
55 =item B<auto-scan>
56
57 Automatically scan messages using spamassassin and mark messages as
58 spam which hit the threshold, and those that are highly negative as
59 ham.
60
61 =item B<score>
62
63 Output the score of all of the messages in a bug
64
65 =over
66
67 =item B<--skip-seen> Skip messages which have previously been classified
68
69 =back
70
71 =item B<mark-spam>
72
73 Mark messages as spam if there is a regex match to subject or message
74 id
75
76 =item B<mark-ham>
77
78 Mark messages as ham if there is a regex match to subject or message
79 id
80
81 =item B<learn>
82
83 Learn from messages which have been marked as spam/ham
84
85 =back
86
87
88 =head1 EXAMPLES
89
90
91 =cut
92
93
94 use vars qw($DEBUG);
95
96 use Debbugs::Log qw(record_regex);
97 use Debbugs::Log::Spam;
98 use Debbugs::Config qw(:config);
99 use Debbugs::Command qw(:all);
100 use Debbugs::Common qw(getbuglocation);
101 use IPC::Open3 qw(open3);
102 use Carp;
103
104 my %options =
105     (debug   => 0,
106      help    => 0,
107      man     => 0,
108      verbose => 0,
109      quiet   => 0,
110      spamc   => 'spamc',
111      spamc_opts => [],
112     );
113
114 handle_main_arguments(\%options,
115                       'service|s',
116                       'sysconfdir|c',
117                       'spamc=s' => 0,
118                       'spamc_opts|spamc-opts=s@' => 0,
119                       'spool_dir|spool-dir=s',
120                       'quiet|q:+',
121                       'verbose|v:+',
122                       'debug|d+','help|h|?','man|m');
123
124 my %subcommands =
125     ('auto-scan' => {function => \&auto_spamscan,
126                      arguments => {'ham_threshold|ham-threshold=s' => 0,
127                                    'skip_missing|skip-missing!' => 0,
128                                   },
129                      defaults => {ham_threshold => -5,
130                                   skip_missing => 0,
131                                  },
132                     },
133      'score' => {function => \&score_bug,
134                  arguments => {'skip_seen|skip-seen!' => 0
135                               },
136                 },
137      'mark-spam' => {function => \&mark_spam,
138                     },
139      'mark-ham' => {function => \&mark_ham,
140                    },
141      'learn' => {function => \&learn,
142                 },
143      'help' => {function => sub {pod2usage({verbose => 2});}}
144     );
145
146 pod2usage() if $options{help};
147 pod2usage({verbose=>2}) if $options{man};
148
149 $DEBUG = $options{debug};
150
151 my @USAGE_ERRORS;
152 $options{verbose} = $options{verbose} - $options{quiet};
153
154 pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
155
156 my ($subcommand) = shift @ARGV;
157 if (not defined $subcommand) {
158     $subcommand = 'help';
159     print STDERR "You must provide a subcommand; displaying usage.\n";
160     pod2usage();
161 } elsif (not exists $subcommands{$subcommand}) {
162     print STDERR "$subcommand is not a valid subcommand; displaying usage.\n";
163     pod2usage();
164 }
165
166 if (exists $options{spool_dir} and defined $options{spool_dir}) {
167     $config{spool_dir} = $options{spool_dir};
168 }
169 if ($subcommand ne 'help') {
170     chdir($config{spool_dir}) or die "chdir $config{spool_dir} failed: $!";
171 }
172 my $opts =
173     handle_subcommand_arguments(\@ARGV,
174                                 $subcommands{$subcommand}{arguments},
175                                 $subcommands{$subcommand}{defaults},
176                                );
177 $subcommands{$subcommand}{function}->(\%options,$opts,\%config,\@ARGV);
178
179
180 sub mark_ham {
181     mark_it('ham',@_);
182 }
183
184 sub mark_spam {
185     mark_it('spam',@_);
186 }
187
188 sub mark_it {
189     my ($spam_ham,$options,$opts,$config,$argv) = @_;
190     my $regex = shift @{$argv};
191     for my $bug_num (@{$argv}) {
192         my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
193             die "Unable to open bug log spam for $bug_num";
194         foreachmsg(sub {
195                        my ($bn,$rec,$mid) = @_;
196                        my $body = $rec->{text};
197                        my ($subject) = $body =~ /^Subject: *(.+)$/mi;
198                        my $is_match = 0;
199                        if ($subject =~ /\Q$regex\E/) {
200                            $is_match = 1;
201                        }
202                        if ($mid =~ /\Q$regex\E/) {
203                            $is_match = 1;
204                        }
205                        if ($is_match) {
206                            print STDERR "it's a match" if $DEBUG;
207                            if ($spam_ham eq 'spam') {
208                                $spam->add_spam($mid);
209                            } else {
210                                $spam->add_ham($mid);
211                            }
212                        }
213                    },
214                    $bug_num
215                   );
216         $spam->save();
217     }
218 }
219
220
221 sub score_bug {
222     my ($options,$opts,$config,$argv) = @_;
223     for my $bug_num (@{$argv}) {
224         my @bug_score =
225             spam_score_bug($bug_num,
226                            $options->{spamc},
227                            $options->{spamc_opts},
228                            $opts->{skip_seen},
229                           );
230         print "$_->{score} $_->{message_id} $_->{subject}\n"
231             foreach @bug_score;
232     }
233 }
234
235 sub learn {
236     my ($options,$opts,$config,$argv) = @_;
237
238     for my $bug_num (@{$argv}) {
239         if ($opts->{skip_missing} and
240             not defined getbuglocation($bug_num,'log')) {
241             print STDERR "bug $bug_num does not exist\n" if $options->{verbose} > -1;
242             next;
243         }
244         my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
245             die "Unable to open bug log spam for $bug_num";
246         foreachmsg(sub {
247                        my ($bn,$rec,$mid) = @_;
248                        if ($spam->is_spam($mid)) {
249                            spamc_learn_spam($rec,$options->{spamc},$options->{spamc_opts});
250                            return;
251                        }
252                        if ($spam->is_ham($mid)) {
253                            spamc_learn_ham($rec,$options->{spamc},$options->{spamc_opts});
254                            return;
255                        }
256                    },
257                    $bug_num,
258                   );
259         $spam->save();
260     }
261 }
262
263 sub auto_spamscan {
264     my ($options,$opts,$config,$argv) = @_;
265
266     for my $bug_num (@{$argv}) {
267         if ($opts->{skip_missing} and
268             not defined getbuglocation($bug_num,'log')) {
269             print STDERR "bug $bug_num does not exist\n" if $options->{verbose} > -1;
270             next;
271         }
272         my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
273             die "Unable to open bug log spam for $bug_num";
274         foreachmsg(sub {
275                        my ($bn,$rec,$mid) = @_;
276                        if ($spam->is_spam($mid)) {
277                            print STDERR "already spam\n" if $DEBUG;
278                            return;
279                        }
280                        if ($spam->is_ham($mid)) {
281                            print STDERR "already ham\n" if $DEBUG;
282                            return;
283                        }
284                        my ($score,$is_spam,$report,$threshold) =
285                            spam_score($rec,
286                                       $options->{spamc},
287                                       $options->{spamc_opts},
288                                      );
289                        if ($is_spam) {
290                            print STDERR "it's spam ($score)\n" if $DEBUG;
291                            $spam->add_spam($mid);
292                        } elsif ($score < $opts->{ham_threshold}) {
293                            print STDERR "it's really ham ($score)\n" if $DEBUG;
294                            $spam->add_ham($mid);
295                        }
296                        else {
297                            print STDERR "it's ham ($score)\n" if $DEBUG;
298                        }
299                    },
300                    $bug_num,
301                   );
302         $spam->save();
303     }
304 }
305
306 sub spam_score_bug {
307     my ($bug,$spamc,$spamc_opts,$skip_seen) = @_;
308
309     my $spam;
310     if ($skip_seen) {
311         $spam = Debbugs::Log::Spam->new(bug_num => $bug) or
312             die "Unable to open bug log spam for $bug";
313     }
314     my @records;
315     foreachmsg(sub {
316                    my ($bn,$rec,$mid) = @_;
317                    my $score;
318                    if ($skip_seen) {
319                        if ($spam->is_spam($mid)) {
320                            $score = 999;
321                        } elsif ($spam->is_ham($mid)) {
322                            $score = -999;
323                        }
324                    }
325                    $score //=
326                        spam_score($rec,$spamc,$spamc_opts);
327                    my ($subject) = $rec->{text} =~ /^Subject: *(.+)$/mi;
328                    push @records,
329                       {message_id => $mid,
330                        score => $score,
331                        subject => $subject,
332                       };
333                },
334                $bug
335               );
336     return @records;
337 }
338
339 sub spamc_bug {
340     my ($record,$spamc,$spamc_opts) = @_;
341     my $first_line = '';
342     my $report = '';
343     my $exit_code = 0;
344     eval {
345         my ($spamc_in,$spamc_out);
346         my $old_sig = $SIG{"PIPE"};
347         $SIG{"PIPE"} = sub {
348             die "SIGPIPE in child for some reason";
349         };
350         my $childpid =
351             open3($spamc_in,$spamc_out,0,
352                   $spamc,@{$spamc_opts}) or
353                       die "Unable to fork spamc: $!";
354         if (not $childpid) {
355             die "Unable to fork spamc";
356         }
357         print {$spamc_in} $record->{text};
358         close($spamc_in) or die "Unable to close spamc_in: $!";
359         waitpid($childpid,0);
360         if ($? >> 8) {
361             $exit_code = $? >> 8;
362         }
363         local $/;
364         $report = <$spamc_out>;
365         close($spamc_out);
366         $SIG{"PIPE"} = $old_sig;
367     };
368     if ($@) {
369         carp "processing of message failed [$@]\n";
370         return undef;
371     }
372     return ($exit_code,$report);
373 }
374
375 sub spam_score {
376     my ($record,$spamc,$spamc_opts) = @_;
377     my ($score,$threshold,$report,$exit_code);
378     ($exit_code,$report) =
379         spamc_bug($record,$spamc,[@{$spamc_opts},'-c']);
380     if (defined $report) {
381         ($score,$threshold) = $report =~ s{^(-?[\d\.]+)/(-?[\d\.]+)\n?}{};
382     }
383     return wantarray?($score,$exit_code,$report):$score;
384 }
385
386 sub spamc_learn_ham {
387     spamc_learn('ham',@_);
388 }
389
390 sub spamc_learn_forget {
391     spamc_learn('forget',@_);
392 }
393
394 sub spamc_learn_spam {
395     spamc_learn('spam',@_);
396 }
397
398 sub spamc_learn {
399     my ($type,$record,$spamc,$spamc_opts) = @_;
400     spamc_bug($record,$spamc,[@{$spamc_opts},'-L',$type])
401 }
402
403 sub foreachmsg {
404     my ($sub,$bug_num) = @_;
405     my $log = Debbugs::Log->new(bug_num => $bug_num) or
406         die "Unable to open bug log for $bug_num";
407     my %seen_msgids;
408     while (my $record = $log->read_record()) {
409         next if $record->{type} eq 'html';
410         next if $record->{type} eq 'autocheck';
411         my ($msg_id) = record_regex($record,
412                                     qr/^Message-Id:\s+<(.+)>/mi);
413         next unless defined $msg_id;
414         print STDERR "examining $msg_id: " if $DEBUG;
415         if ($msg_id =~ /$config{email_domain}$/) {
416             print STDERR "skipping\n" if $DEBUG;
417             next;
418         }
419         if ($seen_msgids{$msg_id}) {
420             print STDERR "already seen\n" if $DEBUG;
421             next;
422         }
423         $seen_msgids{$msg_id}=1;
424         $sub->($bug_num,$record,$msg_id);
425         print STDERR "\n" if $DEBUG;
426     }
427 }
428
429
430 __END__
431
432 # Local Variables:
433 # cperl-indent-level: 4
434 # indent-tabs-mode: nil
435 # End: