]> git.donarmstrong.com Git - debbugs.git/blob - bin/debbugs-spam
strip of the envelope from
[debbugs.git] / bin / debbugs-spam
1 #! /usr/bin/perl
2 # debbugs-spam is part of debbugs, and is released
3 # under the terms of the GPL version 2, or any later version, at your
4 # option. See the file README and COPYING for more information.
5 # Copyright 2012 by Don Armstrong <don@donarmstrong.com>.
6
7
8 use warnings;
9 use strict;
10
11 use Getopt::Long qw(:config no_ignore_case);
12 use Pod::Usage;
13
14 =head1 NAME
15
16 debbugs-spam -- Scan log files for spam and populate nnn.log.spam
17
18 =head1 SYNOPSIS
19
20 debbugs-spam [options] bugnumber [[bugnumber2]..]
21
22  Options:
23   --spool-dir debbugs spool directory
24   --debug, -d debugging level (Default 0)
25   --help, -h display this help
26   --man, -m display manual
27
28 =head1 OPTIONS
29
30 =over
31
32 =item B<--spool-dir>
33
34 Debbugs spool directory; defaults to the value configured in the
35 debbugs configuration file.
36
37 =item B<--debug, -d>
38
39 Debug verbosity.
40
41 =item B<--help, -h>
42
43 Display brief useage information.
44
45 =item B<--man, -m>
46
47 Display this manual.
48
49 =back
50
51 =head1 SUBCOMMANDS
52
53 =over
54
55 =item B<auto-scan>
56
57 Automatically scan messages using spamassassin and mark messages as
58 spam which hit the threshold, and those that are highly negative as
59 ham.
60
61 =item B<score>
62
63 Output the score of all of the messages in a bug
64
65 =over
66
67 =item B<--skip-seen> Skip messages which have previously been classified
68
69 =back
70
71 =item B<mark-spam>
72
73 Mark messages as spam if there is a regex match to subject or message
74 id
75
76 =item B<mark-ham>
77
78 Mark messages as ham if there is a regex match to subject or message
79 id
80
81 =item B<learn>
82
83 Learn from messages which have been marked as spam/ham
84
85 =back
86
87
88 =head1 EXAMPLES
89
90
91 =cut
92
93
94 use vars qw($DEBUG);
95
96 use Debbugs::Log qw(record_regex);
97 use Debbugs::Log::Spam;
98 use Debbugs::Config qw(:config);
99 use Debbugs::Command qw(:all);
100 use Debbugs::Common qw(getbuglocation);
101 use IPC::Open3 qw(open3);
102 use Carp;
103
104 my %options =
105     (debug   => 0,
106      help    => 0,
107      man     => 0,
108      verbose => 0,
109      quiet   => 0,
110      spamc   => 'spamc',
111      spamc_opts => [],
112     );
113
114 handle_main_arguments(\%options,
115                       'service|s',
116                       'sysconfdir|c',
117                       'spamc=s' => 0,
118                       'spamc_opts|spamc-opts=s@' => 0,
119                       'spool_dir|spool-dir=s',
120                       'quiet|q:+',
121                       'verbose|v:+',
122                       'debug|d+','help|h|?','man|m');
123
124 my %subcommands =
125     ('auto-scan' => {function => \&auto_spamscan,
126                      arguments => {'ham_threshold|ham-threshold=s' => 0,
127                                    'skip_missing|skip-missing!' => 0,
128                                   },
129                      defaults => {ham_threshold => -5,
130                                   skip_missing => 0,
131                                  },
132                     },
133      'score' => {function => \&score_bug,
134                  arguments => {'skip_seen|skip-seen!' => 0
135                               },
136                 },
137      'mark-spam' => {function => \&mark_spam,
138                     },
139      'mark-ham' => {function => \&mark_ham,
140                    },
141      'learn' => {function => \&learn,
142                  arguments => {'skip_missing|skip-missing!' => 0,
143                               },
144                  defaults => {skip_missing => 0,
145                              },
146                 },
147      'help' => {function => sub {pod2usage({verbose => 2});}}
148     );
149
150 pod2usage() if $options{help};
151 pod2usage({verbose=>2}) if $options{man};
152
153 $DEBUG = $options{debug};
154
155 my @USAGE_ERRORS;
156 $options{verbose} = $options{verbose} - $options{quiet};
157
158 pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
159
160 my ($subcommand) = shift @ARGV;
161 if (not defined $subcommand) {
162     $subcommand = 'help';
163     print STDERR "You must provide a subcommand; displaying usage.\n";
164     pod2usage();
165 } elsif (not exists $subcommands{$subcommand}) {
166     print STDERR "$subcommand is not a valid subcommand; displaying usage.\n";
167     pod2usage();
168 }
169
170 if (exists $options{spool_dir} and defined $options{spool_dir}) {
171     $config{spool_dir} = $options{spool_dir};
172 }
173 if ($subcommand ne 'help') {
174     chdir($config{spool_dir}) or die "chdir $config{spool_dir} failed: $!";
175 }
176 my $opts =
177     handle_subcommand_arguments(\@ARGV,
178                                 $subcommands{$subcommand}{arguments},
179                                 $subcommands{$subcommand}{defaults},
180                                );
181 $subcommands{$subcommand}{function}->(\%options,$opts,\%config,\@ARGV);
182
183
184 sub mark_ham {
185     mark_it('ham',@_);
186 }
187
188 sub mark_spam {
189     mark_it('spam',@_);
190 }
191
192 sub mark_it {
193     my ($spam_ham,$options,$opts,$config,$argv) = @_;
194     my $regex = shift @{$argv};
195     for my $bug_num (@{$argv}) {
196         my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
197             die "Unable to open bug log spam for $bug_num";
198         foreachmsg(sub {
199                        my ($bn,$rec,$mid) = @_;
200                        my $body = $rec->{text};
201                        my ($subject) = $body =~ /^Subject: *(.+)$/mi;
202                        my $is_match = 0;
203                        if ($subject =~ /\Q$regex\E/) {
204                            $is_match = 1;
205                        }
206                        if ($mid =~ /\Q$regex\E/) {
207                            $is_match = 1;
208                        }
209                        if ($is_match) {
210                            print STDERR "it's a match" if $DEBUG;
211                            if ($spam_ham eq 'spam') {
212                                $spam->add_spam($mid);
213                            } else {
214                                $spam->add_ham($mid);
215                            }
216                        }
217                    },
218                    $bug_num
219                   );
220         $spam->save();
221     }
222 }
223
224
225 sub score_bug {
226     my ($options,$opts,$config,$argv) = @_;
227     for my $bug_num (@{$argv}) {
228         my @bug_score =
229             spam_score_bug($bug_num,
230                            $options->{spamc},
231                            $options->{spamc_opts},
232                            $opts->{skip_seen},
233                           );
234         print "$_->{score} $_->{message_id} $_->{subject}\n"
235             foreach @bug_score;
236     }
237 }
238
239 sub learn {
240     my ($options,$opts,$config,$argv) = @_;
241
242     for my $bug_num (@{$argv}) {
243         if ($opts->{skip_missing} and
244             not defined getbuglocation($bug_num,'log')) {
245             print STDERR "bug $bug_num does not exist\n" if $options->{verbose} > -1;
246             next;
247         }
248         my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
249             die "Unable to open bug log spam for $bug_num";
250         foreachmsg(sub {
251                        my ($bn,$rec,$mid) = @_;
252                        if ($spam->is_spam($mid)) {
253                            spamc_learn_spam($rec,$options->{spamc},$options->{spamc_opts});
254                            return;
255                        }
256                        if ($spam->is_ham($mid)) {
257                            spamc_learn_ham($rec,$options->{spamc},$options->{spamc_opts});
258                            return;
259                        }
260                    },
261                    $bug_num,
262                   );
263         $spam->save();
264     }
265 }
266
267 sub auto_spamscan {
268     my ($options,$opts,$config,$argv) = @_;
269
270     for my $bug_num (@{$argv}) {
271         if ($opts->{skip_missing} and
272             not defined getbuglocation($bug_num,'log')) {
273             print STDERR "bug $bug_num does not exist\n" if $options->{verbose} > -1;
274             next;
275         }
276         my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
277             die "Unable to open bug log spam for $bug_num";
278         foreachmsg(sub {
279                        my ($bn,$rec,$mid) = @_;
280                        if ($spam->is_spam($mid)) {
281                            print STDERR "already spam\n" if $DEBUG;
282                            return;
283                        }
284                        if ($spam->is_ham($mid)) {
285                            print STDERR "already ham\n" if $DEBUG;
286                            return;
287                        }
288                        my ($score,$is_spam,$report,$threshold) =
289                            spam_score($rec,
290                                       $options->{spamc},
291                                       $options->{spamc_opts},
292                                      );
293                        if ($is_spam) {
294                            print STDERR "it's spam ($score)\n" if $DEBUG;
295                            $spam->add_spam($mid);
296                        } elsif ($score < $opts->{ham_threshold}) {
297                            print STDERR "it's really ham ($score)\n" if $DEBUG;
298                            $spam->add_ham($mid);
299                        }
300                        else {
301                            print STDERR "it's ham ($score)\n" if $DEBUG;
302                        }
303                    },
304                    $bug_num,
305                   );
306         $spam->save();
307     }
308 }
309
310 sub spam_score_bug {
311     my ($bug,$spamc,$spamc_opts,$skip_seen) = @_;
312
313     my $spam;
314     if ($skip_seen) {
315         $spam = Debbugs::Log::Spam->new(bug_num => $bug) or
316             die "Unable to open bug log spam for $bug";
317     }
318     my @records;
319     foreachmsg(sub {
320                    my ($bn,$rec,$mid) = @_;
321                    my $score;
322                    if ($skip_seen) {
323                        if ($spam->is_spam($mid)) {
324                            $score = 999;
325                        } elsif ($spam->is_ham($mid)) {
326                            $score = -999;
327                        }
328                    }
329                    $score //=
330                        spam_score($rec,$spamc,$spamc_opts);
331                    my ($subject) = $rec->{text} =~ /^Subject: *(.+)$/mi;
332                    push @records,
333                       {message_id => $mid,
334                        score => $score,
335                        subject => $subject,
336                       };
337                },
338                $bug
339               );
340     return @records;
341 }
342
343 sub add_return_path {
344     my ($message) = @_;
345     ## debbugs is kind of odd, and puts "Received:" first, them "From
346     ## ", and doesn't have a Return-Path. Fix that up so spamassassin
347     ## is happy.
348     $message =~
349         s{^(Received: \(at \S+\) by \S+;[^\n]+\n)(From (\S+) [^\n]+\n)}
350         {Return-path: $3\n$1};
351     return $message;
352 }
353
354 sub spamc_bug {
355     my ($record,$spamc,$spamc_opts) = @_;
356     my $first_line = '';
357     my $report = '';
358     my $exit_code = 0;
359     eval {
360         my ($spamc_in,$spamc_out);
361         my $old_sig = $SIG{"PIPE"};
362         $SIG{"PIPE"} = sub {
363             die "SIGPIPE in child for some reason";
364         };
365         my $childpid =
366             open3($spamc_in,$spamc_out,0,
367                   $spamc,@{$spamc_opts}) or
368                       die "Unable to fork spamc: $!";
369         if (not $childpid) {
370             die "Unable to fork spamc";
371         }
372         print STDERR add_return_path($record->{text}) if $DEBUG > 1;
373         print {$spamc_in} add_return_path($record->{text});
374         close($spamc_in) or die "Unable to close spamc_in: $!";
375         waitpid($childpid,0);
376         if ($? >> 8) {
377             $exit_code = $? >> 8;
378         }
379         local $/;
380         $report = <$spamc_out>;
381         close($spamc_out);
382         $SIG{"PIPE"} = $old_sig;
383     };
384     if ($@) {
385         carp "processing of message failed [$@]\n";
386         return undef;
387     }
388     return ($exit_code,$report);
389 }
390
391 sub spam_score {
392     my ($record,$spamc,$spamc_opts) = @_;
393     my ($score,$threshold,$report,$exit_code);
394     ($exit_code,$report) =
395         spamc_bug($record,$spamc,[@{$spamc_opts},'-c']);
396     if (defined $report) {
397         ($score,$threshold) = $report =~ s{^(-?[\d\.]+)/(-?[\d\.]+)\n?}{};
398     }
399     return wantarray?($score,$exit_code,$report):$score;
400 }
401
402 sub spamc_learn_ham {
403     spamc_learn('ham',@_);
404 }
405
406 sub spamc_learn_forget {
407     spamc_learn('forget',@_);
408 }
409
410 sub spamc_learn_spam {
411     spamc_learn('spam',@_);
412 }
413
414 sub spamc_learn {
415     my ($type,$record,$spamc,$spamc_opts) = @_;
416     spamc_bug($record,$spamc,[@{$spamc_opts},'-L',$type])
417 }
418
419 sub foreachmsg {
420     my ($sub,$bug_num) = @_;
421     my $log = Debbugs::Log->new(bug_num => $bug_num) or
422         die "Unable to open bug log for $bug_num";
423     my %seen_msgids;
424     while (my $record = $log->read_record()) {
425         next unless $record->{type} eq 'incoming-recv';
426         my ($msg_id) = record_regex($record,
427                                     qr/^Message-Id:\s+<(.+)>/mi);
428         next unless defined $msg_id;
429         print STDERR "examining $msg_id: " if $DEBUG;
430         if ($msg_id =~ /$config{email_domain}$/) {
431             print STDERR "skipping\n" if $DEBUG;
432             next;
433         }
434         if ($seen_msgids{$msg_id}) {
435             print STDERR "already seen\n" if $DEBUG;
436             next;
437         }
438         $seen_msgids{$msg_id}=1;
439         $sub->($bug_num,$record,$msg_id);
440         print STDERR "\n" if $DEBUG;
441     }
442 }
443
444
445 __END__
446
447 # Local Variables:
448 # cperl-indent-level: 4
449 # indent-tabs-mode: nil
450 # End: