]> git.donarmstrong.com Git - debbugs.git/blob - bin/debbugs-spam
09077e0acccac2a0c842caf0df07b6cea79ab520
[debbugs.git] / bin / debbugs-spam
1 #! /usr/bin/perl
2 # debbugs-spam is part of debbugs, and is released
3 # under the terms of the GPL version 2, or any later version, at your
4 # option. See the file README and COPYING for more information.
5 # Copyright 2012 by Don Armstrong <don@donarmstrong.com>.
6
7
8 use warnings;
9 use strict;
10
11 use Getopt::Long qw(:config no_ignore_case);
12 use Pod::Usage;
13
14 =head1 NAME
15
16 debbugs-spam -- Scan log files for spam and populate nnn.log.spam
17
18 =head1 SYNOPSIS
19
20 debbugs-spam [options] bugnumber [[bugnumber2]..]
21
22  Options:
23   --spool-dir debbugs spool directory
24   --debug, -d debugging level (Default 0)
25   --help, -h display this help
26   --man, -m display manual
27
28 =head1 OPTIONS
29
30 =over
31
32 =item B<--spool-dir>
33
34 Debbugs spool directory; defaults to the value configured in the
35 debbugs configuration file.
36
37 =item B<--debug, -d>
38
39 Debug verbosity.
40
41 =item B<--help, -h>
42
43 Display brief useage information.
44
45 =item B<--man, -m>
46
47 Display this manual.
48
49 =back
50
51 =head1 SUBCOMMANDS
52
53 =over
54
55 =item B<auto-scan>
56
57 Automatically scan messages using spamassassin and mark messages as
58 spam which hit the threshold, and those that are highly negative as
59 ham.
60
61 =item B<score>
62
63 Output the score of all of the messages in a bug
64
65 =over
66
67 =item B<--skip-seen> Skip messages which have previously been classified
68
69 =back
70
71 =item B<mark-spam>
72
73 Mark messages as spam if there is a regex match to subject or message
74 id
75
76 =item B<mark-ham>
77
78 Mark messages as ham if there is a regex match to subject or message
79 id
80
81 =item B<learn>
82
83 Learn from messages which are ham/spam
84
85 =back
86
87
88 =head1 EXAMPLES
89
90 Start spamd:
91
92   /usr/sbin/spamd --socketpath=/home/debbugs/spamd_socket \
93       --nouser-config --cf='include /home/debbugs/.spamassassin/user_prefs' \
94       --cf='allow_user_rules 1' --allow-tell;
95
96 Then score bugs:
97
98   debbugs-spam --spamc-opts '-U' --spamc-opts '/home/debbugs/spamd_socket' \
99       score 859123;
100
101 =cut
102
103
104 use vars qw($DEBUG);
105
106 use Debbugs::Log qw(record_regex);
107 use Debbugs::Log::Spam;
108 use Debbugs::Config qw(:config);
109 use Debbugs::Command qw(:all);
110 use IPC::Open3 qw(open3);
111 use Carp;
112
113 my %options =
114     (debug   => 0,
115      help    => 0,
116      man     => 0,
117      verbose => 0,
118      quiet   => 0,
119      quick   => 0,
120      spamc   => 'spamc',
121      spamc_opts => [],
122     );
123
124 handle_main_arguments(\%options,
125                       'quick|q',
126                       'service|s',
127                       'sysconfdir|c',
128                       'spamc=s' => 0,
129                       'spamc_opts|spamc-opts=s@' => 0,
130                       'spool_dir|spool-dir=s',
131                       'debug|d+','help|h|?','man|m');
132
133 my %subcommands =
134     ('auto-scan' => {function => \&auto_spamscan,
135                      arguments => {'ham_threshold|ham-threshold=s' => 0,
136                                   },
137                      defaults => {ham_threshold => -5},
138                     },
139      'score' => {function => \&score_bug,
140                  arguments => {'skip_seen|skip-seen!' => 0
141                               },
142                 },
143      'mark-spam' => {function => \&mark_spam,
144                     },
145      'mark-ham' => {function => \&mark_ham,
146                    },
147      'learn' => {function => \&learn,
148                 },
149      'help' => {function => sub {pod2usage({verbose => 2});}}
150     );
151
152 pod2usage() if $options{help};
153 pod2usage({verbose=>2}) if $options{man};
154
155 $DEBUG = $options{debug};
156
157 my @USAGE_ERRORS;
158 $options{verbose} = $options{verbose} - $options{quiet};
159
160 pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
161
162 my ($subcommand) = shift @ARGV;
163 if (not defined $subcommand) {
164     $subcommand = 'help';
165     print STDERR "You must provide a subcommand; displaying usage.\n";
166     pod2usage();
167 } elsif (not exists $subcommands{$subcommand}) {
168     print STDERR "$subcommand is not a valid subcommand; displaying usage.\n";
169     pod2usage();
170 }
171
172 if (exists $options{spool_dir} and defined $options{spool_dir}) {
173     $config{spool_dir} = $options{spool_dir};
174 }
175 if ($subcommand ne 'help') {
176     chdir($config{spool_dir}) or die "chdir $config{spool_dir} failed: $!";
177 }
178 my $opts =
179     handle_subcommand_arguments(\@ARGV,
180                                 $subcommands{$subcommand}{arguments},
181                                 $subcommands{$subcommand}{defaults},
182                                );
183 $subcommands{$subcommand}{function}->(\%options,$opts,\%config,\@ARGV);
184
185
186 sub mark_ham {
187     mark_it('ham',@_);
188 }
189
190 sub mark_spam {
191     mark_it('spam',@_);
192 }
193
194 sub mark_it {
195     my ($spam_ham,$options,$opts,$config,$argv) = @_;
196     my $regex = shift @{$argv};
197     for my $bug_num (@{$argv}) {
198         my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
199             die "Unable to open bug log spam for $bug_num";
200         foreachmsg(sub {
201                        my ($bn,$rec,$mid) = @_;
202                        my $body = $rec->{text};
203                        my ($subject) = $body =~ /^Subject: *(.+)$/mi;
204                        my $is_match = 0;
205                        if ($subject =~ /$regex/) {
206                            $is_match = 1;
207                        }
208                        if ($mid =~ /$regex/) {
209                            $is_match = 1;
210                        }
211                        if ($is_match) {
212                            print STDERR "it's a match" if $DEBUG;
213                            if ($spam_ham eq 'spam') {
214                                $spam->add_spam($mid);
215                            } else {
216                                $spam->add_ham($mid);
217                            }
218                        }
219                    },
220                    $bug_num
221                   );
222         $spam->save();
223     }
224 }
225
226 sub learn {
227     my ($options,$opts,$config,$argv) = @_;
228     for my $bug_num (@{$argv}) {
229         my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
230             die "Unable to open bug log spam for $bug_num";
231         foreachmsg(sub {
232                        my ($bn,$rec,$mid) = @_;
233                        my $score;
234                        if ($spam->is_spam($mid)) {
235                            $score //=
236                                spam_score($rec,$options->{spamc},
237                                           [@{$options->{spamc_opts}},
238                                            '-L','spam'
239                                           ]
240                                          );
241                            print STDERR "learning spam" if $DEBUG;
242                        } elsif ($spam->is_ham($mid)) {
243                            $score //=
244                                spam_score($rec,$options->{spamc},
245                                           [@{$options->{spamc_opts}},
246                                            '-L','ham'
247                                           ]
248                                          );
249                            print STDERR "learning ham" if $DEBUG;
250                        } else {
251                            print STDERR "not learning" if $DEBUG;
252                        }
253                        print STDERR " from $mid" if $DEBUG;
254                    },
255                    $bug_num
256                   );
257     }
258 }
259
260
261 sub score_bug {
262     my ($options,$opts,$config,$argv) = @_;
263     for my $bug_num (@{$argv}) {
264         my @bug_score =
265             spam_score_bug($bug_num,
266                            $options->{spamc},
267                            $options->{spamc_opts},
268                            $opts->{skip_seen},
269                           );
270         print "$_->{score} $_->{message_id} $_->{subject}\n"
271             foreach @bug_score;
272     }
273 }
274
275 sub auto_spamscan {
276     my ($options,$opts,$config,$argv) = @_;
277
278     for my $bug_num (@{$argv}) {
279         my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
280             die "Unable to open bug log spam for $bug_num";
281         foreachmsg(sub {
282                        my ($bn,$rec,$mid) = @_;
283                        if ($spam->is_spam($mid)) {
284                            print STDERR "already spam\n" if $DEBUG;
285                            return;
286                        }
287                        if ($spam->is_ham($mid)) {
288                            print STDERR "already ham\n" if $DEBUG;
289                            return;
290                        }
291                        my ($score,$is_spam,$report,$threshold) =
292                            spam_score($rec,
293                                       $options->{spamc},
294                                       $options->{spamc_opts},
295                                      );
296                        if ($is_spam) {
297                            print STDERR "it's spam ($score)\n" if $DEBUG;
298                            $spam->add_spam($mid);
299                        } elsif ($score < $opts->{ham_threshold}) {
300                            print STDERR "it's really ham ($score)\n" if $DEBUG;
301                            $spam->add_ham($mid);
302                        }
303                        else {
304                            print STDERR "it's ham ($score)\n" if $DEBUG;
305                        }
306                    },
307                    $bug_num,
308                   );
309         $spam->save();
310     }
311 }
312
313 sub spam_score_bug {
314     my ($bug,$spamc,$spamc_opts,$skip_seen) = @_;
315
316     my $spam;
317     if ($skip_seen) {
318         $spam = Debbugs::Log::Spam->new(bug_num => $bug) or
319             die "Unable to open bug log spam for $bug";
320     }
321     my @records;
322     foreachmsg(sub {
323                    my ($bn,$rec,$mid) = @_;
324                    my $score;
325                    if ($skip_seen) {
326                        if ($spam->is_spam($mid)) {
327                            $score = 999;
328                        } elsif ($spam->is_ham($mid)) {
329                            $score = -999;
330                        }
331                    }
332                    $score //=
333                        spam_score($rec,$spamc,$spamc_opts);
334                    my ($subject) = $rec->{text} =~ /^Subject: *(.+)$/mi;
335                    push @records,
336                       {message_id => $mid,
337                        score => $score,
338                        subject => $subject,
339                       };
340                },
341                $bug
342               );
343     return @records;
344 }
345
346 sub spam_score {
347     my ($record,$spamc,$spamc_opts) = @_;
348     my ($score,$threshold,$report);
349     my $is_spam = 0;
350     eval {
351         $report = '';
352         $score = 0;
353         $threshold = 5;
354         my ($spamc_in,$spamc_out);
355         my $old_sig = $SIG{"PIPE"};
356         $SIG{"PIPE"} = sub {
357             die "SIGPIPE in child for some reason";
358         };
359         my $childpid =
360             open3($spamc_in,$spamc_out,0,
361                   $spamc,'-E','--headers',@{$spamc_opts}) or
362                       die "Unable to fork spamc: $!";
363         if (not $childpid) {
364             die "Unable to fork spamc";
365         }
366         print {$spamc_in} $record->{text};
367         close($spamc_in) or die "Unable to close spamc_in: $!";
368         waitpid($childpid,0);
369         my $exit_code = $? >> 8;
370         if ($exit_code) {
371             $is_spam = 1;
372         }
373         while (<$spamc_out>) {
374             if (/^X-Spam/) {
375                 $report .= $_;
376                 if (/^X-Spam-Status: (Yes|No), score=(-?[\d\.]+) required=(-?[\d\.]+)/) {
377                     $threshold = $3;
378                     $score = $2;
379                 }
380             }
381             if (/^\s*$/) {
382                 last;
383             }
384         }
385         if ($DEBUG) {
386             print STDERR "[$exit_code] [$score/$threshold]\n$report\n";
387         }
388         close($spamc_out);
389         $SIG{"PIPE"} = $old_sig;
390     };
391     if ($@) {
392         carp "processing of message failed [$@]\n";
393         return undef;
394     }
395     return wantarray?($score,$is_spam,$report):$score;
396 }
397
398 sub foreachmsg {
399     my ($sub,$bug_num) = @_;
400     my $log = Debbugs::Log->new(bug_num => $bug_num) or
401         die "Unable to open bug log for $bug_num";
402     my %seen_msgids;
403     while (my $record = $log->read_record()) {
404         next if $record->{type} eq 'html';
405         next if $record->{type} eq 'autocheck';
406         my ($msg_id) = record_regex($record,
407                                     qr/^Message-Id:\s+<(.+)>/mi);
408         next unless defined $msg_id;
409         print STDERR "examining $msg_id: " if $DEBUG;
410         if ($msg_id =~ /$config{email_domain}$/) {
411             print STDERR "skipping\n" if $DEBUG;
412             next;
413         }
414         if ($seen_msgids{$msg_id}) {
415             print STDERR "already seen\n" if $DEBUG;
416             next;
417         }
418         $seen_msgids{$msg_id}=1;
419         $sub->($bug_num,$record,$msg_id);
420         print STDERR "\n" if $DEBUG;
421     }
422 }
423
424
425 __END__
426
427 # Local Variables:
428 # cperl-indent-level: 4
429 # indent-tabs-mode: nil
430 # End: