]> git.donarmstrong.com Git - debbugs.git/blob - bin/debbugs-spam
add learn subcommand and examples
[debbugs.git] / bin / debbugs-spam
1 #! /usr/bin/perl
2 # debbugs-spam is part of debbugs, and is released
3 # under the terms of the GPL version 2, or any later version, at your
4 # option. See the file README and COPYING for more information.
5 # Copyright 2012 by Don Armstrong <don@donarmstrong.com>.
6
7
8 use warnings;
9 use strict;
10
11 use Getopt::Long qw(:config no_ignore_case);
12 use Pod::Usage;
13
14 =head1 NAME
15
16 debbugs-spam -- Scan log files for spam and populate nnn.log.spam
17
18 =head1 SYNOPSIS
19
20 debbugs-spam [options] bugnumber [[bugnumber2]..]
21
22  Options:
23   --spool-dir debbugs spool directory
24   --debug, -d debugging level (Default 0)
25   --help, -h display this help
26   --man, -m display manual
27
28 =head1 OPTIONS
29
30 =over
31
32 =item B<--spool-dir>
33
34 Debbugs spool directory; defaults to the value configured in the
35 debbugs configuration file.
36
37 =item B<--debug, -d>
38
39 Debug verbosity.
40
41 =item B<--help, -h>
42
43 Display brief useage information.
44
45 =item B<--man, -m>
46
47 Display this manual.
48
49 =back
50
51 =head1 SUBCOMMANDS
52
53 =over
54
55 =item B<auto-scan>
56
57 Automatically scan messages using spamassassin and mark messages as
58 spam which hit the threshold, and those that are highly negative as
59 ham.
60
61 =item B<score>
62
63 Output the score of all of the messages in a bug
64
65 =over
66
67 =item B<--skip-seen> Skip messages which have previously been classified
68
69 =back
70
71 =item B<mark-spam>
72
73 Mark messages as spam if there is a regex match to subject or message
74 id
75
76 =item B<mark-ham>
77
78 Mark messages as ham if there is a regex match to subject or message
79 id
80
81 =item B<learn>
82
83 Learn from messages which are ham/spam
84
85 =back
86
87
88 =head1 EXAMPLES
89
90 Start spamd:
91
92   /usr/sbin/spamd --socketpath=/home/debbugs/spamd_socket \
93       --nouser-config --cf='include /home/debbugs/.spamassassin/user_prefs' \
94       --cf='allow_user_rules 1' --allow-tell;
95
96 Then score bugs:
97
98   debbugs-spam --spamc-opts '-U' --spamc-opts '/home/debbugs/spamd_socket' \
99       score 859123;
100
101 =cut
102
103
104 use vars qw($DEBUG);
105
106 use Debbugs::Log qw(record_regex);
107 use Debbugs::Log::Spam;
108 use Debbugs::Config qw(:config);
109 use Debbugs::Command qw(:all);
110 use IPC::Open3 qw(open3);
111 use Carp;
112
113 my %options =
114     (debug   => 0,
115      help    => 0,
116      man     => 0,
117      verbose => 0,
118      quiet   => 0,
119      quick   => 0,
120      spamc   => 'spamc',
121      spamc_opts => [],
122     );
123
124 handle_main_arguments(\%options,
125                       'quick|q',
126                       'service|s',
127                       'sysconfdir|c',
128                       'spamc=s' => 0,
129                       'spamc_opts|spamc-opts=s@' => 0,
130                       'spool_dir|spool-dir=s',
131                       'debug|d+','help|h|?','man|m');
132
133 my %subcommands =
134     ('auto-scan' => {function => \&auto_spamscan,
135                      arguments => {'ham_threshold|ham-threshold=s' => 0,
136                                   },
137                      defaults => {ham_threshold => -5},
138                     },
139      'score' => {function => \&score_bug,
140                  arguments => {'skip_seen|skip-seen!' => 0
141                               },
142                 },
143      'mark-spam' => {function => \&mark_spam,
144                     },
145      'mark-ham' => {function => \&mark_ham,
146                    },
147      'learn' => {fuction => \&learn,
148                 },
149      'help' => {function => sub {pod2usage({verbose => 2});}}
150     );
151
152 pod2usage() if $options{help};
153 pod2usage({verbose=>2}) if $options{man};
154
155 $DEBUG = $options{debug};
156
157 my @USAGE_ERRORS;
158 $options{verbose} = $options{verbose} - $options{quiet};
159
160 pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
161
162 my ($subcommand) = shift @ARGV;
163 if (not defined $subcommand) {
164     $subcommand = 'help';
165     print STDERR "You must provide a subcommand; displaying usage.\n";
166     pod2usage();
167 } elsif (not exists $subcommands{$subcommand}) {
168     print STDERR "$subcommand is not a valid subcommand; displaying usage.\n";
169     pod2usage();
170 }
171
172 if (exists $options{spool_dir} and defined $options{spool_dir}) {
173     $config{spool_dir} = $options{spool_dir};
174 }
175 if ($subcommand ne 'help') {
176     chdir($config{spool_dir}) or die "chdir $config{spool_dir} failed: $!";
177 }
178 my $opts =
179     handle_subcommand_arguments(\@ARGV,
180                                 $subcommands{$subcommand}{arguments},
181                                 $subcommands{$subcommand}{defaults},
182                                );
183 $subcommands{$subcommand}{function}->(\%options,$opts,\%config,\@ARGV);
184
185
186 sub mark_ham {
187     mark_it('ham',@_);
188 }
189
190 sub mark_spam {
191     mark_it('spam',@_);
192 }
193
194 sub mark_it {
195     my ($spam_ham,$options,$opts,$config,$argv) = @_;
196     my $regex = shift @{$argv};
197     for my $bug_num (@{$argv}) {
198         my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
199             die "Unable to open bug log spam for $bug_num";
200         foreachmsg(sub {
201                        my ($bn,$rec,$mid) = @_;
202                        my $body = $rec->{text};
203                        my ($subject) = $body =~ /^Subject: *(.+)$/mi;
204                        my $is_match = 0;
205                        if ($subject =~ /\Q$regex\E/) {
206                            $is_match = 1;
207                        }
208                        if ($mid =~ /\Q$regex\E/) {
209                            $is_match = 1;
210                        }
211                        if ($is_match) {
212                            print STDERR "it's a match" if $DEBUG;
213                            if ($spam_ham eq 'spam') {
214                                $spam->add_spam($mid);
215                            } else {
216                                $spam->add_ham($mid);
217                            }
218                        }
219                    },
220                    $bug_num
221                   );
222         $spam->save();
223     }
224 }
225
226 sub learn {
227     my ($options,$opts,$config,$argv) = @_;
228     for my $bug_num (@{$argv}) {
229         my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
230             die "Unable to open bug log spam for $bug_num";
231         foreachmsg(sub {
232                        my ($bn,$rec,$mid) = @_;
233                        my $score;
234                        if ($spam->is_spam($mid)) {
235                            $score //=
236                                spam_score($rec,$options->{spamc},
237                                           [@{$options->{spamc_opts}},
238                                            '-L','spam'
239                                           ]
240                                          );
241                            print STDERR "learning spam" if $DEBUG;
242                        } elsif ($spam->is_ham($mid)) {
243                            $score //=
244                                spam_score($rec,$options->{spamc},
245                                           [@{$options->{spamc_opts}},
246                                            '-L','ham'
247                                           ]
248                                          );
249                            print STDERR "learning ham" if $DEBUG;
250                        } else {
251                            print STDERR "not learning" if $DEBUG;
252                        }
253                        print STDERR " from $mid" if $DEBUG;
254                    },
255                    $bug_num
256                   );
257     }
258 }
259
260
261 sub score_bug {
262     my ($options,$opts,$config,$argv) = @_;
263     for my $bug_num (@{$argv}) {
264         my @bug_score =
265             spam_score_bug($bug_num,
266                            $options->{spamc},
267                            $options->{spamc_opts},
268                            $opts->{skip_seen},
269                           );
270         print "$_->{score} $_->{message_id} $_->{subject}\n"
271             foreach @bug_score;
272     }
273 }
274
275 sub auto_spamscan {
276     my ($options,$opts,$config,$argv) = @_;
277
278     for my $bug_num (@{$argv}) {
279         my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
280             die "Unable to open bug log spam for $bug_num";
281         foreachmsg(sub {
282                        my ($bn,$rec,$mid) = @_;
283                        if ($spam->is_spam($mid)) {
284                            print STDERR "already spam\n" if $DEBUG;
285                            return;
286                        }
287                        if ($spam->is_ham($mid)) {
288                            print STDERR "already ham\n" if $DEBUG;
289                            return;
290                        }
291                        my ($score,$is_spam,$report,$threshold) =
292                            spam_score($rec,
293                                       $options->{spamc},
294                                       $options->{spamc_opts},
295                                      );
296                        if ($is_spam) {
297                            print STDERR "it's spam ($score)\n" if $DEBUG;
298                            $spam->add_spam($mid);
299                        } elsif ($score < $opts->{ham_threshold}) {
300                            print STDERR "it's really ham ($score)\n" if $DEBUG;
301                            $spam->add_ham($mid);
302                        }
303                        else {
304                            print STDERR "it's ham ($score)\n" if $DEBUG;
305                        }
306                    },
307                    $bug_num,
308                   );
309         $spam->save();
310     }
311 }
312
313 sub spam_score_bug {
314     my ($bug,$spamc,$spamc_opts,$skip_seen) = @_;
315
316     my $spam;
317     if ($skip_seen) {
318         $spam = Debbugs::Log::Spam->new(bug_num => $bug) or
319             die "Unable to open bug log spam for $bug";
320     }
321     my @records;
322     foreachmsg(sub {
323                    my ($bn,$rec,$mid) = @_;
324                    my $score;
325                    if ($skip_seen) {
326                        if ($spam->is_spam($mid)) {
327                            $score = 999;
328                        } elsif ($spam->is_ham($mid)) {
329                            $score = -999;
330                        }
331                    }
332                    $score //=
333                        spam_score($rec,$spamc,$spamc_opts);
334                    my ($subject) = $rec->{text} =~ /^Subject: *(.+)$/mi;
335                    push @records,
336                       {message_id => $mid,
337                        score => $score,
338                        subject => $subject,
339                       };
340                },
341                $bug
342               );
343     return @records;
344 }
345
346 sub spam_score {
347     my ($record,$spamc,$spamc_opts) = @_;
348     my ($score,$threshold,$report);
349     my $is_spam = 0;
350     eval {
351         my ($spamc_in,$spamc_out);
352         my $old_sig = $SIG{"PIPE"};
353         $SIG{"PIPE"} = sub {
354             die "SIGPIPE in child for some reason";
355         };
356         my $childpid =
357             open3($spamc_in,$spamc_out,0,
358                   $spamc,'-E',@{$spamc_opts}) or
359                       die "Unable to fork spamc: $!";
360         if (not $childpid) {
361             die "Unable to fork spamc";
362         }
363         print {$spamc_in} $record->{text};
364         close($spamc_in) or die "Unable to close spamc_in: $!";
365         waitpid($childpid,0);
366         if ($? >> 8) {
367             $is_spam = 1;
368         }
369         my ($first_line,@report) = <$spamc_out>;
370         if ($DEBUG) {
371             print STDERR "[$?;".($? >> 8)."] ";
372             print STDERR $first_line,@report;
373             print STDERR " ";
374         }
375         if (defined $first_line) {
376             chomp $first_line;
377             ($score,$threshold) = $first_line =~ m{^(-?[\d\.]+)/(-?[\d\.]+)$};
378             $report = join('',@report);
379         }
380         close($spamc_out);
381         $SIG{"PIPE"} = $old_sig;
382     };
383     if ($@) {
384         carp "processing of message failed [$@]\n";
385         return undef;
386     }
387     return wantarray?($score,$is_spam,$report):$score;
388 }
389
390 sub foreachmsg {
391     my ($sub,$bug_num) = @_;
392     my $log = Debbugs::Log->new(bug_num => $bug_num) or
393         die "Unable to open bug log for $bug_num";
394     my %seen_msgids;
395     while (my $record = $log->read_record()) {
396         next if $record->{type} eq 'html';
397         next if $record->{type} eq 'autocheck';
398         my ($msg_id) = record_regex($record,
399                                     qr/^Message-Id:\s+<(.+)>/mi);
400         next unless defined $msg_id;
401         print STDERR "examining $msg_id: " if $DEBUG;
402         if ($msg_id =~ /$config{email_domain}$/) {
403             print STDERR "skipping\n" if $DEBUG;
404             next;
405         }
406         if ($seen_msgids{$msg_id}) {
407             print STDERR "already seen\n" if $DEBUG;
408             next;
409         }
410         $seen_msgids{$msg_id}=1;
411         $sub->($bug_num,$record,$msg_id);
412         print STDERR "\n" if $DEBUG;
413     }
414 }
415
416
417 __END__
418
419 # Local Variables:
420 # cperl-indent-level: 4
421 # indent-tabs-mode: nil
422 # End: