Filename | /usr/local/bin/sa-learn |
Statements | Executed 4827 statements in 65.7ms |
Calls | P | F | Exclusive Time | Inclusive Time | Subroutine |
---|---|---|---|---|---|
234 | 1 | 1 | 31.1ms | 715s | wanted | main::
1 | 1 | 1 | 20.7ms | 39.0ms | BEGIN@24 | main::
1 | 1 | 1 | 18.7ms | 670ms | BEGIN@65 | main::
3936 | 2 | 2 | 17.0ms | 17.0ms | is_utf8 (xsub) | utf8::
3017 | 12 | 9 | 16.7ms | 16.7ms | can (xsub) | UNIVERSAL::
1 | 1 | 1 | 11.8ms | 17.4ms | BEGIN@66 | main::
1038 | 1 | 1 | 9.15ms | 9.15ms | decode (xsub) | Encode::XS::
1 | 1 | 1 | 6.25ms | 6.30ms | BEGIN@20 | main::
1 | 1 | 1 | 5.67ms | 105ms | BEGIN@25 | main::
234 | 1 | 1 | 4.17ms | 4.18ms | result | main::
1 | 1 | 1 | 3.38ms | 10.1ms | BEGIN@23 | main::
1 | 1 | 1 | 2.65ms | 8.03ms | BEGIN@69 | main::
1 | 1 | 1 | 1.45ms | 4.49ms | BEGIN@68 | main::
1 | 1 | 1 | 1.33ms | 1.87ms | BEGIN@39 | main::
1 | 1 | 1 | 1.15ms | 1.33ms | BEGIN@19 | main::
148 | 3 | 1 | 970µs | 970µs | SvREADONLY (xsub) | Internals::
1 | 1 | 1 | 612µs | 628µs | BEGIN@21 | main::
147 | 1 | 1 | 569µs | 569µs | method_changed_in (xsub) | mro::
57 | 4 | 4 | 280µs | 280µs | isa (xsub) | UNIVERSAL::
24 | 23 | 10 | 142µs | 142µs | CORE:pack (opcode) | main::
6 | 6 | 5 | 127µs | 127µs | VERSION (xsub) | UNIVERSAL::
26 | 3 | 3 | 109µs | 109µs | encode (xsub) | utf8::
2 | 1 | 1 | 56µs | 56µs | target | main::
1 | 1 | 1 | 56µs | 170µs | BEGIN@41 | main::
2 | 1 | 1 | 47µs | 47µs | CORE:ftis (opcode) | main::
1 | 1 | 1 | 32µs | 32µs | CORE:print (opcode) | main::
4 | 2 | 1 | 24µs | 24µs | CORE:match (opcode) | main::
1 | 1 | 1 | 24µs | 212µs | BEGIN@70 | main::
1 | 1 | 1 | 22µs | 586µs | BEGIN@28 | main::
1 | 1 | 1 | 21µs | 21µs | BEGIN@67 | main::
1 | 1 | 1 | 20µs | 20µs | BEGIN@26 | main::
2 | 2 | 1 | 12µs | 12µs | CORE:close (opcode) | main::
1 | 1 | 1 | 10µs | 10µs | init_results | main::
1 | 1 | 1 | 5µs | 5µs | __ANON__[:94] | main::
0 | 0 | 0 | 0s | 0s | RUNTIME | main::
0 | 0 | 0 | 0s | 0s | __ANON__[:112] | main::
0 | 0 | 0 | 0s | 0s | __ANON__[:130] | main::
0 | 0 | 0 | 0s | 0s | __ANON__[:131] | main::
0 | 0 | 0 | 0s | 0s | __ANON__[:132] | main::
0 | 0 | 0 | 0s | 0s | __ANON__[:133] | main::
0 | 0 | 0 | 0s | 0s | __ANON__[:134] | main::
0 | 0 | 0 | 0s | 0s | __ANON__[:93] | main::
0 | 0 | 0 | 0s | 0s | __ANON__[:96] | main::
0 | 0 | 0 | 0s | 0s | killed | main::
0 | 0 | 0 | 0s | 0s | usage | main::
Line | Statements | Time on line | Calls | Time in subs | Code |
---|---|---|---|---|---|
0 | 1 | 74µs | Profile data that couldn't be associated with a specific line: # spent 74µs making 1 call to Mail::SpamAssassin::Logger::END | ||
1 | #!/usr/local/bin/perl -T -w | ||||
2 | # <@LICENSE> | ||||
3 | # Licensed to the Apache Software Foundation (ASF) under one or more | ||||
4 | # contributor license agreements. See the NOTICE file distributed with | ||||
5 | # this work for additional information regarding copyright ownership. | ||||
6 | # The ASF licenses this file to you under the Apache License, Version 2.0 | ||||
7 | # (the "License"); you may not use this file except in compliance with | ||||
8 | # the License. You may obtain a copy of the License at: | ||||
9 | # | ||||
10 | # http://www.apache.org/licenses/LICENSE-2.0 | ||||
11 | # | ||||
12 | # Unless required by applicable law or agreed to in writing, software | ||||
13 | # distributed under the License is distributed on an "AS IS" BASIS, | ||||
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
15 | # See the License for the specific language governing permissions and | ||||
16 | # limitations under the License. | ||||
17 | # </@LICENSE> | ||||
18 | |||||
19 | 2 | 714µs | 2 | 1.35ms | # spent 1.33ms (1.15+186µs) within main::BEGIN@19 which was called:
# once (1.15ms+186µs) by main::NULL at line 19 # spent 1.33ms making 1 call to main::BEGIN@19
# spent 12µs making 1 call to strict::import |
20 | 2 | 6.02ms | 2 | 6.34ms | # spent 6.30ms (6.25+53µs) within main::BEGIN@20 which was called:
# once (6.25ms+53µs) by main::NULL at line 20 # spent 6.30ms making 1 call to main::BEGIN@20
# spent 42µs making 1 call to warnings::import |
21 | 2 | 622µs | 2 | 644µs | # spent 628µs (612+16) within main::BEGIN@21 which was called:
# once (612µs+16µs) by main::NULL at line 21 # spent 628µs making 1 call to main::BEGIN@21
# spent 16µs making 1 call to bytes::import |
22 | |||||
23 | 2 | 318µs | 2 | 11.5ms | # spent 10.1ms (3.38+6.74) within main::BEGIN@23 which was called:
# once (3.38ms+6.74ms) by main::NULL at line 23 # spent 10.1ms making 1 call to main::BEGIN@23
# spent 1.37ms making 1 call to Exporter::import |
24 | 2 | 368µs | 2 | 43.1ms | # spent 39.0ms (20.7+18.3) within main::BEGIN@24 which was called:
# once (20.7ms+18.3ms) by main::NULL at line 24 # spent 39.0ms making 1 call to main::BEGIN@24
# spent 4.08ms making 1 call to Getopt::Long::import |
25 | 2 | 389µs | 2 | 105ms | # spent 105ms (5.67+98.9) within main::BEGIN@25 which was called:
# once (5.67ms+98.9ms) by main::NULL at line 25 # spent 105ms making 1 call to main::BEGIN@25
# spent 352µs making 1 call to Exporter::import |
26 | 2 | 89µs | 1 | 20µs | # spent 20µs within main::BEGIN@26 which was called:
# once (20µs+0s) by main::NULL at line 26 # spent 20µs making 1 call to main::BEGIN@26 |
27 | |||||
28 | 1 | 2µs | # spent 586µs (22+564) within main::BEGIN@28 which was called:
# once (22µs+564µs) by main::NULL at line 33 | ||
29 | $spamtest %opt $isspam $forget | ||||
30 | $messagecount $learnedcount $messagelimit | ||||
31 | $progress $total_messages $init_results $start_time | ||||
32 | $synconly $learnprob @targets $bayes_override_path | ||||
33 | 1 | 97µs | 2 | 1.15ms | ); # spent 586µs making 1 call to main::BEGIN@28
# spent 564µs making 1 call to vars::import |
34 | |||||
35 | 1 | 4µs | my $PREFIX = '/usr/local'; # substituted at 'make' time | ||
36 | 1 | 2µs | my $DEF_RULES_DIR = '/usr/local/share/spamassassin'; # substituted at 'make' time | ||
37 | 1 | 2µs | my $LOCAL_RULES_DIR = '/usr/local/etc/mail/spamassassin'; # substituted at 'make' time | ||
38 | |||||
39 | 2 | 620µs | 2 | 2.26ms | # spent 1.87ms (1.33+546µs) within main::BEGIN@39 which was called:
# once (1.33ms+546µs) by main::NULL at line 39 # spent 1.87ms making 1 call to main::BEGIN@39
# spent 386µs making 1 call to lib::import |
40 | |||||
41 | # spent 170µs (56+115) within main::BEGIN@41 which was called:
# once (56µs+115µs) by main::NULL at line 63 | ||||
42 | 1 | 21µs | 1 | 68µs | my @bin = File::Spec->splitpath($0); # spent 68µs making 1 call to File::Spec::Unix::splitpath |
43 | 1 | 2µs | my $bin = ($bin[0] ? File::Spec->catpath(@bin[0..1]) : $bin[1]) | ||
44 | || File::Spec->curdir; | ||||
45 | |||||
46 | 1 | 72µs | 2 | 47µs | if (-e $bin.'/lib/Mail/SpamAssassin.pm' # spent 47µs making 2 calls to main::CORE:ftis, avg 24µs/call |
47 | || !-e '/usr/local/lib/perl5/site_perl/Mail/SpamAssassin.pm' ) | ||||
48 | { | ||||
49 | my $searchrelative; | ||||
50 | if ($searchrelative && $bin eq '../' && -e '../blib/lib/Mail/SpamAssassin.pm') | ||||
51 | { | ||||
52 | unshift ( @INC, '../blib/lib' ); | ||||
53 | } else { | ||||
54 | foreach ( qw(lib ../lib/site_perl | ||||
55 | ../lib/spamassassin ../share/spamassassin/lib)) | ||||
56 | { | ||||
57 | my $dir = File::Spec->catdir( $bin, split ( '/', $_ ) ); | ||||
58 | if ( -f File::Spec->catfile( $dir, "Mail", "SpamAssassin.pm" ) ) | ||||
59 | { unshift ( @INC, $dir ); last; } | ||||
60 | } | ||||
61 | } | ||||
62 | } | ||||
63 | 1 | 75µs | 1 | 170µs | } # spent 170µs making 1 call to main::BEGIN@41 |
64 | |||||
65 | 2 | 433µs | 1 | 670ms | # spent 670ms (18.7+651) within main::BEGIN@65 which was called:
# once (18.7ms+651ms) by main::NULL at line 65 # spent 670ms making 1 call to main::BEGIN@65 |
66 | 2 | 357µs | 1 | 17.4ms | # spent 17.4ms (11.8+5.62) within main::BEGIN@66 which was called:
# once (11.8ms+5.62ms) by main::NULL at line 66 # spent 17.4ms making 1 call to main::BEGIN@66 |
67 | 2 | 62µs | 1 | 21µs | # spent 21µs within main::BEGIN@67 which was called:
# once (21µs+0s) by main::NULL at line 67 # spent 21µs making 1 call to main::BEGIN@67 |
68 | 2 | 390µs | 1 | 4.49ms | # spent 4.49ms (1.45+3.03) within main::BEGIN@68 which was called:
# once (1.45ms+3.03ms) by main::NULL at line 68 # spent 4.49ms making 1 call to main::BEGIN@68 |
69 | 2 | 362µs | 1 | 8.03ms | # spent 8.03ms (2.65+5.38) within main::BEGIN@69 which was called:
# once (2.65ms+5.38ms) by main::NULL at line 69 # spent 8.03ms making 1 call to main::BEGIN@69 |
70 | 2 | 9.90ms | 2 | 400µs | # spent 212µs (24+188) within main::BEGIN@70 which was called:
# once (24µs+188µs) by main::NULL at line 70 # spent 212µs making 1 call to main::BEGIN@70
# spent 188µs making 1 call to Exporter::import |
71 | |||||
72 | ########################################################################### | ||||
73 | |||||
74 | 1 | 79µs | $SIG{PIPE} = 'IGNORE'; | ||
75 | |||||
76 | # used to be CmdLearn::cmd_run() ... | ||||
77 | |||||
78 | 1 | 11µs | %opt = ( | ||
79 | 'force-expire' => 0, | ||||
80 | 'use-ignores' => 0, | ||||
81 | 'nosync' => 0, | ||||
82 | 'quiet' => 0, | ||||
83 | 'cf' => [] | ||||
84 | ); | ||||
85 | |||||
86 | 1 | 16µs | 1 | 318µs | Getopt::Long::Configure( # spent 318µs making 1 call to Getopt::Long::Configure |
87 | qw(bundling no_getopt_compat | ||||
88 | permute no_auto_abbrev no_ignore_case) | ||||
89 | ); | ||||
90 | |||||
91 | GetOptions( | ||||
92 | 'forget' => \$forget, | ||||
93 | 'ham|nonspam' => sub { $isspam = 0; }, | ||||
94 | 1 | 9µs | # spent 5µs within main::__ANON__[/usr/local/bin/sa-learn:94] which was called:
# once (5µs+0s) by Getopt::Long::GetOptionsFromArray at line 605 of Getopt/Long.pm | ||
95 | 'sync' => \$synconly, | ||||
96 | 'rebuild' => sub { $synconly = 1; warn "The --rebuild option has been deprecated. Please use --sync instead.\n" }, | ||||
97 | |||||
98 | 'q|quiet' => \$opt{'quiet'}, | ||||
99 | 'username|u=s' => \$opt{'username'}, | ||||
100 | 'configpath|config-file|config-dir|c|C=s' => \$opt{'configpath'}, | ||||
101 | 'prefspath|prefs-file|p=s' => \$opt{'prefspath'}, | ||||
102 | 'siteconfigpath=s' => \$opt{'siteconfigpath'}, | ||||
103 | 1 | 4µs | 'cf=s' => \@{$opt{'cf'}}, | ||
104 | |||||
105 | 'folders|f=s' => \$opt{'folders'}, | ||||
106 | 'force-expire|expire' => \$opt{'force-expire'}, | ||||
107 | 'local|L' => \$opt{'local'}, | ||||
108 | 'no-sync|nosync' => \$opt{'nosync'}, | ||||
109 | 'showdots' => \$opt{'showdots'}, | ||||
110 | 'progress' => \$opt{'progress'}, | ||||
111 | 'use-ignores' => \$opt{'use-ignores'}, | ||||
112 | 'no-rebuild|norebuild' => sub { $opt{'nosync'} = 1; warn "The --no-rebuild option has been deprecated. Please use --no-sync instead.\n" }, | ||||
113 | |||||
114 | 'learnprob=f' => \$opt{'learnprob'}, | ||||
115 | 'randseed=i' => \$opt{'randseed'}, | ||||
116 | 'stopafter=i' => \$opt{'stopafter'}, | ||||
117 | 'max-size=i' => \$opt{'max-size'}, | ||||
118 | |||||
119 | 'debug|debug-level|D:s' => \$opt{'debug'}, | ||||
120 | 'help|h|?' => \$opt{'help'}, | ||||
121 | 'version|V' => \$opt{'version'}, | ||||
122 | |||||
123 | 'dump:s' => \$opt{'dump'}, | ||||
124 | 'import' => \$opt{'import'}, | ||||
125 | |||||
126 | 'backup' => \$opt{'backup'}, | ||||
127 | 'clear' => \$opt{'clear'}, | ||||
128 | 'restore=s' => \$opt{'restore'}, | ||||
129 | |||||
130 | 'dir' => sub { $opt{'old_format'} = 'dir'; }, | ||||
131 | 'file' => sub { $opt{'old_format'} = 'file'; }, | ||||
132 | 'mbox' => sub { $opt{'format'} = 'mbox'; }, | ||||
133 | 'mbx' => sub { $opt{'format'} = 'mbx'; }, | ||||
134 | 'single' => sub { $opt{'old_format'} = 'single'; }, | ||||
135 | |||||
136 | 'db|dbpath=s' => \$bayes_override_path, | ||||
137 | 1 | 83µs | 1 | 29µs | 're|regexp=s' => \$opt{'regexp'}, # spent 29µs making 1 call to Getopt::Long::GetOptions |
138 | |||||
139 | '<>' => \&target, | ||||
140 | ) | ||||
141 | or usage( 0, "Unknown option!" ); | ||||
142 | |||||
143 | 1 | 3µs | if ( defined $opt{'help'} ) { | ||
144 | usage( 0, "For more information read the manual page" ); | ||||
145 | } | ||||
146 | 1 | 2µs | if ( defined $opt{'version'} ) { | ||
147 | print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n"; | ||||
148 | exit 0; | ||||
149 | } | ||||
150 | |||||
151 | # set debug areas, if any specified (only useful for command-line tools) | ||||
152 | 1 | 2µs | if (defined $opt{'debug'}) { | ||
153 | $opt{'debug'} ||= 'all'; | ||||
154 | } | ||||
155 | |||||
156 | 1 | 2µs | if ( $opt{'force-expire'} ) { | ||
157 | $synconly = 1; | ||||
158 | } | ||||
159 | |||||
160 | 1 | 2µs | if ($opt{'showdots'} && $opt{'progress'}) { | ||
161 | print "--showdots and --progress may not be used together, please select just one\n"; | ||||
162 | exit 0; | ||||
163 | } | ||||
164 | |||||
165 | 1 | 2µs | if ( !defined $isspam | ||
166 | && !defined $synconly | ||||
167 | && !defined $forget | ||||
168 | && !defined $opt{'dump'} | ||||
169 | && !defined $opt{'import'} | ||||
170 | && !defined $opt{'clear'} | ||||
171 | && !defined $opt{'backup'} | ||||
172 | && !defined $opt{'restore'} | ||||
173 | && !defined $opt{'folders'} ) | ||||
174 | { | ||||
175 | usage( 0, | ||||
176 | "Please select either --spam, --ham, --folders, --forget, --sync, --import,\n--dump, --clear, --backup or --restore" | ||||
177 | ); | ||||
178 | } | ||||
179 | |||||
180 | # We need to make sure the journal syncs pre-forget... | ||||
181 | 1 | 2µs | if ( defined $forget && $opt{'nosync'} ) { | ||
182 | $opt{'nosync'} = 0; | ||||
183 | warn | ||||
184 | "sa-learn warning: --forget requires read/write access to the database, and is incompatible with --no-sync\n"; | ||||
185 | } | ||||
186 | |||||
187 | 1 | 2µs | if ( defined $opt{'old_format'} ) { | ||
188 | |||||
189 | #Format specified in the 2.5x form of --dir, --file, --mbox, --mbx or --single. | ||||
190 | #Convert it to the new behavior: | ||||
191 | if ( $opt{'old_format'} eq 'single' ) { | ||||
192 | push ( @ARGV, '-' ); | ||||
193 | } | ||||
194 | } | ||||
195 | |||||
196 | 1 | 3µs | my $post_config = ''; | ||
197 | |||||
198 | # kluge to support old check_bayes_db operation | ||||
199 | # bug 3799: init() will go r/o with the configured DB, and then dbpath needs | ||||
200 | # to override. Just access the dbpath version via post_config_text. | ||||
201 | 1 | 2µs | if ( defined $bayes_override_path ) { | ||
202 | # Add a default prefix if the path is a directory | ||||
203 | if ( -d $bayes_override_path ) { | ||||
204 | $bayes_override_path = File::Spec->catfile( $bayes_override_path, 'bayes' ); | ||||
205 | } | ||||
206 | |||||
207 | $post_config .= "bayes_path $bayes_override_path\n"; | ||||
208 | } | ||||
209 | |||||
210 | # These options require bayes_scanner, which requires "use_bayes 1", but | ||||
211 | # that's not necessary for these commands. | ||||
212 | 1 | 6µs | if (defined $opt{'dump'} || defined $opt{'import'} || defined $opt{'clear'} || | ||
213 | defined $opt{'backup'} || defined $opt{'restore'}) { | ||||
214 | $post_config .= "use_bayes 1\n"; | ||||
215 | } | ||||
216 | |||||
217 | 2 | 11µs | $post_config .= join("\n", @{$opt{'cf'}})."\n"; | ||
218 | |||||
219 | # create the tester factory | ||||
220 | $spamtest = new Mail::SpamAssassin( | ||||
221 | { | ||||
222 | rules_filename => $opt{'configpath'}, | ||||
223 | site_rules_filename => $opt{'siteconfigpath'}, | ||||
224 | userprefs_filename => $opt{'prefspath'}, | ||||
225 | username => $opt{'username'}, | ||||
226 | debug => $opt{'debug'}, | ||||
227 | 1 | 32µs | 1 | 48.7ms | local_tests_only => $opt{'local'}, # spent 48.7ms making 1 call to Mail::SpamAssassin::new |
228 | dont_copy_prefs => 1, | ||||
229 | PREFIX => $PREFIX, | ||||
230 | DEF_RULES_DIR => $DEF_RULES_DIR, | ||||
231 | LOCAL_RULES_DIR => $LOCAL_RULES_DIR, | ||||
232 | post_config_text => $post_config, | ||||
233 | } | ||||
234 | ); | ||||
235 | |||||
236 | 1 | 11µs | 1 | 10.7s | $spamtest->init(1); # spent 10.7s making 1 call to Mail::SpamAssassin::init |
237 | 1 | 8µs | 1 | 7µs | dbg("sa-learn: spamtest initialized"); # spent 7µs making 1 call to Mail::SpamAssassin::Logger::dbg |
238 | |||||
239 | # Bug 6228 hack: bridge the transition gap of moving Bayes.pm into a plugin; | ||||
240 | # To be resolved more cleanly!!! | ||||
241 | 1 | 9µs | if ($spamtest->{bayes_scanner}) { | ||
242 | 2 | 11µs | foreach my $plugin ( @{ $spamtest->{plugins}->{plugins} } ) { | ||
243 | 27 | 476µs | 27 | 120µs | if ($plugin->isa('Mail::SpamAssassin::Plugin::Bayes')) { # spent 120µs making 27 calls to UNIVERSAL::isa, avg 4µs/call |
244 | # copy plugin's "store" object ref one level up! | ||||
245 | 1 | 4µs | $spamtest->{bayes_scanner}->{store} = $plugin->{store}; | ||
246 | } | ||||
247 | } | ||||
248 | } | ||||
249 | |||||
250 | 1 | 9µs | 1 | 22µs | if (Mail::SpamAssassin::Util::am_running_on_windows()) { # spent 22µs making 1 call to Mail::SpamAssassin::Util::am_running_on_windows |
251 | binmode(STDIN) or die "cannot set binmode on STDIN: $!"; # bug 4363 | ||||
252 | binmode(STDOUT) or die "cannot set binmode on STDOUT: $!"; | ||||
253 | } | ||||
254 | |||||
255 | 1 | 4µs | if ( defined $opt{'dump'} ) { | ||
256 | my ( $magic, $toks ); | ||||
257 | |||||
258 | if ( $opt{'dump'} eq 'all' || $opt{'dump'} eq '' ) { # show us all tokens! | ||||
259 | ( $magic, $toks ) = ( 1, 1 ); | ||||
260 | } | ||||
261 | elsif ( $opt{'dump'} eq 'magic' ) { # show us magic tokens only | ||||
262 | ( $magic, $toks ) = ( 1, 0 ); | ||||
263 | } | ||||
264 | elsif ( $opt{'dump'} eq 'data' ) { # show us data tokens only | ||||
265 | ( $magic, $toks ) = ( 0, 1 ); | ||||
266 | } | ||||
267 | else { # unknown option | ||||
268 | warn "Unknown dump option '" . $opt{'dump'} . "'\n"; | ||||
269 | $spamtest->finish_learner(); | ||||
270 | exit 1; | ||||
271 | } | ||||
272 | |||||
273 | if (!$spamtest->dump_bayes_db( $magic, $toks, $opt{'regexp'}) ) { | ||||
274 | $spamtest->finish_learner(); | ||||
275 | die "ERROR: Bayes dump returned an error, please re-run with -D for more information\n"; | ||||
276 | } | ||||
277 | |||||
278 | $spamtest->finish_learner(); | ||||
279 | # make sure we notice any write errors while flushing output buffer | ||||
280 | close STDOUT or die "error closing STDOUT: $!"; | ||||
281 | close STDIN or die "error closing STDIN: $!"; | ||||
282 | exit 0; | ||||
283 | } | ||||
284 | |||||
285 | 1 | 3µs | if ( defined $opt{'import'} ) { | ||
286 | my $ret = $spamtest->{bayes_scanner}->{store}->perform_upgrade(); | ||||
287 | $spamtest->finish_learner(); | ||||
288 | # make sure we notice any write errors while flushing output buffer | ||||
289 | close STDOUT or die "error closing STDOUT: $!"; | ||||
290 | close STDIN or die "error closing STDIN: $!"; | ||||
291 | exit( !$ret ); | ||||
292 | } | ||||
293 | |||||
294 | 1 | 3µs | if (defined $opt{'clear'}) { | ||
295 | unless ($spamtest->{bayes_scanner}->{store}->clear_database()) { | ||||
296 | $spamtest->finish_learner(); | ||||
297 | die "ERROR: Bayes clear returned an error, please re-run with -D for more information\n"; | ||||
298 | } | ||||
299 | |||||
300 | $spamtest->finish_learner(); | ||||
301 | # make sure we notice any write errors while flushing output buffer | ||||
302 | close STDOUT or die "error closing STDOUT: $!"; | ||||
303 | close STDIN or die "error closing STDIN: $!"; | ||||
304 | exit 0; | ||||
305 | } | ||||
306 | |||||
307 | 1 | 2µs | if (defined $opt{'backup'}) { | ||
308 | unless ($spamtest->{bayes_scanner}->{store}->backup_database()) { | ||||
309 | $spamtest->finish_learner(); | ||||
310 | die "ERROR: Bayes backup returned an error, please re-run with -D for more information\n"; | ||||
311 | } | ||||
312 | |||||
313 | $spamtest->finish_learner(); | ||||
314 | # make sure we notice any write errors while flushing output buffer | ||||
315 | close STDOUT or die "error closing STDOUT: $!"; | ||||
316 | close STDIN or die "error closing STDIN: $!"; | ||||
317 | exit 0; | ||||
318 | } | ||||
319 | |||||
320 | 1 | 3µs | if (defined $opt{'restore'}) { | ||
321 | |||||
322 | my $filename = $opt{'restore'}; | ||||
323 | |||||
324 | unless ($filename) { | ||||
325 | $spamtest->finish_learner(); | ||||
326 | die "ERROR: You must specify a filename to restore.\n"; | ||||
327 | } | ||||
328 | |||||
329 | unless ($spamtest->{bayes_scanner}->{store}->restore_database($filename, $opt{'showdots'})) { | ||||
330 | $spamtest->finish_learner(); | ||||
331 | die "ERROR: Bayes restore returned an error, please re-run with -D for more information\n"; | ||||
332 | } | ||||
333 | |||||
334 | $spamtest->finish_learner(); | ||||
335 | # make sure we notice any write errors while flushing output buffer | ||||
336 | close STDOUT or die "error closing STDOUT: $!"; | ||||
337 | close STDIN or die "error closing STDIN: $!"; | ||||
338 | exit 0; | ||||
339 | } | ||||
340 | |||||
341 | 1 | 4µs | if ( !$spamtest->{conf}->{use_bayes} ) { | ||
342 | warn "ERROR: configuration specifies 'use_bayes 0', sa-learn disabled\n"; | ||||
343 | exit 1; | ||||
344 | } | ||||
345 | |||||
346 | $spamtest->init_learner( | ||||
347 | { | ||||
348 | force_expire => $opt{'force-expire'}, | ||||
349 | 1 | 23µs | 1 | 201µs | learn_to_journal => $opt{'nosync'}, # spent 201µs making 1 call to Mail::SpamAssassin::init_learner |
350 | wait_for_lock => 1, | ||||
351 | caller_will_untie => 1 | ||||
352 | } | ||||
353 | ); | ||||
354 | |||||
355 | 1 | 5µs | $spamtest->{bayes_scanner}{use_ignores} = $opt{'use-ignores'}; | ||
356 | |||||
357 | 1 | 2µs | if ($synconly) { | ||
358 | $spamtest->rebuild_learner_caches( | ||||
359 | { | ||||
360 | verbose => !$opt{'quiet'}, | ||||
361 | showdots => $opt{'showdots'} | ||||
362 | } | ||||
363 | ); | ||||
364 | $spamtest->finish_learner(); | ||||
365 | # make sure we notice any write errors while flushing output buffer | ||||
366 | close STDOUT or die "error closing STDOUT: $!"; | ||||
367 | close STDIN or die "error closing STDIN: $!"; | ||||
368 | exit 0; | ||||
369 | } | ||||
370 | |||||
371 | 1 | 4µs | $messagelimit = $opt{'stopafter'}; | ||
372 | 1 | 3µs | $learnprob = $opt{'learnprob'}; | ||
373 | |||||
374 | 1 | 3µs | if ( defined $opt{'randseed'} ) { | ||
375 | srand( $opt{'randseed'} ); | ||||
376 | } | ||||
377 | |||||
378 | # sync the journal first if we're going to go r/w so we make sure to | ||||
379 | # learn everything before doing anything else. | ||||
380 | # | ||||
381 | 1 | 2µs | if ( !$opt{nosync} ) { | ||
382 | $spamtest->rebuild_learner_caches(); | ||||
383 | } | ||||
384 | |||||
385 | # what is the result of the run? will end up being the exit code. | ||||
386 | 1 | 3µs | my $exit_status = 0; | ||
387 | |||||
388 | # run this lot in an eval block, so we can catch die's and clear | ||||
389 | # up the dbs. | ||||
390 | eval { | ||||
391 | 1 | 18µs | $SIG{HUP} = \&killed; | ||
392 | 1 | 7µs | $SIG{INT} = \&killed; | ||
393 | 1 | 8µs | $SIG{TERM} = \&killed; | ||
394 | |||||
395 | 1 | 2µs | if ( $opt{folders} ) { | ||
396 | open( F, $opt{folders} ) or die "cannot open $opt{folders}: $!"; | ||||
397 | for ($!=0; <F>; $!=0) { | ||||
398 | chomp; | ||||
399 | next if /^\s*$/; | ||||
400 | if (/^(?:ham|spam):\w*:/) { | ||||
401 | push ( @targets, $_ ); | ||||
402 | } | ||||
403 | else { | ||||
404 | target($_); | ||||
405 | } | ||||
406 | } | ||||
407 | defined $_ || $!==0 or | ||||
408 | $!==EBADF ? dbg("error reading from $opt{folders}: $!") | ||||
409 | : die "error reading from $opt{folders}: $!"; | ||||
410 | close(F) or die "error closing $opt{folders}: $!"; | ||||
411 | } | ||||
412 | |||||
413 | ########################################################################### | ||||
414 | # Deal with the target listing, and STDIN -> tempfile | ||||
415 | |||||
416 | 1 | 2µs | my $tempfile; # will be defined if stdin -> tempfile | ||
417 | 1 | 4µs | push(@targets, @ARGV); | ||
418 | 1 | 2µs | @targets = ('-') unless @targets || $opt{folders}; | ||
419 | |||||
420 | 1 | 19µs | for(my $elem = 0; $elem <= $#targets; $elem++) { | ||
421 | # ArchiveIterator doesn't really like STDIN, so if "-" is specified | ||||
422 | # as a target, make it a temp file instead. | ||||
423 | 2 | 32µs | 2 | 13µs | if ( $targets[$elem] =~ /(?:^|:)-$/ ) { # spent 13µs making 2 calls to main::CORE:match, avg 7µs/call |
424 | if (defined $tempfile) { | ||||
425 | # uh-oh, stdin specified multiple times? | ||||
426 | warn "skipping extra stdin target (".$targets[$elem].")\n"; | ||||
427 | splice @targets, $elem, 1; | ||||
428 | $elem--; # go back to this element again | ||||
429 | next; | ||||
430 | } | ||||
431 | else { | ||||
432 | my $handle; | ||||
433 | ( $tempfile, $handle ) = Mail::SpamAssassin::Util::secure_tmpfile(); | ||||
434 | binmode $handle or die "cannot set binmode on file $tempfile: $!"; | ||||
435 | |||||
436 | # avoid slurping the whole file into memory, copy chunk by chunk | ||||
437 | my($inbuf,$nread); | ||||
438 | while ( $nread=sysread(STDIN,$inbuf,16384) ) | ||||
439 | { print {$handle} $inbuf or die "error writing to $tempfile: $!" } | ||||
440 | defined $nread or die "error reading from STDIN: $!"; | ||||
441 | close $handle or die "error closing $tempfile: $!"; | ||||
442 | |||||
443 | # re-aim the targets at the tempfile instead of STDIN | ||||
444 | $targets[$elem] =~ s/-$/$tempfile/; | ||||
445 | } | ||||
446 | } | ||||
447 | |||||
448 | # make sure the target list is in the normal AI format | ||||
449 | 2 | 32µs | 2 | 11µs | if ($targets[$elem] !~ /^[^:]*:[a-z]+:/) { # spent 11µs making 2 calls to main::CORE:match, avg 6µs/call |
450 | my $item = splice @targets, $elem, 1; | ||||
451 | target($item); # add back to the list | ||||
452 | $elem--; # go back to this element again | ||||
453 | next; | ||||
454 | } | ||||
455 | } | ||||
456 | |||||
457 | ########################################################################### | ||||
458 | |||||
459 | my $iter = new Mail::SpamAssassin::ArchiveIterator( | ||||
460 | { | ||||
461 | # skip messages larger than max-size bytes, | ||||
462 | # 0 for no limit, undef defaults to 256 KB | ||||
463 | 'opt_max_size' => $opt{'max-size'}, | ||||
464 | 'opt_want_date' => 0, | ||||
465 | 'opt_from_regex' => $spamtest->{conf}->{mbox_format_from_regex}, | ||||
466 | } | ||||
467 | 1 | 30µs | 1 | 47µs | ); # spent 47µs making 1 call to Mail::SpamAssassin::ArchiveIterator::new |
468 | |||||
469 | 1 | 12µs | 1 | 15µs | $iter->set_functions(\&wanted, \&result); # spent 15µs making 1 call to Mail::SpamAssassin::ArchiveIterator::set_functions |
470 | 1 | 3µs | $messagecount = 0; | ||
471 | 1 | 2µs | $learnedcount = 0; | ||
472 | |||||
473 | 1 | 2µs | $init_results = 0; | ||
474 | 1 | 4µs | $start_time = time; | ||
475 | |||||
476 | # if exit_status isn't already set to non-zero, set it to the reverse of the | ||||
477 | # run result (0 is bad, 1+ is good -- the opposite of exit status codes) | ||||
478 | 3 | 19µs | 1 | 715s | my $run_ok = eval { $exit_status ||= ! $iter->run(@targets); 1 }; # spent 715s making 1 call to Mail::SpamAssassin::ArchiveIterator::run |
479 | |||||
480 | 1 | 3µs | print STDERR "\n" if ($opt{showdots}); | ||
481 | 1 | 2µs | $progress->final() if ($opt{progress} && $progress); | ||
482 | |||||
483 | 1 | 4µs | my $phrase = defined $forget ? "Forgot" : "Learned"; | ||
484 | print "$phrase tokens from $learnedcount message(s) ($messagecount message(s) examined)\n" | ||||
485 | 1 | 54µs | 1 | 32µs | if !$opt{'quiet'}; # spent 32µs making 1 call to main::CORE:print |
486 | |||||
487 | # If we needed to make a tempfile, go delete it. | ||||
488 | 1 | 2µs | if (defined $tempfile) { | ||
489 | unlink $tempfile or die "cannot unlink temporary file $tempfile: $!"; | ||||
490 | undef $tempfile; | ||||
491 | } | ||||
492 | |||||
493 | 1 | 2µs | if (!$run_ok && $@ !~ /HITLIMIT/) { die $@ } | ||
494 | 1 | 29µs | 1; | ||
495 | 1 | 5µs | } or do { | ||
496 | my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; | ||||
497 | $spamtest->finish_learner(); | ||||
498 | die $eval_stat; | ||||
499 | }; | ||||
500 | |||||
501 | 1 | 11µs | 1 | 5.67s | $spamtest->finish_learner(); # spent 5.67s making 1 call to Mail::SpamAssassin::finish_learner |
502 | # make sure we notice any write errors while flushing output buffer | ||||
503 | 1 | 26µs | 1 | 8µs | close STDOUT or die "error closing STDOUT: $!"; # spent 8µs making 1 call to main::CORE:close |
504 | 1 | 13µs | 1 | 4µs | close STDIN or die "error closing STDIN: $!"; # spent 4µs making 1 call to main::CORE:close |
505 | 1 | 87µs | exit $exit_status; | ||
506 | |||||
507 | ########################################################################### | ||||
508 | |||||
509 | sub killed { | ||||
510 | $spamtest->finish_learner(); | ||||
511 | die "interrupted"; | ||||
512 | } | ||||
513 | |||||
514 | # spent 56µs within main::target which was called 2 times, avg 28µs/call:
# 2 times (56µs+0s) by Getopt::Long::GetOptionsFromArray at line 737 of Getopt/Long.pm, avg 28µs/call | ||||
515 | 2 | 5µs | my ($target) = @_; | ||
516 | |||||
517 | 2 | 14µs | my $class = ( $isspam ? "spam" : "ham" ); | ||
518 | 2 | 5µs | my $format = ( defined( $opt{'format'} ) ? $opt{'format'} : "detect" ); | ||
519 | |||||
520 | 2 | 10.1ms | push ( @targets, "$class:$format:$target" ); | ||
521 | } | ||||
522 | |||||
523 | ########################################################################### | ||||
524 | |||||
525 | # spent 10µs within main::init_results which was called:
# once (10µs+0s) by main::result at line 541 | ||||
526 | 1 | 2µs | $init_results = 1; | ||
527 | |||||
528 | 1 | 10µs | return unless $opt{'progress'}; | ||
529 | |||||
530 | $total_messages = $Mail::SpamAssassin::ArchiveIterator::MESSAGES; | ||||
531 | |||||
532 | $progress = Mail::SpamAssassin::Util::Progress->new({total => $total_messages,}); | ||||
533 | } | ||||
534 | |||||
535 | ########################################################################### | ||||
536 | |||||
537 | # spent 4.18ms (4.17+10µs) within main::result which was called 234 times, avg 18µs/call:
# 234 times (4.17ms+10µs) by Mail::SpamAssassin::ArchiveIterator::_run at line 326 of Mail/SpamAssassin/ArchiveIterator.pm, avg 18µs/call | ||||
538 | 234 | 1.34ms | my ($class, $result, $time) = @_; | ||
539 | |||||
540 | # don't open results files until we get here to avoid overwriting files | ||||
541 | 234 | 638µs | 1 | 10µs | &init_results if !$init_results; # spent 10µs making 1 call to main::init_results |
542 | |||||
543 | 234 | 2.27ms | $progress->update($messagecount) if ($opt{progress} && $progress); | ||
544 | } | ||||
545 | |||||
546 | ########################################################################### | ||||
547 | |||||
548 | # spent 715s (31.1ms+715) within main::wanted which was called 234 times, avg 3.05s/call:
# 234 times (31.1ms+715s) by Mail::SpamAssassin::ArchiveIterator::_run_file at line 414 of Mail/SpamAssassin/ArchiveIterator.pm, avg 3.05s/call | ||||
549 | 234 | 2.05ms | my ( $class, $id, $time, $dataref ) = @_; | ||
550 | |||||
551 | 234 | 860µs | my $spam = $class eq "s" ? 1 : 0; | ||
552 | |||||
553 | 234 | 578µs | if ( defined($learnprob) ) { | ||
554 | if ( int( rand( 1 / $learnprob ) ) != 0 ) { | ||||
555 | print STDERR '_' if ( $opt{showdots} ); | ||||
556 | return 1; | ||||
557 | } | ||||
558 | } | ||||
559 | |||||
560 | 234 | 505µs | if ( defined($messagelimit) && $learnedcount > $messagelimit ) { | ||
561 | $progress->final() if ($opt{progress} && $progress); | ||||
562 | die 'HITLIMIT'; | ||||
563 | } | ||||
564 | |||||
565 | 234 | 530µs | $messagecount++; | ||
566 | 234 | 2.80ms | 234 | 4.74s | my $ma = $spamtest->parse($dataref); # spent 4.74s making 234 calls to Mail::SpamAssassin::parse, avg 20.2ms/call |
567 | |||||
568 | 234 | 2.29ms | 234 | 18.6ms | if ( $ma->get_header("X-Spam-Checker-Version") ) { # spent 18.6ms making 234 calls to Mail::SpamAssassin::Message::Node::get_header, avg 80µs/call |
569 | my $new_ma = $spamtest->parse($spamtest->remove_spamassassin_markup($ma), 1); | ||||
570 | $ma->finish(); | ||||
571 | $ma = $new_ma; | ||||
572 | } | ||||
573 | |||||
574 | 234 | 2.45ms | 234 | 710s | my $status = $spamtest->learn( $ma, undef, $spam, $forget ); # spent 710s making 234 calls to Mail::SpamAssassin::learn, avg 3.03s/call |
575 | 234 | 2.51ms | 234 | 2.29ms | my $learned = $status->did_learn(); # spent 2.29ms making 234 calls to Mail::SpamAssassin::PerMsgLearner::did_learn, avg 10µs/call |
576 | |||||
577 | 234 | 1.21ms | if ( !defined $learned ) { # undef=learning unavailable | ||
578 | die "ERROR: the Bayes learn function returned an error, please re-run with -D for more information\n"; | ||||
579 | } | ||||
580 | elsif ( $learned == 1 ) { # 1=message was learned. 0=message wasn't learned | ||||
581 | 234 | 598µs | $learnedcount++; | ||
582 | } | ||||
583 | |||||
584 | # Do cleanup ... | ||||
585 | 234 | 2.04ms | 234 | 3.59ms | $status->finish(); # spent 3.59ms making 234 calls to Mail::SpamAssassin::PerMsgLearner::finish, avg 15µs/call |
586 | 234 | 937µs | undef $status; | ||
587 | |||||
588 | 234 | 2.21ms | 234 | 124ms | $ma->finish(); # spent 124ms making 234 calls to Mail::SpamAssassin::Message::finish, avg 530µs/call |
589 | 234 | 3.50ms | 79 | 1.45ms | undef $ma; # spent 1.45ms making 79 calls to Mail::SpamAssassin::Message::DESTROY, avg 18µs/call |
590 | |||||
591 | 234 | 946µs | print STDERR '.' if ( $opt{showdots} ); | ||
592 | 234 | 3.08ms | return 1; | ||
593 | } | ||||
594 | |||||
595 | ########################################################################### | ||||
596 | |||||
597 | sub usage { | ||||
598 | my ( $verbose, $message ) = @_; | ||||
599 | my $ver = Mail::SpamAssassin::Version(); | ||||
600 | print "SpamAssassin version $ver\n"; | ||||
601 | pod2usage( -verbose => $verbose, -message => $message, -exitval => 64 ); | ||||
602 | } | ||||
603 | |||||
604 | # --------------------------------------------------------------------------- | ||||
605 | |||||
606 | =head1 NAME | ||||
607 | |||||
608 | sa-learn - train SpamAssassin's Bayesian classifier | ||||
609 | |||||
610 | =head1 SYNOPSIS | ||||
611 | |||||
612 | B<sa-learn> [options] [file]... | ||||
613 | |||||
614 | B<sa-learn> [options] --dump [ all | data | magic ] | ||||
615 | |||||
616 | Options: | ||||
617 | |||||
618 | --ham Learn messages as ham (non-spam) | ||||
619 | --spam Learn messages as spam | ||||
620 | --forget Forget a message | ||||
621 | --use-ignores Use bayes_ignore_from and bayes_ignore_to | ||||
622 | --sync Synchronize the database and the journal if needed | ||||
623 | --force-expire Force a database sync and expiry run | ||||
624 | --dbpath <path> Allows commandline override (in bayes_path form) | ||||
625 | for where to read the Bayes DB from | ||||
626 | --dump [all|data|magic] Display the contents of the Bayes database | ||||
627 | Takes optional argument for what to display | ||||
628 | --regexp <re> For dump only, specifies which tokens to | ||||
629 | dump based on a regular expression. | ||||
630 | -f file, --folders=file Read list of files/directories from file | ||||
631 | --dir Ignored; historical compatibility | ||||
632 | --file Ignored; historical compatibility | ||||
633 | --mbox Input sources are in mbox format | ||||
634 | --mbx Input sources are in mbx format | ||||
635 | --max-size <b> Skip messages larger than b bytes; | ||||
636 | defaults to 256 KB, 0 implies no limit | ||||
637 | --showdots Show progress using dots | ||||
638 | --progress Show progress using progress bar | ||||
639 | --no-sync Skip synchronizing the database and journal | ||||
640 | after learning | ||||
641 | -L, --local Operate locally, no network accesses | ||||
642 | --import Migrate data from older version/non DB_File | ||||
643 | based databases | ||||
644 | --clear Wipe out existing database | ||||
645 | --backup Backup, to STDOUT, existing database | ||||
646 | --restore <filename> Restore a database from filename | ||||
647 | -u username, --username=username | ||||
648 | Override username taken from the runtime | ||||
649 | environment, used with SQL | ||||
650 | -C path, --configpath=path, --config-file=path | ||||
651 | Path to standard configuration dir | ||||
652 | -p prefs, --prefspath=file, --prefs-file=file | ||||
653 | Set user preferences file | ||||
654 | --siteconfigpath=path Path for site configs | ||||
655 | (default: /etc/mail/spamassassin) | ||||
656 | --cf='config line' Additional line of configuration | ||||
657 | -D, --debug [area=n,...] Print debugging messages | ||||
658 | -V, --version Print version | ||||
659 | -h, --help Print usage message | ||||
660 | |||||
661 | =head1 DESCRIPTION | ||||
662 | |||||
663 | Given a typical selection of your incoming mail classified as spam or ham | ||||
664 | (non-spam), this tool will feed each mail to SpamAssassin, allowing it | ||||
665 | to 'learn' what signs are likely to mean spam, and which are likely to | ||||
666 | mean ham. | ||||
667 | |||||
668 | Simply run this command once for each of your mail folders, and it will | ||||
669 | 'learn' from the mail therein. | ||||
670 | |||||
671 | Note that csh-style I<globbing> in the mail folder names is supported; | ||||
672 | in other words, listing a folder name as C<*> will scan every folder | ||||
673 | that matches. See C<Mail::SpamAssassin::ArchiveIterator> for more details. | ||||
674 | |||||
675 | SpamAssassin remembers which mail messages it has learnt already, and will not | ||||
676 | re-learn those messages again, unless you use the B<--forget> option. Messages | ||||
677 | learnt as spam will have SpamAssassin markup removed, on the fly. | ||||
678 | |||||
679 | If you make a mistake and scan a mail as ham when it is spam, or vice | ||||
680 | versa, simply rerun this command with the correct classification, and the | ||||
681 | mistake will be corrected. SpamAssassin will automatically 'forget' the | ||||
682 | previous indications. | ||||
683 | |||||
684 | Users of C<spamd> who wish to perform training remotely, over a network, | ||||
685 | should investigate the C<spamc -L> switch. | ||||
686 | |||||
687 | =head1 OPTIONS | ||||
688 | |||||
689 | =over 4 | ||||
690 | |||||
691 | =item B<--ham> | ||||
692 | |||||
693 | Learn the input message(s) as ham. If you have previously learnt any of the | ||||
694 | messages as spam, SpamAssassin will forget them first, then re-learn them as | ||||
695 | ham. Alternatively, if you have previously learnt them as ham, it'll skip them | ||||
696 | this time around. If the messages have already been filtered through | ||||
697 | SpamAssassin, the learner will ignore any modifications SpamAssassin may have | ||||
698 | made. | ||||
699 | |||||
700 | =item B<--spam> | ||||
701 | |||||
702 | Learn the input message(s) as spam. If you have previously learnt any of the | ||||
703 | messages as ham, SpamAssassin will forget them first, then re-learn them as | ||||
704 | spam. Alternatively, if you have previously learnt them as spam, it'll skip | ||||
705 | them this time around. If the messages have already been filtered through | ||||
706 | SpamAssassin, the learner will ignore any modifications SpamAssassin may have | ||||
707 | made. | ||||
708 | |||||
709 | =item B<--folders>=I<filename>, B<-f> I<filename> | ||||
710 | |||||
711 | sa-learn will read in the list of folders from the specified file, one folder | ||||
712 | per line in the file. If the folder is prefixed with C<ham:type:> or C<spam:type:>, | ||||
713 | sa-learn will learn that folder appropriately, otherwise the folders will be | ||||
714 | assumed to be of the type specified by B<--ham> or B<--spam>. | ||||
715 | |||||
716 | C<type> above is optional, but is the same as the standard for | ||||
717 | ArchiveIterator: mbox, mbx, dir, file, or detect (the default if not | ||||
718 | specified). | ||||
719 | |||||
720 | =item B<--mbox> | ||||
721 | |||||
722 | sa-learn will read in the file(s) containing the emails to be learned, | ||||
723 | and will process them in mbox format (one or more emails per file). | ||||
724 | |||||
725 | =item B<--mbx> | ||||
726 | |||||
727 | sa-learn will read in the file(s) containing the emails to be learned, | ||||
728 | and will process them in mbx format (one or more emails per file). | ||||
729 | |||||
730 | =item B<--use-ignores> | ||||
731 | |||||
732 | Don't learn the message if a from address matches configuration file | ||||
733 | item C<bayes_ignore_from> or a to address matches C<bayes_ignore_to>. | ||||
734 | The option might be used when learning from a large file of messages | ||||
735 | from which the hammy spam messages or spammy ham messages have not | ||||
736 | been removed. | ||||
737 | |||||
738 | =item B<--sync> | ||||
739 | |||||
740 | Synchronize the journal and databases. Upon successfully syncing the | ||||
741 | database with the entries in the journal, the journal file is removed. | ||||
742 | |||||
743 | =item B<--force-expire> | ||||
744 | |||||
745 | Forces an expiry attempt, regardless of whether it may be necessary | ||||
746 | or not. Note: This doesn't mean any tokens will actually expire. | ||||
747 | Please see the EXPIRATION section below. | ||||
748 | |||||
749 | Note: C<--force-expire> also causes the journal data to be synchronized | ||||
750 | into the Bayes databases. | ||||
751 | |||||
752 | =item B<--forget> | ||||
753 | |||||
754 | Forget a given message previously learnt. | ||||
755 | |||||
756 | =item B<--dbpath> | ||||
757 | |||||
758 | Allows a commandline override of the I<bayes_path> configuration option. | ||||
759 | |||||
760 | =item B<--dump> I<option> | ||||
761 | |||||
762 | Display the contents of the Bayes database. Without an option or with | ||||
763 | the I<all> option, all magic tokens and data tokens will be displayed. | ||||
764 | I<magic> will only display magic tokens, and I<data> will only display | ||||
765 | the data tokens. | ||||
766 | |||||
767 | Can also use the B<--regexp> I<RE> option to specify which tokens to | ||||
768 | display based on a regular expression. | ||||
769 | |||||
770 | =item B<--clear> | ||||
771 | |||||
772 | Clear an existing Bayes database by removing all traces of the database. | ||||
773 | |||||
774 | WARNING: This is destructive and should be used with care. | ||||
775 | |||||
776 | =item B<--backup> | ||||
777 | |||||
778 | Performs a dump of the Bayes database in machine/human readable format. | ||||
779 | |||||
780 | The dump will include token and seen data. It is suitable for input back | ||||
781 | into the --restore command. | ||||
782 | |||||
783 | =item B<--restore>=I<filename> | ||||
784 | |||||
785 | Performs a restore of the Bayes database defined by I<filename>. | ||||
786 | |||||
787 | WARNING: This is a destructive operation, previous Bayes data will be wiped out. | ||||
788 | |||||
789 | =item B<-h>, B<--help> | ||||
790 | |||||
791 | Print help message and exit. | ||||
792 | |||||
793 | =item B<-u> I<username>, B<--username>=I<username> | ||||
794 | |||||
795 | If specified this username will override the username taken from the runtime | ||||
796 | environment. You can use this option to specify users in a virtual user | ||||
797 | configuration when using SQL as the Bayes backend. | ||||
798 | |||||
799 | NOTE: This option will not change to the given I<username>; it will only attempt | ||||
800 | to act on behalf of that user. Because of this you will need to have proper | ||||
801 | permissions to be able to change files owned by I<username>. In the case of SQL | ||||
802 | this generally is not a problem. | ||||
803 | |||||
804 | =item B<-C> I<path>, B<--configpath>=I<path>, B<--config-file>=I<path> | ||||
805 | |||||
806 | Use the specified path for locating the distributed configuration files. | ||||
807 | Ignore the default directories (usually C</usr/share/spamassassin> or similar). | ||||
808 | |||||
809 | =item B<--siteconfigpath>=I<path> | ||||
810 | |||||
811 | Use the specified path for locating site-specific configuration files. Ignore | ||||
812 | the default directories (usually C</etc/mail/spamassassin> or similar). | ||||
813 | |||||
814 | =item B<--cf='config line'> | ||||
815 | |||||
816 | Add additional lines of configuration directly from the command-line, parsed | ||||
817 | after the configuration files are read. Multiple B<--cf> arguments can be | ||||
818 | used, and each will be considered a separate line of configuration. | ||||
819 | |||||
820 | =item B<-p> I<prefs>, B<--prefspath>=I<prefs>, B<--prefs-file>=I<prefs> | ||||
821 | |||||
822 | Read user score preferences from I<prefs> (usually C<$HOME/.spamassassin/user_prefs>). | ||||
823 | |||||
824 | =item B<--progress> | ||||
825 | |||||
826 | Prints a progress bar (to STDERR) showing the current progress. In the case | ||||
827 | where no valid terminal is found this option will behave very much like the | ||||
828 | --showdots option. | ||||
829 | |||||
830 | =item B<-D> [I<area,...>], B<--debug> [I<area,...>] | ||||
831 | |||||
832 | Produce debugging output. If no areas are listed, all debugging information is | ||||
833 | printed. Diagnostic output can also be enabled for each area individually; | ||||
834 | I<area> is the area of the code to instrument. For example, to produce | ||||
835 | diagnostic output on bayes, learn, and dns, use: | ||||
836 | |||||
837 | sa-learn -D bayes,learn,dns | ||||
838 | |||||
839 | For more information about which areas (also known as channels) are available, | ||||
840 | please see the documentation at: | ||||
841 | |||||
842 | C<http://wiki.apache.org/spamassassin/DebugChannels> | ||||
843 | |||||
844 | Higher priority informational messages that are suitable for logging in normal | ||||
845 | circumstances are available with an area of "info". | ||||
846 | |||||
847 | =item B<--no-sync> | ||||
848 | |||||
849 | Skip the slow synchronization step which normally takes place after | ||||
850 | changing database entries. If you plan to learn from many folders in | ||||
851 | a batch, or to learn many individual messages one-by-one, it is faster | ||||
852 | to use this switch and run C<sa-learn --sync> once all the folders have | ||||
853 | been scanned. | ||||
854 | |||||
855 | Clarification: The state of I<--no-sync> overrides the | ||||
856 | I<bayes_learn_to_journal> configuration option. If not specified, | ||||
857 | sa-learn will learn to the database directly. If specified, sa-learn | ||||
858 | will learn to the journal file. | ||||
859 | |||||
860 | Note: I<--sync> and I<--no-sync> can be specified on the same commandline, | ||||
861 | which is slightly confusing. In this case, the I<--no-sync> option is | ||||
862 | ignored since there is no learn operation. | ||||
863 | |||||
864 | =item B<-L>, B<--local> | ||||
865 | |||||
866 | Do not perform any network accesses while learning details about the mail | ||||
867 | messages. This will speed up the learning process, but may result in a | ||||
868 | slightly lower accuracy. | ||||
869 | |||||
870 | Note that this is currently ignored, as current versions of SpamAssassin will | ||||
871 | not perform network access while learning; but future versions may. | ||||
872 | |||||
873 | =item B<--import> | ||||
874 | |||||
875 | If you previously used SpamAssassin's Bayesian learner without the C<DB_File> | ||||
876 | module installed, it will have created files in other formats, such as | ||||
877 | C<GDBM_File>, C<NDBM_File>, or C<SDBM_File>. This switch allows you to migrate | ||||
878 | that old data into the C<DB_File> format. It will overwrite any data currently | ||||
879 | in the C<DB_File>. | ||||
880 | |||||
881 | Can also be used with the B<--dbpath> I<path> option to specify the location of | ||||
882 | the Bayes files to use. | ||||
883 | |||||
884 | =back | ||||
885 | |||||
886 | =head1 MIGRATION | ||||
887 | |||||
888 | There are now multiple backend storage modules available for storing | ||||
889 | user's bayesian data. As such you might want to migrate from one | ||||
890 | backend to another. Here is a simple procedure for migrating from one | ||||
891 | backend to another. | ||||
892 | |||||
893 | Note that if you have individual user databases you will have to | ||||
894 | perform a similar procedure for each one of them. | ||||
895 | |||||
896 | =over 4 | ||||
897 | |||||
898 | =item sa-learn --sync | ||||
899 | |||||
900 | This will sync any outstanding journal entries | ||||
901 | |||||
902 | =item sa-learn --backup > backup.txt | ||||
903 | |||||
904 | This will save all your Bayes data to a plain text file. | ||||
905 | |||||
906 | =item sa-learn --clear | ||||
907 | |||||
908 | This is optional, but good to do to clear out the old database. | ||||
909 | |||||
910 | =item Repeat! | ||||
911 | |||||
912 | At this point, if you have multiple databases, you should perform the | ||||
913 | procedure above for each of them. (i.e. each user's database needs to | ||||
914 | be backed up before continuing.) | ||||
915 | |||||
916 | =item Switch backends | ||||
917 | |||||
918 | Once you have backed up all databases you can update your | ||||
919 | configuration for the new database backend. This will involve at least | ||||
920 | the bayes_store_module config option and may involve some additional | ||||
921 | config options depending on what is required by the module. (For | ||||
922 | example, you may need to configure an SQL database.) | ||||
923 | |||||
924 | =item sa-learn --restore backup.txt | ||||
925 | |||||
926 | Again, you need to do this for every database. | ||||
927 | |||||
928 | =back | ||||
929 | |||||
930 | If you are migrating to SQL you can make use of the -u <username> | ||||
931 | option in sa-learn to populate each user's database. Otherwise, you | ||||
932 | must run sa-learn as the user whose database you are restoring. | ||||
933 | |||||
934 | |||||
935 | =head1 INTRODUCTION TO BAYESIAN FILTERING | ||||
936 | |||||
937 | (Thanks to Michael Bell for this section!) | ||||
938 | |||||
939 | For a more lengthy description of how this works, go to | ||||
940 | http://www.paulgraham.com/ and see "A Plan for Spam". It's reasonably | ||||
941 | readable, even if statistics make me break out in hives. | ||||
942 | |||||
943 | The short semi-inaccurate version: Given training, a spam heuristics engine | ||||
944 | can take the most "spammy" and "hammy" words and apply probabilistic | ||||
945 | analysis. Furthermore, once given a basis for the analysis, the engine can | ||||
946 | continue to learn iteratively by applying both the non-Bayesian and Bayesian | ||||
947 | rulesets together to create evolving "intelligence". | ||||
948 | |||||
949 | SpamAssassin 2.50 and later supports Bayesian spam analysis, in | ||||
950 | the form of the BAYES rules. This is a new feature, quite powerful, | ||||
951 | and is disabled until enough messages have been learnt. | ||||
952 | |||||
953 | The pros of Bayesian spam analysis: | ||||
954 | |||||
955 | =over 4 | ||||
956 | |||||
957 | =item Can greatly reduce false positives and false negatives. | ||||
958 | |||||
959 | It learns from your mail, so it is tailored to your unique e-mail flow. | ||||
960 | |||||
961 | =item Once it starts learning, it can continue to learn from SpamAssassin | ||||
962 | and improve over time. | ||||
963 | |||||
964 | =back | ||||
965 | |||||
966 | And the cons: | ||||
967 | |||||
968 | =over 4 | ||||
969 | |||||
970 | =item A decent number of messages are required before results are useful | ||||
971 | for ham/spam determination. | ||||
972 | |||||
973 | =item It's hard to explain why a message is or isn't marked as spam. | ||||
974 | |||||
975 | For example, a straightforward rule that matches, say, "VIAGRA" is | ||||
976 | easy to understand. If it generates a false positive or false negative, | ||||
977 | it is fairly easy to understand why. | ||||
978 | |||||
979 | With Bayesian analysis, it's all probabilities - "because the past says | ||||
980 | it is likely as this falls into a probabilistic distribution common to past | ||||
981 | spam in your systems". Tell that to your users! Tell that to the client | ||||
982 | when he asks "what can I do to change this". (By the way, the answer in | ||||
983 | this case is "use whitelisting".) | ||||
984 | |||||
985 | =item It will take disk space and memory. | ||||
986 | |||||
987 | The databases it maintains take quite a lot of resources to store and use. | ||||
988 | |||||
989 | =back | ||||
990 | |||||
991 | =head1 GETTING STARTED | ||||
992 | |||||
993 | Still interested? OK, here are the guidelines for getting this working. | ||||
994 | |||||
995 | First a high-level overview: | ||||
996 | |||||
997 | =over 4 | ||||
998 | |||||
999 | =item Build a significant sample of both ham and spam. | ||||
1000 | |||||
1001 | I suggest several thousand of each, placed in SPAM and HAM directories or | ||||
1002 | mailboxes. Yes, you MUST hand-sort this - otherwise the results won't be much | ||||
1003 | better than SpamAssassin on its own. Verify the spamminess/hamminess of EVERY | ||||
1004 | message. You're urged to avoid using a publicly available corpus (sample) - | ||||
1005 | this must be taken from YOUR mail server, if it is to be statistically useful. | ||||
1006 | Otherwise, the results may be pretty skewed. | ||||
1007 | |||||
1008 | =item Use this tool to teach SpamAssassin about these samples, like so: | ||||
1009 | |||||
1010 | sa-learn --spam /path/to/spam/folder | ||||
1011 | sa-learn --ham /path/to/ham/folder | ||||
1012 | ... | ||||
1013 | |||||
1014 | Let SpamAssassin proceed, learning stuff. When it finds ham and spam | ||||
1015 | it will add the "interesting tokens" to the database. | ||||
1016 | |||||
1017 | =item If you need SpamAssassin to forget about specific messages, use | ||||
1018 | the B<--forget> option. | ||||
1019 | |||||
1020 | This can be applied to either ham or spam that has run through the | ||||
1021 | B<sa-learn> processes. It's a bit of a hammer, really, lowering the | ||||
1022 | weighting of the specific tokens in that message (only if that message has | ||||
1023 | been processed before). | ||||
1024 | |||||
1025 | =item Learning from single messages uses a command like this: | ||||
1026 | |||||
1027 | sa-learn --ham --no-sync mailmessage | ||||
1028 | |||||
1029 | This is handy for binding to a key in your mail user agent. It's very fast, as | ||||
1030 | all the time-consuming stuff is deferred until you run with the C<--sync> | ||||
1031 | option. | ||||
1032 | |||||
1033 | =item Autolearning is enabled by default | ||||
1034 | |||||
1035 | If you don't have a corpus of mail saved to learn, you can let | ||||
1036 | SpamAssassin automatically learn the mail that you receive. If you are | ||||
1037 | autolearning from scratch, the amount of mail you receive will determine | ||||
1038 | how long until the BAYES_* rules are activated. | ||||
1039 | |||||
1040 | =back | ||||
1041 | |||||
1042 | =head1 EFFECTIVE TRAINING | ||||
1043 | |||||
1044 | Learning filters require training to be effective. If you don't train | ||||
1045 | them, they won't work. In addition, you need to train them with new | ||||
1046 | messages regularly to keep them up-to-date, or their data will become | ||||
1047 | stale and impact accuracy. | ||||
1048 | |||||
1049 | You need to train with both spam I<and> ham mails. One type of mail | ||||
1050 | alone will not have any effect. | ||||
1051 | |||||
1052 | Note that if your mail folders contain things like forwarded spam, | ||||
1053 | discussions of spam-catching rules, etc., this will cause trouble. You | ||||
1054 | should avoid scanning those messages if possible. (An easy way to do this | ||||
1055 | is to move them aside, into a folder which is not scanned.) | ||||
1056 | |||||
1057 | If the messages you are learning from have already been filtered through | ||||
1058 | SpamAssassin, the learner will compensate for this. In effect, it learns what | ||||
1059 | each message would look like if you had run C<spamassassin -d> over it in | ||||
1060 | advance. | ||||
1061 | |||||
1062 | Another thing to be aware of is that you should typically aim to train | ||||
1063 | with at least 1000 messages of spam, and 1000 ham messages, if | ||||
1064 | possible. More is better, but anything over about 5000 messages does not | ||||
1065 | improve accuracy significantly in our tests. | ||||
1066 | |||||
1067 | Be careful that you train from the same source -- for example, if you train | ||||
1068 | on old spam, but new ham mail, then the classifier will think that | ||||
1069 | a mail with an old date stamp is likely to be spam. | ||||
1070 | |||||
1071 | It's also worth noting that training with a very small quantity of | ||||
1072 | ham will produce atrocious results. You should aim to train with at | ||||
1073 | least the same amount (or more if possible!) of ham data as spam. | ||||
1074 | |||||
1075 | On an on-going basis, it is best to keep training the filter to make | ||||
1076 | sure it has fresh data to work from. There are various ways to do | ||||
1077 | this: | ||||
1078 | |||||
1079 | =over 4 | ||||
1080 | |||||
1081 | =item 1. Supervised learning | ||||
1082 | |||||
1083 | This means keeping a copy of all or most of your mail, separated into spam | ||||
1084 | and ham piles, and periodically re-training using those. It produces | ||||
1085 | the best results, but requires more work from you, the user. | ||||
1086 | |||||
1087 | (An easy way to do this, by the way, is to create a new folder for | ||||
1088 | 'deleted' messages, and instead of deleting them from other folders, | ||||
1089 | simply move them in there instead. Then keep all spam in a separate | ||||
1090 | folder and never delete it. As long as you remember to move misclassified | ||||
1091 | mails into the correct folder set, it is easy enough to keep up to date.) | ||||
1092 | |||||
1093 | =item 2. Unsupervised learning from Bayesian classification | ||||
1094 | |||||
1095 | Another way to train is to chain the results of the Bayesian classifier | ||||
1096 | back into the training, so it reinforces its own decisions. This is only | ||||
1097 | safe if you then retrain it based on any errors you discover. | ||||
1098 | |||||
1099 | SpamAssassin does not support this method, due to experimental results | ||||
1100 | which strongly indicate that it does not work well, and since Bayes is | ||||
1101 | only one part of the resulting score presented to the user (while Bayes | ||||
1102 | may have made the wrong decision about a mail, it may have been overridden | ||||
1103 | by another system). | ||||
1104 | |||||
1105 | =item 3. Unsupervised learning from SpamAssassin rules | ||||
1106 | |||||
1107 | Also called 'auto-learning' in SpamAssassin. Based on statistical | ||||
1108 | analysis of the SpamAssassin success rates, we can automatically train the | ||||
1109 | Bayesian database with a certain degree of confidence that our training | ||||
1110 | data is accurate. | ||||
1111 | |||||
1112 | It should be supplemented with some supervised training in addition, if | ||||
1113 | possible. | ||||
1114 | |||||
1115 | This is the default, but can be turned off by setting the SpamAssassin | ||||
1116 | configuration parameter C<bayes_auto_learn> to 0. | ||||
1117 | |||||
1118 | =item 4. Mistake-based training | ||||
1119 | |||||
1120 | This means training on a small number of mails, then only training on | ||||
1121 | messages that SpamAssassin classifies incorrectly. This works, but it | ||||
1122 | takes longer to get it right than a full training session would. | ||||
1123 | |||||
1124 | =back | ||||
1125 | |||||
1126 | =head1 FILES | ||||
1127 | |||||
1128 | B<sa-learn> and the other parts of SpamAssassin's Bayesian learner, | ||||
1129 | use a set of persistent database files to store the learnt tokens, as follows. | ||||
1130 | |||||
1131 | =over 4 | ||||
1132 | |||||
1133 | =item bayes_toks | ||||
1134 | |||||
1135 | The database of tokens, containing the tokens learnt, their count of | ||||
1136 | occurrences in ham and spam, and the timestamp when the token was last | ||||
1137 | seen in a message. | ||||
1138 | |||||
1139 | This database also contains some 'magic' tokens, as follows: the version | ||||
1140 | number of the database, the number of ham and spam messages learnt, the | ||||
1141 | number of tokens in the database, and timestamps of: the last journal | ||||
1142 | sync, the last expiry run, the last expiry token reduction count, the | ||||
1143 | last expiry timestamp delta, the oldest token timestamp in the database, | ||||
1144 | and the newest token timestamp in the database. | ||||
1145 | |||||
1146 | This is a database file, using C<DB_File>. The database 'version | ||||
1147 | number' is 0 for databases from 2.5x, 1 for databases from certain 2.6x | ||||
1148 | development releases, 2 for 2.6x, and 3 for 3.0 and later releases. | ||||
1149 | |||||
1150 | =item bayes_seen | ||||
1151 | |||||
1152 | A map of Message-Id and some data from headers and body to what that | ||||
1153 | message was learnt as. This is used so that SpamAssassin can avoid | ||||
1154 | re-learning a message it has already seen, and so it can reverse the | ||||
1155 | training if you later decide that message was learnt incorrectly. | ||||
1156 | |||||
1157 | This is a database file, using C<DB_File>. | ||||
1158 | |||||
1159 | =item bayes_journal | ||||
1160 | |||||
1161 | While SpamAssassin is scanning mails, it needs to track which tokens | ||||
1162 | it uses in its calculations. To avoid the contention of having each | ||||
1163 | SpamAssassin process attempting to gain write access to the Bayes DB, | ||||
1164 | the token timestamps are written to a 'journal' file which will later | ||||
1165 | (either automatically or via C<sa-learn --sync>) be used to synchronize | ||||
1166 | the Bayes DB. | ||||
1167 | |||||
1168 | Also, through the use of C<bayes_learn_to_journal>, or when using the | ||||
1169 | C<--no-sync> option with sa-learn, the actual learning data will | ||||
1170 | be placed into the journal for later synchronization. This is typically | ||||
1171 | useful for high-traffic sites to avoid the same contention as stated | ||||
1172 | above. | ||||
1173 | |||||
1174 | =back | ||||
1175 | |||||
1176 | =head1 EXPIRATION | ||||
1177 | |||||
1178 | Since SpamAssassin can auto-learn messages, the Bayes database files | ||||
1179 | could increase perpetually until they fill your disk. To control this, | ||||
1180 | SpamAssassin performs journal synchronization and bayes expiration | ||||
1181 | periodically when certain criteria (listed below) are met. | ||||
1182 | |||||
1183 | SpamAssassin can sync the journal and expire the DB tokens either | ||||
1184 | manually or opportunistically. A journal sync is due if I<--sync> | ||||
1185 | is passed to sa-learn (manual), or if the following is true | ||||
1186 | (opportunistic): | ||||
1187 | |||||
1188 | =over 4 | ||||
1189 | |||||
1190 | =item - bayes_journal_max_size does not equal 0 (0 means don't sync) | ||||
1191 | |||||
1192 | =item - the journal file exists | ||||
1193 | |||||
1194 | =back | ||||
1195 | |||||
1196 | and either: | ||||
1197 | |||||
1198 | =over 4 | ||||
1199 | |||||
1200 | =item - the journal file has a size greater than bayes_journal_max_size | ||||
1201 | |||||
1202 | =back | ||||
1203 | |||||
1204 | or | ||||
1205 | |||||
1206 | =over 4 | ||||
1207 | |||||
1208 | =item - a journal sync has previously occurred, and at least 1 day has | ||||
1209 | passed since that sync | ||||
1210 | |||||
1211 | =back | ||||
1212 | |||||
1213 | Expiry is due if I<--force-expire> is passed to sa-learn (manual), | ||||
1214 | or if all of the following are true (opportunistic): | ||||
1215 | |||||
1216 | =over 4 | ||||
1217 | |||||
1218 | =item - the last expire was attempted at least 12hrs ago | ||||
1219 | |||||
1220 | =item - bayes_auto_expire does not equal 0 | ||||
1221 | |||||
1222 | =item - the number of tokens in the DB is > 100,000 | ||||
1223 | |||||
1224 | =item - the number of tokens in the DB is > bayes_expiry_max_db_size | ||||
1225 | |||||
1226 | =item - there is at least a 12 hr difference between the oldest and newest token atimes | ||||
1227 | |||||
1228 | =back | ||||
1229 | |||||
1230 | =head2 EXPIRE LOGIC | ||||
1231 | |||||
1232 | If either the manual or opportunistic method causes an expire run | ||||
1233 | to start, here is the logic that is used: | ||||
1234 | |||||
1235 | =over 4 | ||||
1236 | |||||
1237 | =item - figure out how many tokens to keep: take the larger of | ||||
1238 | either bayes_expiry_max_db_size * 75% or 100,000 tokens. The goal | ||||
1239 | reduction is then the current number of tokens minus the number of tokens to keep. | ||||
1240 | |||||
1241 | =item - if the reduction number is < 1000 tokens, abort (not worth the effort). | ||||
1242 | |||||
1243 | =item - if an expire has been done before, guesstimate the new | ||||
1244 | atime delta based on the old atime delta. (new_atime_delta = | ||||
1245 | old_atime_delta * old_reduction_count / goal) | ||||
1246 | |||||
1247 | =item - if no expire has been done before, or the last expire looks | ||||
1248 | "weird", do an estimation pass. The definition of "weird" is: | ||||
1249 | |||||
1250 | =over 8 | ||||
1251 | |||||
1252 | =item - last expire over 30 days ago | ||||
1253 | |||||
1254 | =item - last atime delta was < 12 hrs | ||||
1255 | |||||
1256 | =item - last reduction count was < 1000 tokens | ||||
1257 | |||||
1258 | =item - estimated new atime delta is < 12 hrs | ||||
1259 | |||||
1260 | =item - the difference between the last reduction count and the goal reduction count is > 50% | ||||
1261 | |||||
1262 | =back | ||||
1263 | |||||
1264 | =back | ||||
1265 | |||||
1266 | =head2 ESTIMATION PASS LOGIC | ||||
1267 | |||||
1268 | Go through each of the DB's tokens. Starting at 12hrs, calculate | ||||
1269 | whether or not the token would be expired (based on the difference | ||||
1270 | between the token's atime and the db's newest token atime) and keep | ||||
1271 | the count. Work out from 12hrs exponentially by powers of 2. ie: | ||||
1272 | 12hrs * 1, 12hrs * 2, 12hrs * 4, 12hrs * 8, and so on, up to 12hrs | ||||
1273 | * 512 (6144hrs, or 256 days). | ||||
1274 | |||||
1275 | The larger the delta, the smaller the number of tokens that will | ||||
1276 | be expired. Conversely, the number of tokens goes up as the delta | ||||
1277 | gets smaller. So starting at the largest atime delta, figure out | ||||
1278 | which delta will expire the most tokens without going above the | ||||
1279 | goal expiration count. Use this to choose the atime delta to use, | ||||
1280 | unless one of the following occurs: | ||||
1281 | |||||
1282 | =over 8 | ||||
1283 | |||||
1284 | =item - the largest atime delta (smallest reduction count) would expire | ||||
1285 | too many tokens. This means the learned tokens are mostly old and | ||||
1286 | new tokens need to be learned before an expire can | ||||
1287 | occur. | ||||
1288 | |||||
1289 | =item - all of the atime delta choices result in 0 tokens being removed. | ||||
1290 | This means the tokens are all newer than 12 hours and new tokens | ||||
1291 | need to be learned before an expire can occur. | ||||
1292 | |||||
1293 | =item - the number of tokens that would be removed is < 1000. The | ||||
1294 | benefit isn't worth the effort. More tokens need to be learned. | ||||
1295 | |||||
1296 | =back | ||||
1297 | |||||
1298 | If the expire run gets past this point, it will continue to the end. | ||||
1299 | A new DB is created since the majority of DB libraries don't shrink the | ||||
1300 | DB file when tokens are removed. So we do the "create new, migrate old | ||||
1301 | to new, remove old, rename new" shuffle. | ||||
1302 | |||||
1303 | =head2 EXPIRY RELATED CONFIGURATION SETTINGS | ||||
1304 | |||||
1305 | =over 4 | ||||
1306 | |||||
1307 | =item C<bayes_auto_expire> is used to specify whether or not SpamAssassin | ||||
1308 | ought to opportunistically attempt to expire the Bayes database. | ||||
1309 | The default is 1 (yes). | ||||
1310 | |||||
1311 | =item C<bayes_expiry_max_db_size> specifies both the auto-expire token | ||||
1312 | count point and the resulting number of tokens after expiry, | ||||
1313 | as described above. The default value is 150,000, which is roughly | ||||
1314 | equivalent to a 6MB database file if you're using DB_File. | ||||
1315 | |||||
1316 | =item C<bayes_journal_max_size> specifies how large the Bayes | ||||
1317 | journal will grow before it is opportunistically synced. The | ||||
1318 | default value is 102400. | ||||
1319 | |||||
1320 | =back | ||||
1321 | |||||
1322 | =head1 INSTALLATION | ||||
1323 | |||||
1324 | The B<sa-learn> command is part of the B<Mail::SpamAssassin> Perl module. | ||||
1325 | Install this as a normal Perl module, using C<perl -MCPAN -e shell>, | ||||
1326 | or by hand. | ||||
1327 | |||||
1328 | =head1 SEE ALSO | ||||
1329 | |||||
1330 | spamassassin(1) | ||||
1331 | spamc(1) | ||||
1332 | Mail::SpamAssassin(3) | ||||
1333 | Mail::SpamAssassin::ArchiveIterator(3) | ||||
1334 | |||||
1335 | E<lt>http://www.paulgraham.com/E<gt> | ||||
1336 | Paul Graham's "A Plan For Spam" paper | ||||
1337 | |||||
1338 | E<lt>http://www.linuxjournal.com/article/6467E<gt> | ||||
1339 | Gary Robinson's f(x) and combining algorithms, as used in SpamAssassin | ||||
1340 | |||||
1341 | E<lt>http://www.bgl.nu/~glouis/bogofilter/E<gt> | ||||
1342 | 'Training on error' page. A discussion of various Bayes training regimes, | ||||
1343 | including 'train on error' and unsupervised training. | ||||
1344 | |||||
1345 | =head1 PREREQUISITES | ||||
1346 | |||||
1347 | C<Mail::SpamAssassin> | ||||
1348 | |||||
1349 | =head1 AUTHORS | ||||
1350 | |||||
1351 | The SpamAssassin(tm) Project E<lt>http://spamassassin.apache.org/E<gt> | ||||
1352 | |||||
1353 | =cut | ||||
1354 | |||||
# spent 9.15ms within Encode::XS::decode which was called 1038 times, avg 9µs/call:
# 1038 times (9.15ms+0s) by Net::DNS::Domain::_decode_ascii at line 299 of Net/DNS/Domain.pm, avg 9µs/call | |||||
# spent 970µs within Internals::SvREADONLY which was called 148 times, avg 7µs/call:
# 146 times (955µs+0s) by constant::import at line 164 of constant.pm, avg 7µs/call
# once (12µs+0s) by constant::BEGIN@24 at line 33 of constant.pm
# once (2µs+0s) by constant::BEGIN@24 at line 34 of constant.pm | |||||
# spent 127µs within UNIVERSAL::VERSION which was called 6 times, avg 21µs/call:
# once (30µs+0s) by NetAddr::IP::BEGIN@8 at line 8 of NetAddr/IP.pm
# once (23µs+0s) by Encode::BEGIN@12 at line 12 of Encode.pm
# once (22µs+0s) by Pod::Simple::BEGIN@8 at line 8 of Pod/Simple.pm
# once (19µs+0s) by Mail::SpamAssassin::Util::BEGIN@76 at line 76 of Mail/SpamAssassin/Util.pm
# once (19µs+0s) by Mail::SpamAssassin::NetSet::BEGIN@26 at line 26 of Mail/SpamAssassin/NetSet.pm
# once (14µs+0s) by NetAddr::IP::BEGIN@9 at line 21 of NetAddr/IP.pm | |||||
# spent 16.7ms within UNIVERSAL::can which was called 3017 times, avg 6µs/call:
# 1968 times (9.54ms+0s) by Mail::SpamAssassin::DnsResolver::new_dns_packet at line 602 of Mail/SpamAssassin/DnsResolver.pm, avg 5µs/call
# 324 times (2.44ms+0s) by Mail::SpamAssassin::PluginHandler::have_callback at line 166 of Mail/SpamAssassin/PluginHandler.pm, avg 8µs/call
# 234 times (2.02ms+0s) by Mail::SpamAssassin::Message::Metadata::parse_received_headers at line 272 of Mail/SpamAssassin/Message/Metadata/Received.pm, avg 9µs/call
# 234 times (1.07ms+0s) by Mail::SpamAssassin::Message::Metadata::parse_received_headers at line 278 of Mail/SpamAssassin/Message/Metadata/Received.pm, avg 5µs/call
# 189 times (915µs+0s) by Mail::SpamAssassin::HTML::parse at line 250 of Mail/SpamAssassin/HTML.pm, avg 5µs/call
# 55 times (570µs+0s) by Mail::SpamAssassin::Conf::Parser::cond_clause_can_or_has at line 595 of Mail/SpamAssassin/Conf/Parser.pm, avg 10µs/call
# 6 times (37µs+0s) by Mail::SpamAssassin::Util::reverse_ip_address at line 906 of Mail/SpamAssassin/Util.pm, avg 6µs/call
# 3 times (44µs+0s) by IO::Socket::SSL::BEGIN@389 at line 399 of IO/Socket/SSL.pm, avg 15µs/call
# once (6µs+0s) by Mail::SpamAssassin::DnsResolver::configured_nameservers at line 213 of Mail/SpamAssassin/DnsResolver.pm
# once (6µs+0s) by Mail::SpamAssassin::DnsResolver::configured_nameservers at line 212 of Mail/SpamAssassin/DnsResolver.pm
# once (5µs+0s) by Mail::SpamAssassin::AsyncLoop::BEGIN@49 at line 52 of Mail/SpamAssassin/AsyncLoop.pm
# once (5µs+0s) by Net::DNS::Domain::BEGIN@54 at line 1 of (eval 27)[Net/DNS/Domain.pm:54] | |||||
# spent 280µs within UNIVERSAL::isa which was called 57 times, avg 5µs/call:
# 27 times (139µs+0s) by base::import at line 97 of base.pm, avg 5µs/call
# 27 times (120µs+0s) by main::RUNTIME at line 243, avg 4µs/call
# 2 times (16µs+0s) by File::Path::mkpath at line 94 of File/Path.pm, avg 8µs/call
# once (5µs+0s) by Getopt::Long::GetOptionsFromArray at line 474 of Getopt/Long.pm | |||||
sub main::CORE:close; # opcode | |||||
# spent 47µs within main::CORE:ftis which was called 2 times, avg 24µs/call:
# 2 times (47µs+0s) by main::BEGIN@41 at line 46, avg 24µs/call | |||||
sub main::CORE:match; # opcode | |||||
# spent 142µs within main::CORE:pack which was called 24 times, avg 6µs/call:
# 2 times (12µs+0s) by Net::DNS::Resolver::Base::BEGIN@33 at line 297 of IO/Socket/INET6.pm, avg 6µs/call
# once (15µs+0s) by Net::DNS::RR::BEGIN@42 at line 50 of Net/DNS/Domain.pm
# once (12µs+0s) by NetAddr::IP::BEGIN@8 at line 201 of NetAddr/IP/Lite.pm
# once (9µs+0s) by Mail::SpamAssassin::PerMsgStatus::BEGIN@35 at line 319 of IO/Socket.pm
# once (8µs+0s) by NetAddr::IP::Lite::BEGIN@18 at line 153 of NetAddr/IP/Util.pm
# once (8µs+0s) by Net::DNS::Resolver::Base::BEGIN@1.1 at line 523 of IO/Socket/IP.pm
# once (8µs+0s) by NetAddr::IP::Lite::BEGIN@18 at line 200 of NetAddr/IP/Util.pm
# once (8µs+0s) by Net::DNS::RR::BEGIN@43 at line 72 of Net/DNS/DomainName.pm
# once (7µs+0s) by NetAddr::IP::Lite::BEGIN@9 at line 256 of NetAddr/IP/InetBase.pm
# once (7µs+0s) by NetAddr::IP::Lite::BEGIN@9 at line 244 of NetAddr/IP/InetBase.pm
# once (6µs+0s) by Net::DNS::Resolver::Base::BEGIN@57 at line 763 of Net/DNS/Packet.pm
# once (5µs+0s) by NetAddr::IP::BEGIN@8 at line 1420 of NetAddr/IP/Lite.pm
# once (5µs+0s) by NetAddr::IP::BEGIN@8 at line 416 of NetAddr/IP/Lite.pm
# once (4µs+0s) by Net::DNS::RR::OPT::CLIENT_SUBNET::BEGIN@240 at line 52 of Net/DNS/RR/A.pm
# once (4µs+0s) by NetAddr::IP::BEGIN@8 at line 683 of NetAddr/IP/Lite.pm
# once (4µs+0s) by NetAddr::IP::BEGIN@8 at line 206 of NetAddr/IP/Lite.pm
# once (3µs+0s) by NetAddr::IP::BEGIN@8 at line 202 of NetAddr/IP/Lite.pm
# once (3µs+0s) by NetAddr::IP::Lite::BEGIN@18 at line 201 of NetAddr/IP/Util.pm
# once (3µs+0s) by Net::DNS::RR::BEGIN@43 at line 213 of Net/DNS/DomainName.pm
# once (3µs+0s) by NetAddr::IP::BEGIN@8 at line 685 of NetAddr/IP/Lite.pm
# once (3µs+0s) by NetAddr::IP::BEGIN@8 at line 684 of NetAddr/IP/Lite.pm
# once (3µs+0s) by NetAddr::IP::Lite::BEGIN@9 at line 245 of NetAddr/IP/InetBase.pm
# once (3µs+0s) by NetAddr::IP::BEGIN@8 at line 204 of NetAddr/IP/Lite.pm | |||||
# spent 32µs within main::CORE:print which was called:
# once (32µs+0s) by main::RUNTIME at line 485 | |||||
# spent 569µs within mro::method_changed_in which was called 147 times, avg 4µs/call:
# 147 times (569µs+0s) by constant::import at line 198 of constant.pm, avg 4µs/call | |||||
# spent 109µs within utf8::encode which was called 26 times, avg 4µs/call:
# 24 times (98µs+0s) by base::__ANON__[/usr/local/lib/perl5/5.24/base.pm:77] at line 75 of base.pm, avg 4µs/call
# once (8µs+0s) by Pod::Simple::LinkSection::BEGIN@9 at line 41 of Pod/Simple/BlackBox.pm
# once (3µs+0s) by Encode::encode_utf8 at line 231 of Encode.pm | |||||
# spent 17.0ms within utf8::is_utf8 which was called 3936 times, avg 4µs/call:
# 1968 times (9.24ms+0s) by Mail::SpamAssassin::DnsResolver::new_dns_packet at line 549 of Mail/SpamAssassin/DnsResolver.pm, avg 5µs/call
# 1968 times (7.72ms+0s) by Mail::SpamAssassin::Util::decode_dns_question_entry at line 940 of Mail/SpamAssassin/Util.pm, avg 4µs/call |