Profile of /usr/local/bin/sa-learn

Filename	/usr/local/bin/sa-learn
Statements	Executed 4827 statements in 65.7ms

Subroutines
Calls	P	F	Exclusive Time	Inclusive Time	Subroutine
234	1	1	31.1ms	715s	main::::wanted main::wanted
1	1	1	20.7ms	39.0ms	main::::BEGIN@24 main::BEGIN@24
1	1	1	18.7ms	670ms	main::::BEGIN@65 main::BEGIN@65
3936	2	2	17.0ms	17.0ms	utf8::::is_utf8 utf8::is_utf8 (xsub)
3017	12	9	16.7ms	16.7ms	UNIVERSAL::::can UNIVERSAL::can (xsub)
1	1	1	11.8ms	17.4ms	main::::BEGIN@66 main::BEGIN@66
1038	1	1	9.15ms	9.15ms	Encode::XS::::decode Encode::XS::decode (xsub)
1	1	1	6.25ms	6.30ms	main::::BEGIN@20 main::BEGIN@20
1	1	1	5.67ms	105ms	main::::BEGIN@25 main::BEGIN@25
234	1	1	4.17ms	4.18ms	main::::result main::result
1	1	1	3.38ms	10.1ms	main::::BEGIN@23 main::BEGIN@23
1	1	1	2.65ms	8.03ms	main::::BEGIN@69 main::BEGIN@69
1	1	1	1.45ms	4.49ms	main::::BEGIN@68 main::BEGIN@68
1	1	1	1.33ms	1.87ms	main::::BEGIN@39 main::BEGIN@39
1	1	1	1.15ms	1.33ms	main::::BEGIN@19 main::BEGIN@19
148	3	1	970µs	970µs	Internals::::SvREADONLY Internals::SvREADONLY (xsub)
1	1	1	612µs	628µs	main::::BEGIN@21 main::BEGIN@21
147	1	1	569µs	569µs	mro::::method_changed_in mro::method_changed_in (xsub)
57	4	4	280µs	280µs	UNIVERSAL::::isa UNIVERSAL::isa (xsub)
24	23	10	142µs	142µs	main::::CORE:pack main::CORE:pack (opcode)
6	6	5	127µs	127µs	UNIVERSAL::::VERSION UNIVERSAL::VERSION (xsub)
26	3	3	109µs	109µs	utf8::::encode utf8::encode (xsub)
2	1	1	56µs	56µs	main::::target main::target
1	1	1	56µs	170µs	main::::BEGIN@41 main::BEGIN@41
2	1	1	47µs	47µs	main::::CORE:ftis main::CORE:ftis (opcode)
1	1	1	32µs	32µs	main::::CORE:print main::CORE:print (opcode)
4	2	1	24µs	24µs	main::::CORE:match main::CORE:match (opcode)
1	1	1	24µs	212µs	main::::BEGIN@70 main::BEGIN@70
1	1	1	22µs	586µs	main::::BEGIN@28 main::BEGIN@28
1	1	1	21µs	21µs	main::::BEGIN@67 main::BEGIN@67
1	1	1	20µs	20µs	main::::BEGIN@26 main::BEGIN@26
2	2	1	12µs	12µs	main::::CORE:close main::CORE:close (opcode)
1	1	1	10µs	10µs	main::::init_results main::init_results
1	1	1	5µs	5µs	main::::__ANON__[:94] main::__ANON__[:94]
0	0	0	0s	0s	main::::RUNTIME main::RUNTIME
0	0	0	0s	0s	main::::__ANON__[:112] main::__ANON__[:112]
0	0	0	0s	0s	main::::__ANON__[:130] main::__ANON__[:130]
0	0	0	0s	0s	main::::__ANON__[:131] main::__ANON__[:131]
0	0	0	0s	0s	main::::__ANON__[:132] main::__ANON__[:132]
0	0	0	0s	0s	main::::__ANON__[:133] main::__ANON__[:133]
0	0	0	0s	0s	main::::__ANON__[:134] main::__ANON__[:134]
0	0	0	0s	0s	main::::__ANON__[:93] main::__ANON__[:93]
0	0	0	0s	0s	main::::__ANON__[:96] main::__ANON__[:96]
0	0	0	0s	0s	main::::killed main::killed
0	0	0	0s	0s	main::::usage main::usage

Call graph for these subroutines as a Graphviz dot language file.

Line	Statements	Time on line	Calls	Time in subs	Code
0			1	74µs	Profile data that couldn't be associated with a specific line: # spent 74µs making 1 call to Mail::SpamAssassin::Logger::END
1					#!/usr/local/bin/perl -T -w
2					# <@LICENSE>
3					# Licensed to the Apache Software Foundation (ASF) under one or more
4					# contributor license agreements. See the NOTICE file distributed with
5					# this work for additional information regarding copyright ownership.
6					# The ASF licenses this file to you under the Apache License, Version 2.0
7					# (the "License"); you may not use this file except in compliance with
8					# the License. You may obtain a copy of the License at:
9					#
10					# http://www.apache.org/licenses/LICENSE-2.0
11					#
12					# Unless required by applicable law or agreed to in writing, software
13					# distributed under the License is distributed on an "AS IS" BASIS,
14					# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15					# See the License for the specific language governing permissions and
16					# limitations under the License.
17					# </@LICENSE>
18
19	2	714µs	2	1.35ms	# spent 1.33ms (1.15+186µs) within main::BEGIN@19 which was called: # once (1.15ms+186µs) by main::NULL at line 19 use strict; # spent 1.33ms making 1 call to main::BEGIN@19 # spent 12µs making 1 call to strict::import
20	2	6.02ms	2	6.34ms	# spent 6.30ms (6.25+53µs) within main::BEGIN@20 which was called: # once (6.25ms+53µs) by main::NULL at line 20 use warnings; # spent 6.30ms making 1 call to main::BEGIN@20 # spent 42µs making 1 call to warnings::import
21	2	622µs	2	644µs	# spent 628µs (612+16) within main::BEGIN@21 which was called: # once (612µs+16µs) by main::NULL at line 21 use bytes; # spent 628µs making 1 call to main::BEGIN@21 # spent 16µs making 1 call to bytes::import
22
23	2	318µs	2	11.5ms	# spent 10.1ms (3.38+6.74) within main::BEGIN@23 which was called: # once (3.38ms+6.74ms) by main::NULL at line 23 use Errno qw(EBADF); # spent 10.1ms making 1 call to main::BEGIN@23 # spent 1.37ms making 1 call to Exporter::import
24	2	368µs	2	43.1ms	# spent 39.0ms (20.7+18.3) within main::BEGIN@24 which was called: # once (20.7ms+18.3ms) by main::NULL at line 24 use Getopt::Long; # spent 39.0ms making 1 call to main::BEGIN@24 # spent 4.08ms making 1 call to Getopt::Long::import
25	2	389µs	2	105ms	# spent 105ms (5.67+98.9) within main::BEGIN@25 which was called: # once (5.67ms+98.9ms) by main::NULL at line 25 use Pod::Usage; # spent 105ms making 1 call to main::BEGIN@25 # spent 352µs making 1 call to Exporter::import
26	2	89µs	1	20µs	# spent 20µs within main::BEGIN@26 which was called: # once (20µs+0s) by main::NULL at line 26 use File::Spec; # spent 20µs making 1 call to main::BEGIN@26
27
28	1	2µs			# spent 586µs (22+564) within main::BEGIN@28 which was called: # once (22µs+564µs) by main::NULL at line 33 use vars qw(
29					$spamtest %opt $isspam $forget
30					$messagecount $learnedcount $messagelimit
31					$progress $total_messages $init_results $start_time
32					$synconly $learnprob @targets $bayes_override_path
33	1	97µs	2	1.15ms	); # spent 586µs making 1 call to main::BEGIN@28 # spent 564µs making 1 call to vars::import
34
35	1	4µs			my $PREFIX = '/usr/local'; # substituted at 'make' time
36	1	2µs			my $DEF_RULES_DIR = '/usr/local/share/spamassassin'; # substituted at 'make' time
37	1	2µs			my $LOCAL_RULES_DIR = '/usr/local/etc/mail/spamassassin'; # substituted at 'make' time
38
39	2	620µs	2	2.26ms	# spent 1.87ms (1.33+546µs) within main::BEGIN@39 which was called: # once (1.33ms+546µs) by main::NULL at line 39 use lib '/usr/local/lib/perl5/site_perl'; # substituted at 'make' time # spent 1.87ms making 1 call to main::BEGIN@39 # spent 386µs making 1 call to lib::import
40
41					# spent 170µs (56+115) within main::BEGIN@41 which was called: # once (56µs+115µs) by main::NULL at line 63 BEGIN { # see comments in "spamassassin.raw" for doco
42	1	21µs	1	68µs	my @bin = File::Spec->splitpath($0); # spent 68µs making 1 call to File::Spec::Unix::splitpath
43	1	2µs			my $bin = ($bin[0] ? File::Spec->catpath(@bin[0..1]) : $bin[1])
44					\|\| File::Spec->curdir;
45
46	1	72µs	2	47µs	if (-e $bin.'/lib/Mail/SpamAssassin.pm' # spent 47µs making 2 calls to main::CORE:ftis, avg 24µs/call
47					\|\| !-e '/usr/local/lib/perl5/site_perl/Mail/SpamAssassin.pm' )
48					{
49					my $searchrelative;
50					if ($searchrelative && $bin eq '../' && -e '../blib/lib/Mail/SpamAssassin.pm')
51					{
52					unshift ( @INC, '../blib/lib' );
53					} else {
54					foreach ( qw(lib ../lib/site_perl
55					../lib/spamassassin ../share/spamassassin/lib))
56					{
57					my $dir = File::Spec->catdir( $bin, split ( '/', $_ ) );
58					if ( -f File::Spec->catfile( $dir, "Mail", "SpamAssassin.pm" ) )
59					{ unshift ( @INC, $dir ); last; }
60					}
61					}
62					}
63	1	75µs	1	170µs	} # spent 170µs making 1 call to main::BEGIN@41
64
65	2	433µs	1	670ms	# spent 670ms (18.7+651) within main::BEGIN@65 which was called: # once (18.7ms+651ms) by main::NULL at line 65 use Mail::SpamAssassin; # spent 670ms making 1 call to main::BEGIN@65
66	2	357µs	1	17.4ms	# spent 17.4ms (11.8+5.62) within main::BEGIN@66 which was called: # once (11.8ms+5.62ms) by main::NULL at line 66 use Mail::SpamAssassin::ArchiveIterator; # spent 17.4ms making 1 call to main::BEGIN@66
67	2	62µs	1	21µs	# spent 21µs within main::BEGIN@67 which was called: # once (21µs+0s) by main::NULL at line 67 use Mail::SpamAssassin::Message; # spent 21µs making 1 call to main::BEGIN@67
68	2	390µs	1	4.49ms	# spent 4.49ms (1.45+3.03) within main::BEGIN@68 which was called: # once (1.45ms+3.03ms) by main::NULL at line 68 use Mail::SpamAssassin::PerMsgLearner; # spent 4.49ms making 1 call to main::BEGIN@68
69	2	362µs	1	8.03ms	# spent 8.03ms (2.65+5.38) within main::BEGIN@69 which was called: # once (2.65ms+5.38ms) by main::NULL at line 69 use Mail::SpamAssassin::Util::Progress; # spent 8.03ms making 1 call to main::BEGIN@69
70	2	9.90ms	2	400µs	# spent 212µs (24+188) within main::BEGIN@70 which was called: # once (24µs+188µs) by main::NULL at line 70 use Mail::SpamAssassin::Logger; # spent 212µs making 1 call to main::BEGIN@70 # spent 188µs making 1 call to Exporter::import
71
72					###########################################################################
73
74	1	79µs			$SIG{PIPE} = 'IGNORE';
75
76					# used to be CmdLearn::cmd_run() ...
77
78	1	11µs			%opt = (
79					'force-expire' => 0,
80					'use-ignores' => 0,
81					'nosync' => 0,
82					'quiet' => 0,
83					'cf' => []
84					);
85
86	1	16µs	1	318µs	Getopt::Long::Configure( # spent 318µs making 1 call to Getopt::Long::Configure
87					qw(bundling no_getopt_compat
88					permute no_auto_abbrev no_ignore_case)
89					);
90
91					GetOptions(
92					'forget' => \$forget,
93					'ham\|nonspam' => sub { $isspam = 0; },
94	1	9µs			# spent 5µs within main::__ANON__[/usr/local/bin/sa-learn:94] which was called: # once (5µs+0s) by Getopt::Long::GetOptionsFromArray at line 605 of Getopt/Long.pm 'spam' => sub { $isspam = 1; },
95					'sync' => \$synconly,
96					'rebuild' => sub { $synconly = 1; warn "The --rebuild option has been deprecated. Please use --sync instead.\n" },
97
98					'q\|quiet' => \$opt{'quiet'},
99					'username\|u=s' => \$opt{'username'},
100					'configpath\|config-file\|config-dir\|c\|C=s' => \$opt{'configpath'},
101					'prefspath\|prefs-file\|p=s' => \$opt{'prefspath'},
102					'siteconfigpath=s' => \$opt{'siteconfigpath'},
103	1	4µs			'cf=s' => \@{$opt{'cf'}},
104
105					'folders\|f=s' => \$opt{'folders'},
106					'force-expire\|expire' => \$opt{'force-expire'},
107					'local\|L' => \$opt{'local'},
108					'no-sync\|nosync' => \$opt{'nosync'},
109					'showdots' => \$opt{'showdots'},
110					'progress' => \$opt{'progress'},
111					'use-ignores' => \$opt{'use-ignores'},
112					'no-rebuild\|norebuild' => sub { $opt{'nosync'} = 1; warn "The --no-rebuild option has been deprecated. Please use --no-sync instead.\n" },
113
114					'learnprob=f' => \$opt{'learnprob'},
115					'randseed=i' => \$opt{'randseed'},
116					'stopafter=i' => \$opt{'stopafter'},
117					'max-size=i' => \$opt{'max-size'},
118
119					'debug\|debug-level\|D:s' => \$opt{'debug'},
120					'help\|h\|?' => \$opt{'help'},
121					'version\|V' => \$opt{'version'},
122
123					'dump:s' => \$opt{'dump'},
124					'import' => \$opt{'import'},
125
126					'backup' => \$opt{'backup'},
127					'clear' => \$opt{'clear'},
128					'restore=s' => \$opt{'restore'},
129
130					'dir' => sub { $opt{'old_format'} = 'dir'; },
131					'file' => sub { $opt{'old_format'} = 'file'; },
132					'mbox' => sub { $opt{'format'} = 'mbox'; },
133					'mbx' => sub { $opt{'format'} = 'mbx'; },
134					'single' => sub { $opt{'old_format'} = 'single'; },
135
136					'db\|dbpath=s' => \$bayes_override_path,
137	1	83µs	1	29µs	're\|regexp=s' => \$opt{'regexp'}, # spent 29µs making 1 call to Getopt::Long::GetOptions
138
139					'<>' => \&target,
140					)
141					or usage( 0, "Unknown option!" );
142
143	1	3µs			if ( defined $opt{'help'} ) {
144					usage( 0, "For more information read the manual page" );
145					}
146	1	2µs			if ( defined $opt{'version'} ) {
147					print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";
148					exit 0;
149					}
150
151					# set debug areas, if any specified (only useful for command-line tools)
152	1	2µs			if (defined $opt{'debug'}) {
153					$opt{'debug'} \|\|= 'all';
154					}
155
156	1	2µs			if ( $opt{'force-expire'} ) {
157					$synconly = 1;
158					}
159
160	1	2µs			if ($opt{'showdots'} && $opt{'progress'}) {
161					print "--showdots and --progress may not be used together, please select just one\n";
162					exit 0;
163					}
164
165	1	2µs			if ( !defined $isspam
166					&& !defined $synconly
167					&& !defined $forget
168					&& !defined $opt{'dump'}
169					&& !defined $opt{'import'}
170					&& !defined $opt{'clear'}
171					&& !defined $opt{'backup'}
172					&& !defined $opt{'restore'}
173					&& !defined $opt{'folders'} )
174					{
175					usage( 0,
176					"Please select either --spam, --ham, --folders, --forget, --sync, --import,\n--dump, --clear, --backup or --restore"
177					);
178					}
179
180					# We need to make sure the journal syncs pre-forget...
181	1	2µs			if ( defined $forget && $opt{'nosync'} ) {
182					$opt{'nosync'} = 0;
183					warn
184					"sa-learn warning: --forget requires read/write access to the database, and is incompatible with --no-sync\n";
185					}
186
187	1	2µs			if ( defined $opt{'old_format'} ) {
188
189					#Format specified in the 2.5x form of --dir, --file, --mbox, --mbx or --single.
190					#Convert it to the new behavior:
191					if ( $opt{'old_format'} eq 'single' ) {
192					push ( @ARGV, '-' );
193					}
194					}
195
196	1	3µs			my $post_config = '';
197
198					# kluge to support old check_bayes_db operation
199					# bug 3799: init() will go r/o with the configured DB, and then dbpath needs
200					# to override. Just access the dbpath version via post_config_text.
201	1	2µs			if ( defined $bayes_override_path ) {
202					# Add a default prefix if the path is a directory
203					if ( -d $bayes_override_path ) {
204					$bayes_override_path = File::Spec->catfile( $bayes_override_path, 'bayes' );
205					}
206
207					$post_config .= "bayes_path $bayes_override_path\n";
208					}
209
210					# These options require bayes_scanner, which requires "use_bayes 1", but
211					# that's not necessary for these commands.
212	1	6µs			if (defined $opt{'dump'} \|\| defined $opt{'import'} \|\| defined $opt{'clear'} \|\|
213					defined $opt{'backup'} \|\| defined $opt{'restore'}) {
214					$post_config .= "use_bayes 1\n";
215					}
216
217	2	11µs			$post_config .= join("\n", @{$opt{'cf'}})."\n";
218
219					# create the tester factory
220					$spamtest = new Mail::SpamAssassin(
221					{
222					rules_filename => $opt{'configpath'},
223					site_rules_filename => $opt{'siteconfigpath'},
224					userprefs_filename => $opt{'prefspath'},
225					username => $opt{'username'},
226					debug => $opt{'debug'},
227	1	32µs	1	48.7ms	local_tests_only => $opt{'local'}, # spent 48.7ms making 1 call to Mail::SpamAssassin::new
228					dont_copy_prefs => 1,
229					PREFIX => $PREFIX,
230					DEF_RULES_DIR => $DEF_RULES_DIR,
231					LOCAL_RULES_DIR => $LOCAL_RULES_DIR,
232					post_config_text => $post_config,
233					}
234					);
235
236	1	11µs	1	10.7s	$spamtest->init(1); # spent 10.7s making 1 call to Mail::SpamAssassin::init
237	1	8µs	1	7µs	dbg("sa-learn: spamtest initialized"); # spent 7µs making 1 call to Mail::SpamAssassin::Logger::dbg
238
239					# Bug 6228 hack: bridge the transition gap of moving Bayes.pm into a plugin;
240					# To be resolved more cleanly!!!
241	1	9µs			if ($spamtest->{bayes_scanner}) {
242	2	11µs			foreach my $plugin ( @{ $spamtest->{plugins}->{plugins} } ) {
243	27	476µs	27	120µs	if ($plugin->isa('Mail::SpamAssassin::Plugin::Bayes')) { # spent 120µs making 27 calls to UNIVERSAL::isa, avg 4µs/call
244					# copy plugin's "store" object ref one level up!
245	1	4µs			$spamtest->{bayes_scanner}->{store} = $plugin->{store};
246					}
247					}
248					}
249
250	1	9µs	1	22µs	if (Mail::SpamAssassin::Util::am_running_on_windows()) { # spent 22µs making 1 call to Mail::SpamAssassin::Util::am_running_on_windows
251					binmode(STDIN) or die "cannot set binmode on STDIN: $!"; # bug 4363
252					binmode(STDOUT) or die "cannot set binmode on STDOUT: $!";
253					}
254
255	1	4µs			if ( defined $opt{'dump'} ) {
256					my ( $magic, $toks );
257
258					if ( $opt{'dump'} eq 'all' \|\| $opt{'dump'} eq '' ) { # show us all tokens!
259					( $magic, $toks ) = ( 1, 1 );
260					}
261					elsif ( $opt{'dump'} eq 'magic' ) { # show us magic tokens only
262					( $magic, $toks ) = ( 1, 0 );
263					}
264					elsif ( $opt{'dump'} eq 'data' ) { # show us data tokens only
265					( $magic, $toks ) = ( 0, 1 );
266					}
267					else { # unknown option
268					warn "Unknown dump option '" . $opt{'dump'} . "'\n";
269					$spamtest->finish_learner();
270					exit 1;
271					}
272
273					if (!$spamtest->dump_bayes_db( $magic, $toks, $opt{'regexp'}) ) {
274					$spamtest->finish_learner();
275					die "ERROR: Bayes dump returned an error, please re-run with -D for more information\n";
276					}
277
278					$spamtest->finish_learner();
279					# make sure we notice any write errors while flushing output buffer
280					close STDOUT or die "error closing STDOUT: $!";
281					close STDIN or die "error closing STDIN: $!";
282					exit 0;
283					}
284
285	1	3µs			if ( defined $opt{'import'} ) {
286					my $ret = $spamtest->{bayes_scanner}->{store}->perform_upgrade();
287					$spamtest->finish_learner();
288					# make sure we notice any write errors while flushing output buffer
289					close STDOUT or die "error closing STDOUT: $!";
290					close STDIN or die "error closing STDIN: $!";
291					exit( !$ret );
292					}
293
294	1	3µs			if (defined $opt{'clear'}) {
295					unless ($spamtest->{bayes_scanner}->{store}->clear_database()) {
296					$spamtest->finish_learner();
297					die "ERROR: Bayes clear returned an error, please re-run with -D for more information\n";
298					}
299
300					$spamtest->finish_learner();
301					# make sure we notice any write errors while flushing output buffer
302					close STDOUT or die "error closing STDOUT: $!";
303					close STDIN or die "error closing STDIN: $!";
304					exit 0;
305					}
306
307	1	2µs			if (defined $opt{'backup'}) {
308					unless ($spamtest->{bayes_scanner}->{store}->backup_database()) {
309					$spamtest->finish_learner();
310					die "ERROR: Bayes backup returned an error, please re-run with -D for more information\n";
311					}
312
313					$spamtest->finish_learner();
314					# make sure we notice any write errors while flushing output buffer
315					close STDOUT or die "error closing STDOUT: $!";
316					close STDIN or die "error closing STDIN: $!";
317					exit 0;
318					}
319
320	1	3µs			if (defined $opt{'restore'}) {
321
322					my $filename = $opt{'restore'};
323
324					unless ($filename) {
325					$spamtest->finish_learner();
326					die "ERROR: You must specify a filename to restore.\n";
327					}
328
329					unless ($spamtest->{bayes_scanner}->{store}->restore_database($filename, $opt{'showdots'})) {
330					$spamtest->finish_learner();
331					die "ERROR: Bayes restore returned an error, please re-run with -D for more information\n";
332					}
333
334					$spamtest->finish_learner();
335					# make sure we notice any write errors while flushing output buffer
336					close STDOUT or die "error closing STDOUT: $!";
337					close STDIN or die "error closing STDIN: $!";
338					exit 0;
339					}
340
341	1	4µs			if ( !$spamtest->{conf}->{use_bayes} ) {
342					warn "ERROR: configuration specifies 'use_bayes 0', sa-learn disabled\n";
343					exit 1;
344					}
345
346					$spamtest->init_learner(
347					{
348					force_expire => $opt{'force-expire'},
349	1	23µs	1	201µs	learn_to_journal => $opt{'nosync'}, # spent 201µs making 1 call to Mail::SpamAssassin::init_learner
350					wait_for_lock => 1,
351					caller_will_untie => 1
352					}
353					);
354
355	1	5µs			$spamtest->{bayes_scanner}{use_ignores} = $opt{'use-ignores'};
356
357	1	2µs			if ($synconly) {
358					$spamtest->rebuild_learner_caches(
359					{
360					verbose => !$opt{'quiet'},
361					showdots => $opt{'showdots'}
362					}
363					);
364					$spamtest->finish_learner();
365					# make sure we notice any write errors while flushing output buffer
366					close STDOUT or die "error closing STDOUT: $!";
367					close STDIN or die "error closing STDIN: $!";
368					exit 0;
369					}
370
371	1	4µs			$messagelimit = $opt{'stopafter'};
372	1	3µs			$learnprob = $opt{'learnprob'};
373
374	1	3µs			if ( defined $opt{'randseed'} ) {
375					srand( $opt{'randseed'} );
376					}
377
378					# sync the journal first if we're going to go r/w so we make sure to
379					# learn everything before doing anything else.
380					#
381	1	2µs			if ( !$opt{nosync} ) {
382					$spamtest->rebuild_learner_caches();
383					}
384
385					# what is the result of the run? will end up being the exit code.
386	1	3µs			my $exit_status = 0;
387
388					# run this lot in an eval block, so we can catch die's and clear
389					# up the dbs.
390					eval {
391	1	18µs			$SIG{HUP} = \&killed;
392	1	7µs			$SIG{INT} = \&killed;
393	1	8µs			$SIG{TERM} = \&killed;
394
395	1	2µs			if ( $opt{folders} ) {
396					open( F, $opt{folders} ) or die "cannot open $opt{folders}: $!";
397					for ($!=0; <F>; $!=0) {
398					chomp;
399					next if /^\s*$/;
400					if (/^(?:ham\|spam):\w*:/) {
401					push ( @targets, $_ );
402					}
403					else {
404					target($_);
405					}
406					}
407					defined $_ \|\| $!==0 or
408					$!==EBADF ? dbg("error reading from $opt{folders}: $!")
409					: die "error reading from $opt{folders}: $!";
410					close(F) or die "error closing $opt{folders}: $!";
411					}
412
413					###########################################################################
414					# Deal with the target listing, and STDIN -> tempfile
415
416	1	2µs			my $tempfile; # will be defined if stdin -> tempfile
417	1	4µs			push(@targets, @ARGV);
418	1	2µs			@targets = ('-') unless @targets \|\| $opt{folders};
419
420	1	19µs			for(my $elem = 0; $elem <= $#targets; $elem++) {
421					# ArchiveIterator doesn't really like STDIN, so if "-" is specified
422					# as a target, make it a temp file instead.
423	2	32µs	2	13µs	if ( $targets[$elem] =~ /(?:^\|:)-$/ ) { # spent 13µs making 2 calls to main::CORE:match, avg 7µs/call
424					if (defined $tempfile) {
425					# uh-oh, stdin specified multiple times?
426					warn "skipping extra stdin target (".$targets[$elem].")\n";
427					splice @targets, $elem, 1;
428					$elem--; # go back to this element again
429					next;
430					}
431					else {
432					my $handle;
433					( $tempfile, $handle ) = Mail::SpamAssassin::Util::secure_tmpfile();
434					binmode $handle or die "cannot set binmode on file $tempfile: $!";
435
436					# avoid slurping the whole file into memory, copy chunk by chunk
437					my($inbuf,$nread);
438					while ( $nread=sysread(STDIN,$inbuf,16384) )
439					{ print {$handle} $inbuf or die "error writing to $tempfile: $!" }
440					defined $nread or die "error reading from STDIN: $!";
441					close $handle or die "error closing $tempfile: $!";
442
443					# re-aim the targets at the tempfile instead of STDIN
444					$targets[$elem] =~ s/-$/$tempfile/;
445					}
446					}
447
448					# make sure the target list is in the normal AI format
449	2	32µs	2	11µs	if ($targets[$elem] !~ /^[^:]*:[a-z]+:/) { # spent 11µs making 2 calls to main::CORE:match, avg 6µs/call
450					my $item = splice @targets, $elem, 1;
451					target($item); # add back to the list
452					$elem--; # go back to this element again
453					next;
454					}
455					}
456
457					###########################################################################
458
459					my $iter = new Mail::SpamAssassin::ArchiveIterator(
460					{
461					# skip messages larger than max-size bytes,
462					# 0 for no limit, undef defaults to 256 KB
463					'opt_max_size' => $opt{'max-size'},
464					'opt_want_date' => 0,
465					'opt_from_regex' => $spamtest->{conf}->{mbox_format_from_regex},
466					}
467	1	30µs	1	47µs	); # spent 47µs making 1 call to Mail::SpamAssassin::ArchiveIterator::new
468
469	1	12µs	1	15µs	$iter->set_functions(\&wanted, \&result); # spent 15µs making 1 call to Mail::SpamAssassin::ArchiveIterator::set_functions
470	1	3µs			$messagecount = 0;
471	1	2µs			$learnedcount = 0;
472
473	1	2µs			$init_results = 0;
474	1	4µs			$start_time = time;
475
476					# if exit_status isn't already set to non-zero, set it to the reverse of the
477					# run result (0 is bad, 1+ is good -- the opposite of exit status codes)
478	3	19µs	1	715s	my $run_ok = eval { $exit_status \|\|= ! $iter->run(@targets); 1 }; # spent 715s making 1 call to Mail::SpamAssassin::ArchiveIterator::run
479
480	1	3µs			print STDERR "\n" if ($opt{showdots});
481	1	2µs			$progress->final() if ($opt{progress} && $progress);
482
483	1	4µs			my $phrase = defined $forget ? "Forgot" : "Learned";
484					print "$phrase tokens from $learnedcount message(s) ($messagecount message(s) examined)\n"
485	1	54µs	1	32µs	if !$opt{'quiet'}; # spent 32µs making 1 call to main::CORE:print
486
487					# If we needed to make a tempfile, go delete it.
488	1	2µs			if (defined $tempfile) {
489					unlink $tempfile or die "cannot unlink temporary file $tempfile: $!";
490					undef $tempfile;
491					}
492
493	1	2µs			if (!$run_ok && $@ !~ /HITLIMIT/) { die $@ }
494	1	29µs			1;
495	1	5µs			} or do {
496					my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
497					$spamtest->finish_learner();
498					die $eval_stat;
499					};
500
501	1	11µs	1	5.67s	$spamtest->finish_learner(); # spent 5.67s making 1 call to Mail::SpamAssassin::finish_learner
502					# make sure we notice any write errors while flushing output buffer
503	1	26µs	1	8µs	close STDOUT or die "error closing STDOUT: $!"; # spent 8µs making 1 call to main::CORE:close
504	1	13µs	1	4µs	close STDIN or die "error closing STDIN: $!"; # spent 4µs making 1 call to main::CORE:close
505	1	87µs			exit $exit_status;
506
507					###########################################################################
508
509					sub killed {
510					$spamtest->finish_learner();
511					die "interrupted";
512					}
513
514					# spent 56µs within main::target which was called 2 times, avg 28µs/call: # 2 times (56µs+0s) by Getopt::Long::GetOptionsFromArray at line 737 of Getopt/Long.pm, avg 28µs/call sub target {
515	2	5µs			my ($target) = @_;
516
517	2	14µs			my $class = ( $isspam ? "spam" : "ham" );
518	2	5µs			my $format = ( defined( $opt{'format'} ) ? $opt{'format'} : "detect" );
519
520	2	10.1ms			push ( @targets, "$class:$format:$target" );
521					}
522
523					###########################################################################
524
525					# spent 10µs within main::init_results which was called: # once (10µs+0s) by main::result at line 541 sub init_results {
526	1	2µs			$init_results = 1;
527
528	1	10µs			return unless $opt{'progress'};
529
530					$total_messages = $Mail::SpamAssassin::ArchiveIterator::MESSAGES;
531
532					$progress = Mail::SpamAssassin::Util::Progress->new({total => $total_messages,});
533					}
534
535					###########################################################################
536
537					# spent 4.18ms (4.17+10µs) within main::result which was called 234 times, avg 18µs/call: # 234 times (4.17ms+10µs) by Mail::SpamAssassin::ArchiveIterator::_run at line 326 of Mail/SpamAssassin/ArchiveIterator.pm, avg 18µs/call sub result {
538	234	1.34ms			my ($class, $result, $time) = @_;
539
540					# don't open results files until we get here to avoid overwriting files
541	234	638µs	1	10µs	&init_results if !$init_results; # spent 10µs making 1 call to main::init_results
542
543	234	2.27ms			$progress->update($messagecount) if ($opt{progress} && $progress);
544					}
545
546					###########################################################################
547
548					# spent 715s (31.1ms+715) within main::wanted which was called 234 times, avg 3.05s/call: # 234 times (31.1ms+715s) by Mail::SpamAssassin::ArchiveIterator::_run_file at line 414 of Mail/SpamAssassin/ArchiveIterator.pm, avg 3.05s/call sub wanted {
549	234	2.05ms			my ( $class, $id, $time, $dataref ) = @_;
550
551	234	860µs			my $spam = $class eq "s" ? 1 : 0;
552
553	234	578µs			if ( defined($learnprob) ) {
554					if ( int( rand( 1 / $learnprob ) ) != 0 ) {
555					print STDERR '_' if ( $opt{showdots} );
556					return 1;
557					}
558					}
559
560	234	505µs			if ( defined($messagelimit) && $learnedcount > $messagelimit ) {
561					$progress->final() if ($opt{progress} && $progress);
562					die 'HITLIMIT';
563					}
564
565	234	530µs			$messagecount++;
566	234	2.80ms	234	4.74s	my $ma = $spamtest->parse($dataref); # spent 4.74s making 234 calls to Mail::SpamAssassin::parse, avg 20.2ms/call
567
568	234	2.29ms	234	18.6ms	if ( $ma->get_header("X-Spam-Checker-Version") ) { # spent 18.6ms making 234 calls to Mail::SpamAssassin::Message::Node::get_header, avg 80µs/call
569					my $new_ma = $spamtest->parse($spamtest->remove_spamassassin_markup($ma), 1);
570					$ma->finish();
571					$ma = $new_ma;
572					}
573
574	234	2.45ms	234	710s	my $status = $spamtest->learn( $ma, undef, $spam, $forget ); # spent 710s making 234 calls to Mail::SpamAssassin::learn, avg 3.03s/call
575	234	2.51ms	234	2.29ms	my $learned = $status->did_learn(); # spent 2.29ms making 234 calls to Mail::SpamAssassin::PerMsgLearner::did_learn, avg 10µs/call
576
577	234	1.21ms			if ( !defined $learned ) { # undef=learning unavailable
578					die "ERROR: the Bayes learn function returned an error, please re-run with -D for more information\n";
579					}
580					elsif ( $learned == 1 ) { # 1=message was learned. 0=message wasn't learned
581	234	598µs			$learnedcount++;
582					}
583
584					# Do cleanup ...
585	234	2.04ms	234	3.59ms	$status->finish(); # spent 3.59ms making 234 calls to Mail::SpamAssassin::PerMsgLearner::finish, avg 15µs/call
586	234	937µs			undef $status;
587
588	234	2.21ms	234	124ms	$ma->finish(); # spent 124ms making 234 calls to Mail::SpamAssassin::Message::finish, avg 530µs/call
589	234	3.50ms	79	1.45ms	undef $ma; # spent 1.45ms making 79 calls to Mail::SpamAssassin::Message::DESTROY, avg 18µs/call
590
591	234	946µs			print STDERR '.' if ( $opt{showdots} );
592	234	3.08ms			return 1;
593					}
594
595					###########################################################################
596
597					sub usage {
598					my ( $verbose, $message ) = @_;
599					my $ver = Mail::SpamAssassin::Version();
600					print "SpamAssassin version $ver\n";
601					pod2usage( -verbose => $verbose, -message => $message, -exitval => 64 );
602					}
603
604					# ---------------------------------------------------------------------------
605
606					=head1 NAME
607
608					sa-learn - train SpamAssassin's Bayesian classifier
609
610					=head1 SYNOPSIS
611
612					B<sa-learn> [options] [file]...
613
614					B<sa-learn> [options] --dump [ all \| data \| magic ]
615
616					Options:
617
618					--ham Learn messages as ham (non-spam)
619					--spam Learn messages as spam
620					--forget Forget a message
621					--use-ignores Use bayes_ignore_from and bayes_ignore_to
622					--sync Synchronize the database and the journal if needed
623					--force-expire Force a database sync and expiry run
624					--dbpath <path> Allows commandline override (in bayes_path form)
625					for where to read the Bayes DB from
626					--dump [all\|data\|magic] Display the contents of the Bayes database
627					Takes optional argument for what to display
628					--regexp <re> For dump only, specifies which tokens to
629					dump based on a regular expression.
630					-f file, --folders=file Read list of files/directories from file
631					--dir Ignored; historical compatibility
632					--file Ignored; historical compatibility
633					--mbox Input sources are in mbox format
634					--mbx Input sources are in mbx format
635					--max-size <b> Skip messages larger than b bytes;
636					defaults to 256 KB, 0 implies no limit
637					--showdots Show progress using dots
638					--progress Show progress using progress bar
639					--no-sync Skip synchronizing the database and journal
640					after learning
641					-L, --local Operate locally, no network accesses
642					--import Migrate data from older version/non DB_File
643					based databases
644					--clear Wipe out existing database
645					--backup Backup, to STDOUT, existing database
646					--restore <filename> Restore a database from filename
647					-u username, --username=username
648					Override username taken from the runtime
649					environment, used with SQL
650					-C path, --configpath=path, --config-file=path
651					Path to standard configuration dir
652					-p prefs, --prefspath=file, --prefs-file=file
653					Set user preferences file
654					--siteconfigpath=path Path for site configs
655					(default: /etc/mail/spamassassin)
656					--cf='config line' Additional line of configuration
657					-D, --debug [area=n,...] Print debugging messages
658					-V, --version Print version
659					-h, --help Print usage message
660
661					=head1 DESCRIPTION
662
663					Given a typical selection of your incoming mail classified as spam or ham
664					(non-spam), this tool will feed each mail to SpamAssassin, allowing it
665					to 'learn' what signs are likely to mean spam, and which are likely to
666					mean ham.
667
668					Simply run this command once for each of your mail folders, and it will
669					'learn' from the mail therein.
670
671					Note that csh-style I<globbing> in the mail folder names is supported;
672					in other words, listing a folder name as C<*> will scan every folder
673					that matches. See C<Mail::SpamAssassin::ArchiveIterator> for more details.
674
675					SpamAssassin remembers which mail messages it has learnt already, and will not
676					re-learn those messages again, unless you use the B<--forget> option. Messages
677					learnt as spam will have SpamAssassin markup removed, on the fly.
678
679					If you make a mistake and scan a mail as ham when it is spam, or vice
680					versa, simply rerun this command with the correct classification, and the
681					mistake will be corrected. SpamAssassin will automatically 'forget' the
682					previous indications.
683
684					Users of C<spamd> who wish to perform training remotely, over a network,
685					should investigate the C<spamc -L> switch.
686
687					=head1 OPTIONS
688
689					=over 4
690
691					=item B<--ham>
692
693					Learn the input message(s) as ham. If you have previously learnt any of the
694					messages as spam, SpamAssassin will forget them first, then re-learn them as
695					ham. Alternatively, if you have previously learnt them as ham, it'll skip them
696					this time around. If the messages have already been filtered through
697					SpamAssassin, the learner will ignore any modifications SpamAssassin may have
698					made.
699
700					=item B<--spam>
701
702					Learn the input message(s) as spam. If you have previously learnt any of the
703					messages as ham, SpamAssassin will forget them first, then re-learn them as
704					spam. Alternatively, if you have previously learnt them as spam, it'll skip
705					them this time around. If the messages have already been filtered through
706					SpamAssassin, the learner will ignore any modifications SpamAssassin may have
707					made.
708
709					=item B<--folders>=I<filename>, B<-f> I<filename>
710
711					sa-learn will read in the list of folders from the specified file, one folder
712					per line in the file. If the folder is prefixed with C<ham:type:> or C<spam:type:>,
713					sa-learn will learn that folder appropriately, otherwise the folders will be
714					assumed to be of the type specified by B<--ham> or B<--spam>.
715
716					C<type> above is optional, but is the same as the standard for
717					ArchiveIterator: mbox, mbx, dir, file, or detect (the default if not
718					specified).
719
720					=item B<--mbox>
721
722					sa-learn will read in the file(s) containing the emails to be learned,
723					and will process them in mbox format (one or more emails per file).
724
725					=item B<--mbx>
726
727					sa-learn will read in the file(s) containing the emails to be learned,
728					and will process them in mbx format (one or more emails per file).
729
730					=item B<--use-ignores>
731
732					Don't learn the message if a from address matches configuration file
733					item C<bayes_ignore_from> or a to address matches C<bayes_ignore_to>.
734					The option might be used when learning from a large file of messages
735					from which the hammy spam messages or spammy ham messages have not
736					been removed.
737
738					=item B<--sync>
739
740					Synchronize the journal and databases. Upon successfully syncing the
741					database with the entries in the journal, the journal file is removed.
742
743					=item B<--force-expire>
744
745					Forces an expiry attempt, regardless of whether it may be necessary
746					or not. Note: This doesn't mean any tokens will actually expire.
747					Please see the EXPIRATION section below.
748
749					Note: C<--force-expire> also causes the journal data to be synchronized
750					into the Bayes databases.
751
752					=item B<--forget>
753
754					Forget a given message previously learnt.
755
756					=item B<--dbpath>
757
758					Allows a commandline override of the I<bayes_path> configuration option.
759
760					=item B<--dump> I<option>
761
762					Display the contents of the Bayes database. Without an option or with
763					the I<all> option, all magic tokens and data tokens will be displayed.
764					I<magic> will only display magic tokens, and I<data> will only display
765					the data tokens.
766
767					Can also use the B<--regexp> I<RE> option to specify which tokens to
768					display based on a regular expression.
769
770					=item B<--clear>
771
772					Clear an existing Bayes database by removing all traces of the database.
773
774					WARNING: This is destructive and should be used with care.
775
776					=item B<--backup>
777
778					Performs a dump of the Bayes database in machine/human readable format.
779
780					The dump will include token and seen data. It is suitable for input back
781					into the --restore command.
782
783					=item B<--restore>=I<filename>
784
785					Performs a restore of the Bayes database defined by I<filename>.
786
787					WARNING: This is a destructive operation, previous Bayes data will be wiped out.
788
789					=item B<-h>, B<--help>
790
791					Print help message and exit.
792
793					=item B<-u> I<username>, B<--username>=I<username>
794
795					If specified this username will override the username taken from the runtime
796					environment. You can use this option to specify users in a virtual user
797					configuration when using SQL as the Bayes backend.
798
799					NOTE: This option will not change to the given I<username>, it will only attempt
800					to act on behalf of that user. Because of this you will need to have proper
801					permissions to be able to change files owned by I<username>. In the case of SQL
802					this generally is not a problem.
803
804					=item B<-C> I<path>, B<--configpath>=I<path>, B<--config-file>=I<path>
805
806					Use the specified path for locating the distributed configuration files.
807					Ignore the default directories (usually C</usr/share/spamassassin> or similar).
808
809					=item B<--siteconfigpath>=I<path>
810
811					Use the specified path for locating site-specific configuration files. Ignore
812					the default directories (usually C</etc/mail/spamassassin> or similar).
813
814					=item B<--cf='config line'>
815
816					Add additional lines of configuration directly from the command-line, parsed
817					after the configuration files are read. Multiple B<--cf> arguments can be
818					used, and each will be considered a separate line of configuration.
819
820					=item B<-p> I<prefs>, B<--prefspath>=I<prefs>, B<--prefs-file>=I<prefs>
821
822					Read user score preferences from I<prefs> (usually C<$HOME/.spamassassin/user_prefs>).
823
824					=item B<--progress>
825
826					Prints a progress bar (to STDERR) showing the current progress. In the case
827					where no valid terminal is found this option will behave very much like the
828					--showdots option.
829
830					=item B<-D> [I<area,...>], B<--debug> [I<area,...>]
831
832					Produce debugging output. If no areas are listed, all debugging information is
833					printed. Diagnostic output can also be enabled for each area individually;
834					I<area> is the area of the code to instrument. For example, to produce
835					diagnostic output on bayes, learn, and dns, use:
836
837					spamassassin -D bayes,learn,dns
838
839					For more information about which areas (also known as channels) are available,
840					please see the documentation at:
841
842					C<http://wiki.apache.org/spamassassin/DebugChannels>
843
844					Higher priority informational messages that are suitable for logging in normal
845					circumstances are available with an area of "info".
846
847					=item B<--no-sync>
848
849					Skip the slow synchronization step which normally takes place after
850					changing database entries. If you plan to learn from many folders in
851					a batch, or to learn many individual messages one-by-one, it is faster
852					to use this switch and run C<sa-learn --sync> once all the folders have
853					been scanned.
854
855					Clarification: The state of I<--no-sync> overrides the
856					I<bayes_learn_to_journal> configuration option. If not specified,
857					sa-learn will learn to the database directly. If specified, sa-learn
858					will learn to the journal file.
859
860					Note: I<--sync> and I<--no-sync> can be specified on the same commandline,
861					which is slightly confusing. In this case, the I<--no-sync> option is
862					ignored since there is no learn operation.
863
864					=item B<-L>, B<--local>
865
866					Do not perform any network accesses while learning details about the mail
867					messages. This will speed up the learning process, but may result in a
868					slightly lower accuracy.
869
870					Note that this is currently ignored, as current versions of SpamAssassin will
871					not perform network access while learning; but future versions may.
872
873					=item B<--import>
874
875					If you previously used SpamAssassin's Bayesian learner without the C<DB_File>
876					module installed, it will have created files in other formats, such as
877					C<GDBM_File>, C<NDBM_File>, or C<SDBM_File>. This switch allows you to migrate
878					that old data into the C<DB_File> format. It will overwrite any data currently
879					in the C<DB_File>.
880
881					Can also be used with the B<--dbpath> I<path> option to specify the location of
882					the Bayes files to use.
883
884					=back
885
886					=head1 MIGRATION
887
888					There are now multiple backend storage modules available for storing
889					users' Bayesian data. As such you might want to migrate from one
890					backend to another. Here is a simple procedure for migrating from one
891					backend to another.
892
893					Note that if you have individual user databases you will have to
894					perform a similar procedure for each one of them.
895
896					=over 4
897
898					=item sa-learn --sync
899
900					This will sync any outstanding journal entries
901
902					=item sa-learn --backup > backup.txt
903
904					This will save all your Bayes data to a plain text file.
905
906					=item sa-learn --clear
907
908					This is optional, but good to do to clear out the old database.
909
910					=item Repeat!
911
912					At this point, if you have multiple databases, you should perform the
913					procedure above for each of them. (i.e. each user's database needs to
914					be backed up before continuing.)
915
916					=item Switch backends
917
918					Once you have backed up all databases you can update your
919					configuration for the new database backend. This will involve at least
920					the bayes_store_module config option and may involve some additional
921					config options depending on what is required by the module. (For
922					example, you may need to configure an SQL database.)
923
924					=item sa-learn --restore backup.txt
925
926					Again, you need to do this for every database.
927
928					=back
929
930					If you are migrating to SQL you can make use of the -u <username>
931					option in sa-learn to populate each user's database. Otherwise, you
932					must run sa-learn as the user whose database you are restoring.
933
934
935					=head1 INTRODUCTION TO BAYESIAN FILTERING
936
937					(Thanks to Michael Bell for this section!)
938
939					For a more lengthy description of how this works, go to
940					http://www.paulgraham.com/ and see "A Plan for Spam". It's reasonably
941					readable, even if statistics make me break out in hives.
942
943					The short semi-inaccurate version: Given training, a spam heuristics engine
944					can take the most "spammy" and "hammy" words and apply probabilistic
945					analysis. Furthermore, once given a basis for the analysis, the engine can
946					continue to learn iteratively by applying both the non-Bayesian and Bayesian
947					rulesets together to create evolving "intelligence".
948
949					SpamAssassin 2.50 and later supports Bayesian spam analysis, in
950					the form of the BAYES rules. This is a new feature, quite powerful,
951					and is disabled until enough messages have been learnt.
952
953					The pros of Bayesian spam analysis:
954
955					=over 4
956
957					=item Can greatly reduce false positives and false negatives.
958
959					It learns from your mail, so it is tailored to your unique e-mail flow.
960
961					=item Once it starts learning, it can continue to learn from SpamAssassin
962					and improve over time.
963
964					=back
965
966					And the cons:
967
968					=over 4
969
970					=item A decent number of messages are required before results are useful
971					for ham/spam determination.
972
973					=item It's hard to explain why a message is or isn't marked as spam.
974
975					i.e.: a straightforward rule, that matches, say, "VIAGRA" is
976					easy to understand. If it generates a false positive or false negative,
977					it is fairly easy to understand why.
978
979					With Bayesian analysis, it's all probabilities - "because the past says
980					it is likely as this falls into a probabilistic distribution common to past
981					spam in your systems". Tell that to your users! Tell that to the client
982					when he asks "what can I do to change this". (By the way, the answer in
983					this case is "use whitelisting".)
984
985					=item It will take disk space and memory.
986
987					The databases it maintains take quite a lot of resources to store and use.
988
989					=back
990
991					=head1 GETTING STARTED
992
993					Still interested? Ok, here are the guidelines for getting this working.
994
995					First a high-level overview:
996
997					=over 4
998
999					=item Build a significant sample of both ham and spam.
1000
1001					I suggest several thousand of each, placed in SPAM and HAM directories or
1002					mailboxes. Yes, you MUST hand-sort this - otherwise the results won't be much
1003					better than SpamAssassin on its own. Verify the spamminess/haminess of EVERY
1004					message. You're urged to avoid using a publicly available corpus (sample) -
1005					this must be taken from YOUR mail server, if it is to be statistically useful.
1006					Otherwise, the results may be pretty skewed.
1007
1008					=item Use this tool to teach SpamAssassin about these samples, like so:
1009
1010					sa-learn --spam /path/to/spam/folder
1011					sa-learn --ham /path/to/ham/folder
1012					...
1013
1014					Let SpamAssassin proceed, learning stuff. When it finds ham and spam
1015					it will add the "interesting tokens" to the database.
1016
1017					=item If you need SpamAssassin to forget about specific messages, use
1018					the B<--forget> option.
1019
1020					This can be applied to either ham or spam that has run through the
1021					B<sa-learn> processes. It's a bit of a hammer, really, lowering the
1022					weighting of the specific tokens in that message (only if that message has
1023					been processed before).
1024
1025					=item Learning from single messages uses a command like this:
1026
1027					sa-learn --ham --no-sync mailmessage
1028
1029					This is handy for binding to a key in your mail user agent. It's very fast, as
1030					all the time-consuming stuff is deferred until you run with the C<--sync>
1031					option.
1032
1033					=item Autolearning is enabled by default
1034
1035					If you don't have a corpus of mail saved to learn, you can let
1036					SpamAssassin automatically learn the mail that you receive. If you are
1037					autolearning from scratch, the amount of mail you receive will determine
1038					how long until the BAYES_* rules are activated.
1039
1040					=back
1041
1042					=head1 EFFECTIVE TRAINING
1043
1044					Learning filters require training to be effective. If you don't train
1045					them, they won't work. In addition, you need to train them with new
1046					messages regularly to keep them up-to-date, or their data will become
1047					stale and impact accuracy.
1048
1049					You need to train with both spam I<and> ham mails. One type of mail
1050					alone will not have any effect.
1051
1052					Note that if your mail folders contain things like forwarded spam,
1053					discussions of spam-catching rules, etc., this will cause trouble. You
1054					should avoid scanning those messages if possible. (An easy way to do this
1055					is to move them aside, into a folder which is not scanned.)
1056
1057					If the messages you are learning from have already been filtered through
1058					SpamAssassin, the learner will compensate for this. In effect, it learns what
1059					each message would look like if you had run C<spamassassin -d> over it in
1060					advance.
1061
1062					Another thing to be aware of is that you should typically aim to train
1063					with at least 1000 messages of spam, and 1000 ham messages, if
1064					possible. More is better, but anything over about 5000 messages does not
1065					improve accuracy significantly in our tests.
1066
1067					Be careful that you train from the same source -- for example, if you train
1068					on old spam, but new ham mail, then the classifier will think that
1069					a mail with an old date stamp is likely to be spam.
1070
1071					It's also worth noting that training with a very small quantity of
1072					ham will produce atrocious results. You should aim to train with at
1073					least as much ham data as spam (or more if possible!).
1074
1075					On an on-going basis, it is best to keep training the filter to make
1076					sure it has fresh data to work from. There are various ways to do
1077					this:
1078
1079					=over 4
1080
1081					=item 1. Supervised learning
1082
1083					This means keeping a copy of all or most of your mail, separated into spam
1084					and ham piles, and periodically re-training using those. It produces
1085					the best results, but requires more work from you, the user.
1086
1087					(An easy way to do this, by the way, is to create a new folder for
1088					'deleted' messages, and instead of deleting them from other folders,
1089					simply move them in there instead. Then keep all spam in a separate
1090					folder and never delete it. As long as you remember to move misclassified
1091					mails into the correct folder set, it is easy enough to keep up to date.)
1092
1093					=item 2. Unsupervised learning from Bayesian classification
1094
1095					Another way to train is to chain the results of the Bayesian classifier
1096					back into the training, so it reinforces its own decisions. This is only
1097					safe if you then retrain it based on any errors you discover.
1098
1099					SpamAssassin does not support this method, due to experimental results
1100					which strongly indicate that it does not work well, and since Bayes is
1101					only one part of the resulting score presented to the user (while Bayes
1102					may have made the wrong decision about a mail, it may have been overridden
1103					by another system).
1104
1105					=item 3. Unsupervised learning from SpamAssassin rules
1106
1107					Also called 'auto-learning' in SpamAssassin. Based on statistical
1108					analysis of the SpamAssassin success rates, we can automatically train the
1109					Bayesian database with a certain degree of confidence that our training
1110					data is accurate.
1111
1112					It should be supplemented with some supervised training in addition, if
1113					possible.
1114
1115					This is the default, but can be turned off by setting the SpamAssassin
1116					configuration parameter C<bayes_auto_learn> to 0.
1117
1118					=item 4. Mistake-based training
1119
1120					This means training on a small number of mails, then only training on
1121					messages that SpamAssassin classifies incorrectly. This works, but it
1122					takes longer to get it right than a full training session would.
1123
1124					=back
1125
1126					=head1 FILES
1127
1128					B<sa-learn> and the other parts of SpamAssassin's Bayesian learner
1129					use a set of persistent database files to store the learnt tokens, as follows.
1130
1131					=over 4
1132
1133					=item bayes_toks
1134
1135					The database of tokens, containing the tokens learnt, their count of
1136					occurrences in ham and spam, and the timestamp when the token was last
1137					seen in a message.
1138
1139					This database also contains some 'magic' tokens, as follows: the version
1140					number of the database, the number of ham and spam messages learnt, the
1141					number of tokens in the database, and timestamps of: the last journal
1142					sync, the last expiry run, the last expiry token reduction count, the
1143					last expiry timestamp delta, the oldest token timestamp in the database,
1144					and the newest token timestamp in the database.
1145
1146					This is a database file, using C<DB_File>. The database 'version
1147					number' is 0 for databases from 2.5x, 1 for databases from certain 2.6x
1148					development releases, 2 for 2.6x, and 3 for 3.0 and later releases.
1149
1150					=item bayes_seen
1151
1152					A map of Message-Id and some data from headers and body to what that
1153					message was learnt as. This is used so that SpamAssassin can avoid
1154					re-learning a message it has already seen, and so it can reverse the
1155					training if you later decide that message was learnt incorrectly.
1156
1157					This is a database file, using C<DB_File>.
1158
1159					=item bayes_journal
1160
1161					While SpamAssassin is scanning mails, it needs to track which tokens
1162					it uses in its calculations. To avoid the contention of having each
1163					SpamAssassin process attempting to gain write access to the Bayes DB,
1164					the token timestamps are written to a 'journal' file which will later
1165					(either automatically or via C<sa-learn --sync>) be used to synchronize
1166					the Bayes DB.
1167
1168					Also, through the use of C<bayes_learn_to_journal>, or when using the
1169					C<--no-sync> option with sa-learn, the actual learning data will
1170					be placed into the journal for later synchronization. This is typically
1171					useful for high-traffic sites to avoid the same contention as stated
1172					above.
1173
1174					=back
1175
1176					=head1 EXPIRATION
1177
1178					Since SpamAssassin can auto-learn messages, the Bayes database files
1179					could increase perpetually until they fill your disk. To control this,
1180					SpamAssassin performs journal synchronization and bayes expiration
1181					periodically when certain criteria (listed below) are met.
1182
1183					SpamAssassin can sync the journal and expire the DB tokens either
1184					manually or opportunistically. A journal sync is due if I<--sync>
1185					is passed to sa-learn (manual), or if the following is true
1186					(opportunistic):
1187
1188					=over 4
1189
1190					=item - bayes_journal_max_size does not equal 0 (a value of 0 means don't sync)
1191
1192					=item - the journal file exists
1193
1194					=back
1195
1196					and either:
1197
1198					=over 4
1199
1200					=item - the journal file has a size greater than bayes_journal_max_size
1201
1202					=back
1203
1204					or
1205
1206					=over 4
1207
1208					=item - a journal sync has previously occurred, and at least 1 day has
1209					passed since that sync
1210
1211					=back
1212
1213					Expiry is due if I<--force-expire> is passed to sa-learn (manual),
1214					or if all of the following are true (opportunistic):
1215
1216					=over 4
1217
1218					=item - the last expire was attempted at least 12hrs ago
1219
1220					=item - bayes_auto_expire does not equal 0
1221
1222					=item - the number of tokens in the DB is > 100,000
1223
1224					=item - the number of tokens in the DB is > bayes_expiry_max_db_size
1225
1226					=item - there is at least a 12 hr difference between the oldest and newest token atimes
1227
1228					=back
1229
1230					=head2 EXPIRE LOGIC
1231
1232					If either the manual or opportunistic method causes an expire run
1233					to start, here is the logic that is used:
1234
1235					=over 4
1236
1237					=item - figure out how many tokens to keep. take the larger of
1238					either bayes_expiry_max_db_size * 75% or 100,000 tokens. therefore, the goal
1239					reduction is number of tokens - number of tokens to keep.
1240
1241					=item - if the reduction number is < 1000 tokens, abort (not worth the effort).
1242
1243					=item - if an expire has been done before, guesstimate the new
1244					atime delta based on the old atime delta. (new_atime_delta =
1245					old_atime_delta * old_reduction_count / goal)
1246
1247					=item - if no expire has been done before, or the last expire looks
1248					"weird", do an estimation pass. The definition of "weird" is:
1249
1250					=over 8
1251
1252					=item - last expire over 30 days ago
1253
1254					=item - last atime delta was < 12 hrs
1255
1256					=item - last reduction count was < 1000 tokens
1257
1258					=item - estimated new atime delta is < 12 hrs
1259
1260					=item - the difference between the last reduction count and the goal reduction count is > 50%
1261
1262					=back
1263
1264					=back
1265
1266					=head2 ESTIMATION PASS LOGIC
1267
1268					Go through each of the DB's tokens. Starting at 12hrs, calculate
1269					whether or not the token would be expired (based on the difference
1270					between the token's atime and the db's newest token atime) and keep
1271					the count. Work out from 12hrs exponentially by powers of 2. ie:
1272					12hrs * 1, 12hrs * 2, 12hrs * 4, 12hrs * 8, and so on, up to 12hrs
1273					* 512 (6144hrs, or 256 days).
1274
1275					The larger the delta, the smaller the number of tokens that will
1276					be expired. Conversely, the number of tokens goes up as the delta
1277					gets smaller. So starting at the largest atime delta, figure out
1278					which delta will expire the most tokens without going above the
1279					goal expiration count. Use this to choose the atime delta to use,
1280					unless one of the following occurs:
1281
1282					=over 8
1283
1284					=item - the largest atime (smallest reduction count) would expire
1285					too many tokens. this means the learned tokens are mostly old and
1286					there needs to be new tokens learned before an expire can
1287					occur.
1288
1289					=item - all of the atime choices result in 0 tokens being removed.
1290					this means the tokens are all newer than 12 hours and there needs
1291					to be new tokens learned before an expire can occur.
1292
1293					=item - the number of tokens that would be removed is < 1000. the
1294					benefit isn't worth the effort. more tokens need to be learned.
1295
1296					=back
1297
1298					If the expire run gets past this point, it will continue to the end.
1299					A new DB is created since the majority of DB libraries don't shrink the
1300					DB file when tokens are removed. So we do the "create new, migrate old
1301					to new, remove old, rename new" shuffle.
1302
1303					=head2 EXPIRY RELATED CONFIGURATION SETTINGS
1304
1305					=over 4
1306
1307					=item C<bayes_auto_expire> is used to specify whether or not SpamAssassin
1308					ought to opportunistically attempt to expire the Bayes database.
1309					The default is 1 (yes).
1310
1311					=item C<bayes_expiry_max_db_size> specifies both the auto-expire token
1312					count point, as well as the resulting number of tokens after expiry
1313					as described above. The default value is 150,000, which is roughly
1314					equivalent to a 6Mb database file if you're using DB_File.
1315
1316					=item C<bayes_journal_max_size> specifies how large the Bayes
1317					journal will grow before it is opportunistically synced. The
1318					default value is 102400.
1319
1320					=back
1321
1322					=head1 INSTALLATION
1323
1324					The B<sa-learn> command is part of the B<Mail::SpamAssassin> Perl module.
1325					Install this as a normal Perl module, using C<perl -MCPAN -e shell>,
1326					or by hand.
1327
1328					=head1 SEE ALSO
1329
1330					spamassassin(1)
1331					spamc(1)
1332					Mail::SpamAssassin(3)
1333					Mail::SpamAssassin::ArchiveIterator(3)
1334
1335					E<lt>http://www.paulgraham.com/E<gt>
1336					Paul Graham's "A Plan For Spam" paper
1337
1338					E<lt>http://www.linuxjournal.com/article/6467E<gt>
1339					Gary Robinson's f(x) and combining algorithms, as used in SpamAssassin
1340
1341					E<lt>http://www.bgl.nu/~glouis/bogofilter/E<gt>
1342					'Training on error' page. A discussion of various Bayes training regimes,
1343					including 'train on error' and unsupervised training.
1344
1345					=head1 PREREQUISITES
1346
1347					C<Mail::SpamAssassin>
1348
1349					=head1 AUTHORS
1350
1351					The SpamAssassin(tm) Project E<lt>http://spamassassin.apache.org/E<gt>
1352
1353					=cut
1354

					# spent 9.15ms within Encode::XS::decode which was called 1038 times, avg 9µs/call: # 1038 times (9.15ms+0s) by Net::DNS::Domain::_decode_ascii at line 299 of Net/DNS/Domain.pm, avg 9µs/call sub Encode::XS::decode; # xsub
					# spent 970µs within Internals::SvREADONLY which was called 148 times, avg 7µs/call: # 146 times (955µs+0s) by constant::import at line 164 of constant.pm, avg 7µs/call # once (12µs+0s) by constant::BEGIN@24 at line 33 of constant.pm # once (2µs+0s) by constant::BEGIN@24 at line 34 of constant.pm sub Internals::SvREADONLY; # xsub
					# spent 127µs within UNIVERSAL::VERSION which was called 6 times, avg 21µs/call: # once (30µs+0s) by NetAddr::IP::BEGIN@8 at line 8 of NetAddr/IP.pm # once (23µs+0s) by Encode::BEGIN@12 at line 12 of Encode.pm # once (22µs+0s) by Pod::Simple::BEGIN@8 at line 8 of Pod/Simple.pm # once (19µs+0s) by Mail::SpamAssassin::Util::BEGIN@76 at line 76 of Mail/SpamAssassin/Util.pm # once (19µs+0s) by Mail::SpamAssassin::NetSet::BEGIN@26 at line 26 of Mail/SpamAssassin/NetSet.pm # once (14µs+0s) by NetAddr::IP::BEGIN@9 at line 21 of NetAddr/IP.pm sub UNIVERSAL::VERSION; # xsub
					# spent 16.7ms within UNIVERSAL::can which was called 3017 times, avg 6µs/call: # 1968 times (9.54ms+0s) by Mail::SpamAssassin::DnsResolver::new_dns_packet at line 602 of Mail/SpamAssassin/DnsResolver.pm, avg 5µs/call # 324 times (2.44ms+0s) by Mail::SpamAssassin::PluginHandler::have_callback at line 166 of Mail/SpamAssassin/PluginHandler.pm, avg 8µs/call # 234 times (2.02ms+0s) by Mail::SpamAssassin::Message::Metadata::parse_received_headers at line 272 of Mail/SpamAssassin/Message/Metadata/Received.pm, avg 9µs/call # 234 times (1.07ms+0s) by Mail::SpamAssassin::Message::Metadata::parse_received_headers at line 278 of Mail/SpamAssassin/Message/Metadata/Received.pm, avg 5µs/call # 189 times (915µs+0s) by Mail::SpamAssassin::HTML::parse at line 250 of Mail/SpamAssassin/HTML.pm, avg 5µs/call # 55 times (570µs+0s) by Mail::SpamAssassin::Conf::Parser::cond_clause_can_or_has at line 595 of Mail/SpamAssassin/Conf/Parser.pm, avg 10µs/call # 6 times (37µs+0s) by Mail::SpamAssassin::Util::reverse_ip_address at line 906 of Mail/SpamAssassin/Util.pm, avg 6µs/call # 3 times (44µs+0s) by IO::Socket::SSL::BEGIN@389 at line 399 of IO/Socket/SSL.pm, avg 15µs/call # once (6µs+0s) by Mail::SpamAssassin::DnsResolver::configured_nameservers at line 213 of Mail/SpamAssassin/DnsResolver.pm # once (6µs+0s) by Mail::SpamAssassin::DnsResolver::configured_nameservers at line 212 of Mail/SpamAssassin/DnsResolver.pm # once (5µs+0s) by Mail::SpamAssassin::AsyncLoop::BEGIN@49 at line 52 of Mail/SpamAssassin/AsyncLoop.pm # once (5µs+0s) by Net::DNS::Domain::BEGIN@54 at line 1 of (eval 27)[Net/DNS/Domain.pm:54] sub UNIVERSAL::can; # xsub
					# spent 280µs within UNIVERSAL::isa which was called 57 times, avg 5µs/call: # 27 times (139µs+0s) by base::import at line 97 of base.pm, avg 5µs/call # 27 times (120µs+0s) by main::RUNTIME at line 243, avg 4µs/call # 2 times (16µs+0s) by File::Path::mkpath at line 94 of File/Path.pm, avg 8µs/call # once (5µs+0s) by Getopt::Long::GetOptionsFromArray at line 474 of Getopt/Long.pm sub UNIVERSAL::isa; # xsub
					# spent 12µs within main::CORE:close which was called 2 times, avg 6µs/call: # once (8µs+0s) by main::RUNTIME at line 503 # once (4µs+0s) by main::RUNTIME at line 504 sub main::CORE:close; # opcode
					# spent 47µs within main::CORE:ftis which was called 2 times, avg 24µs/call: # 2 times (47µs+0s) by main::BEGIN@41 at line 46, avg 24µs/call sub main::CORE:ftis; # opcode
					# spent 24µs within main::CORE:match which was called 4 times, avg 6µs/call: # 2 times (13µs+0s) by main::RUNTIME at line 423, avg 7µs/call # 2 times (11µs+0s) by main::RUNTIME at line 449, avg 6µs/call sub main::CORE:match; # opcode
					# spent 142µs within main::CORE:pack which was called 24 times, avg 6µs/call: # 2 times (12µs+0s) by Net::DNS::Resolver::Base::BEGIN@33 at line 297 of IO/Socket/INET6.pm, avg 6µs/call # once (15µs+0s) by Net::DNS::RR::BEGIN@42 at line 50 of Net/DNS/Domain.pm # once (12µs+0s) by NetAddr::IP::BEGIN@8 at line 201 of NetAddr/IP/Lite.pm # once (9µs+0s) by Mail::SpamAssassin::PerMsgStatus::BEGIN@35 at line 319 of IO/Socket.pm # once (8µs+0s) by NetAddr::IP::Lite::BEGIN@18 at line 153 of NetAddr/IP/Util.pm # once (8µs+0s) by Net::DNS::Resolver::Base::BEGIN@1.1 at line 523 of IO/Socket/IP.pm # once (8µs+0s) by NetAddr::IP::Lite::BEGIN@18 at line 200 of NetAddr/IP/Util.pm # once (8µs+0s) by Net::DNS::RR::BEGIN@43 at line 72 of Net/DNS/DomainName.pm # once (7µs+0s) by NetAddr::IP::Lite::BEGIN@9 at line 256 of NetAddr/IP/InetBase.pm # once (7µs+0s) by NetAddr::IP::Lite::BEGIN@9 at line 244 of NetAddr/IP/InetBase.pm # once (6µs+0s) by Net::DNS::Resolver::Base::BEGIN@57 at line 763 of Net/DNS/Packet.pm # once (5µs+0s) by NetAddr::IP::BEGIN@8 at line 1420 of NetAddr/IP/Lite.pm # once (5µs+0s) by NetAddr::IP::BEGIN@8 at line 416 of NetAddr/IP/Lite.pm # once (4µs+0s) by Net::DNS::RR::OPT::CLIENT_SUBNET::BEGIN@240 at line 52 of Net/DNS/RR/A.pm # once (4µs+0s) by NetAddr::IP::BEGIN@8 at line 683 of NetAddr/IP/Lite.pm # once (4µs+0s) by NetAddr::IP::BEGIN@8 at line 206 of NetAddr/IP/Lite.pm # once (3µs+0s) by NetAddr::IP::BEGIN@8 at line 202 of NetAddr/IP/Lite.pm # once (3µs+0s) by NetAddr::IP::Lite::BEGIN@18 at line 201 of NetAddr/IP/Util.pm # once (3µs+0s) by Net::DNS::RR::BEGIN@43 at line 213 of Net/DNS/DomainName.pm # once (3µs+0s) by NetAddr::IP::BEGIN@8 at line 685 of NetAddr/IP/Lite.pm # once (3µs+0s) by NetAddr::IP::BEGIN@8 at line 684 of NetAddr/IP/Lite.pm # once (3µs+0s) by NetAddr::IP::Lite::BEGIN@9 at line 245 of NetAddr/IP/InetBase.pm # once (3µs+0s) by NetAddr::IP::BEGIN@8 at line 204 of NetAddr/IP/Lite.pm sub main::CORE:pack; # opcode
					# spent 32µs within main::CORE:print which was called: # once (32µs+0s) by main::RUNTIME at line 485 sub main::CORE:print; # opcode
					# spent 569µs within mro::method_changed_in which was called 147 times, avg 4µs/call: # 147 times (569µs+0s) by constant::import at line 198 of constant.pm, avg 4µs/call sub mro::method_changed_in; # xsub
					# spent 109µs within utf8::encode which was called 26 times, avg 4µs/call: # 24 times (98µs+0s) by base::__ANON__[/usr/local/lib/perl5/5.24/base.pm:77] at line 75 of base.pm, avg 4µs/call # once (8µs+0s) by Pod::Simple::LinkSection::BEGIN@9 at line 41 of Pod/Simple/BlackBox.pm # once (3µs+0s) by Encode::encode_utf8 at line 231 of Encode.pm sub utf8::encode; # xsub
					# spent 17.0ms within utf8::is_utf8 which was called 3936 times, avg 4µs/call: # 1968 times (9.24ms+0s) by Mail::SpamAssassin::DnsResolver::new_dns_packet at line 549 of Mail/SpamAssassin/DnsResolver.pm, avg 5µs/call # 1968 times (7.72ms+0s) by Mail::SpamAssassin::Util::decode_dns_question_entry at line 940 of Mail/SpamAssassin/Util.pm, avg 4µs/call sub utf8::is_utf8; # xsub