Filename | /usr/local/lib/perl5/site_perl/Mail/SpamAssassin/Plugin/AutoLearnThreshold.pm |
Statements | Executed 29 statements in 1.95ms |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
1 | 1 | 1 | 44µs | 246µs | new | Mail::SpamAssassin::Plugin::AutoLearnThreshold::
1 | 1 | 1 | 38µs | 38µs | BEGIN@54 | Mail::SpamAssassin::Plugin::AutoLearnThreshold::
1 | 1 | 1 | 37µs | 184µs | set_config | Mail::SpamAssassin::Plugin::AutoLearnThreshold::
1 | 1 | 1 | 26µs | 195µs | BEGIN@55 | Mail::SpamAssassin::Plugin::AutoLearnThreshold::
1 | 1 | 1 | 23µs | 36µs | BEGIN@56 | Mail::SpamAssassin::Plugin::AutoLearnThreshold::
1 | 1 | 1 | 22µs | 60µs | BEGIN@57 | Mail::SpamAssassin::Plugin::AutoLearnThreshold::
1 | 1 | 1 | 21µs | 28µs | BEGIN@58 | Mail::SpamAssassin::Plugin::AutoLearnThreshold::
1 | 1 | 1 | 21µs | 87µs | BEGIN@59 | Mail::SpamAssassin::Plugin::AutoLearnThreshold::
1 | 1 | 1 | 20µs | 85µs | BEGIN@61 | Mail::SpamAssassin::Plugin::AutoLearnThreshold::
0 | 0 | 0 | 0s | 0s | autolearn_discriminator | Mail::SpamAssassin::Plugin::AutoLearnThreshold::
Line | Statements |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | # <@LICENSE> | ||||
2 | # Licensed to the Apache Software Foundation (ASF) under one or more | ||||
3 | # contributor license agreements. See the NOTICE file distributed with | ||||
4 | # this work for additional information regarding copyright ownership. | ||||
5 | # The ASF licenses this file to you under the Apache License, Version 2.0 | ||||
6 | # (the "License"); you may not use this file except in compliance with | ||||
7 | # the License. You may obtain a copy of the License at: | ||||
8 | # | ||||
9 | # http://www.apache.org/licenses/LICENSE-2.0 | ||||
10 | # | ||||
11 | # Unless required by applicable law or agreed to in writing, software | ||||
12 | # distributed under the License is distributed on an "AS IS" BASIS, | ||||
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
14 | # See the License for the specific language governing permissions and | ||||
15 | # limitations under the License. | ||||
16 | # </@LICENSE> | ||||
17 | |||||
18 | =head1 NAME | ||||
19 | |||||
20 | Mail::SpamAssassin::Plugin::AutoLearnThreshold - threshold-based discriminator for Bayes auto-learning | ||||
21 | |||||
22 | =head1 SYNOPSIS | ||||
23 | |||||
24 | loadplugin Mail::SpamAssassin::Plugin::AutoLearnThreshold | ||||
25 | |||||
26 | =head1 DESCRIPTION | ||||
27 | |||||
28 | This plugin implements the threshold-based auto-learning discriminator | ||||
29 | for SpamAssassin's Bayes subsystem. Auto-learning is a mechanism | ||||
30 | whereby high-scoring mails (or low-scoring mails, for non-spam) are fed | ||||
31 | into its learning systems without user intervention, during scanning. | ||||
32 | |||||
33 | Note that certain tests are ignored when determining whether a message | ||||
34 | should be trained upon: | ||||
35 | |||||
36 | =over 4 | ||||
37 | |||||
38 | =item * rules with tflags set to 'learn' (the Bayesian rules) | ||||
39 | |||||
40 | =item * rules with tflags set to 'userconf' (user configuration) | ||||
41 | |||||
42 | =item * rules with tflags set to 'noautolearn' | ||||
43 | |||||
44 | =back | ||||
45 | |||||
46 | Also note that auto-learning occurs using scores from either scoreset 0 | ||||
47 | or 1, depending on what scoreset is used during message check. It is | ||||
48 | likely that the message check and auto-learn scores will be different. | ||||
49 | |||||
50 | =cut | ||||
51 | |||||
52 | package Mail::SpamAssassin::Plugin::AutoLearnThreshold; | ||||
53 | |||||
54 | 2 | 62µs | 1 | 38µs | # spent 38µs within Mail::SpamAssassin::Plugin::AutoLearnThreshold::BEGIN@54 which was called:
# once (38µs+0s) by Mail::SpamAssassin::PluginHandler::load_plugin at line 54 # spent 38µs making 1 call to Mail::SpamAssassin::Plugin::AutoLearnThreshold::BEGIN@54 |
55 | 2 | 59µs | 2 | 365µs | # spent 195µs (26+169) within Mail::SpamAssassin::Plugin::AutoLearnThreshold::BEGIN@55 which was called:
# once (26µs+169µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 55 # spent 195µs making 1 call to Mail::SpamAssassin::Plugin::AutoLearnThreshold::BEGIN@55
# spent 169µs making 1 call to Exporter::import |
56 | 2 | 58µs | 2 | 49µs | # spent 36µs (23+13) within Mail::SpamAssassin::Plugin::AutoLearnThreshold::BEGIN@56 which was called:
# once (23µs+13µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 56 # spent 36µs making 1 call to Mail::SpamAssassin::Plugin::AutoLearnThreshold::BEGIN@56
# spent 13µs making 1 call to strict::import |
57 | 2 | 57µs | 2 | 99µs | # spent 60µs (22+38) within Mail::SpamAssassin::Plugin::AutoLearnThreshold::BEGIN@57 which was called:
# once (22µs+38µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 57 # spent 60µs making 1 call to Mail::SpamAssassin::Plugin::AutoLearnThreshold::BEGIN@57
# spent 38µs making 1 call to warnings::import |
58 | 2 | 57µs | 2 | 35µs | # spent 28µs (21+7) within Mail::SpamAssassin::Plugin::AutoLearnThreshold::BEGIN@58 which was called:
# once (21µs+7µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 58 # spent 28µs making 1 call to Mail::SpamAssassin::Plugin::AutoLearnThreshold::BEGIN@58
# spent 7µs making 1 call to bytes::import |
59 | 2 | 60µs | 2 | 154µs | # spent 87µs (21+67) within Mail::SpamAssassin::Plugin::AutoLearnThreshold::BEGIN@59 which was called:
# once (21µs+67µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 59 # spent 87µs making 1 call to Mail::SpamAssassin::Plugin::AutoLearnThreshold::BEGIN@59
# spent 67µs making 1 call to re::import |
60 | |||||
61 | 2 | 1.50ms | 2 | 150µs | # spent 85µs (20+65) within Mail::SpamAssassin::Plugin::AutoLearnThreshold::BEGIN@61 which was called:
# once (20µs+65µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 61 # spent 85µs making 1 call to Mail::SpamAssassin::Plugin::AutoLearnThreshold::BEGIN@61
# spent 65µs making 1 call to vars::import |
62 | 1 | 15µs | @ISA = qw(Mail::SpamAssassin::Plugin); | ||
63 | |||||
64 | # spent 246µs (44+202) within Mail::SpamAssassin::Plugin::AutoLearnThreshold::new which was called:
# once (44µs+202µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 1 of (eval 73)[Mail/SpamAssassin/PluginHandler.pm:129] | ||||
65 | 1 | 2µs | my $class = shift; | ||
66 | 1 | 2µs | my $mailsaobject = shift; | ||
67 | |||||
68 | 1 | 2µs | $class = ref($class) || $class; | ||
69 | 1 | 10µs | 1 | 19µs | my $self = $class->SUPER::new($mailsaobject); # spent 19µs making 1 call to Mail::SpamAssassin::Plugin::new |
70 | 1 | 2µs | bless ($self, $class); | ||
71 | |||||
72 | 1 | 8µs | 1 | 184µs | $self->set_config($mailsaobject->{conf}); # spent 184µs making 1 call to Mail::SpamAssassin::Plugin::AutoLearnThreshold::set_config |
73 | |||||
74 | 1 | 10µs | return $self; | ||
75 | } | ||||
76 | |||||
77 | # spent 184µs (37+147) within Mail::SpamAssassin::Plugin::AutoLearnThreshold::set_config which was called:
# once (37µs+147µs) by Mail::SpamAssassin::Plugin::AutoLearnThreshold::new at line 72 | ||||
78 | 1 | 2µs | my($self, $conf) = @_; | ||
79 | 1 | 2µs | my @cmds; | ||
80 | |||||
81 | =head1 USER OPTIONS | ||||
82 | |||||
83 | The following configuration settings are used to control auto-learning: | ||||
84 | |||||
85 | =over 4 | ||||
86 | |||||
87 | =item bayes_auto_learn_threshold_nonspam n.nn (default: 0.1) | ||||
88 | |||||
89 | The score threshold below which a mail has to score, to be fed into | ||||
90 | SpamAssassin's learning systems automatically as a non-spam message. | ||||
91 | |||||
92 | =cut | ||||
93 | |||||
94 | 1 | 6µs | push (@cmds, { | ||
95 | setting => 'bayes_auto_learn_threshold_nonspam', | ||||
96 | default => 0.1, | ||||
97 | type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC | ||||
98 | }); | ||||
99 | |||||
100 | =item bayes_auto_learn_threshold_spam n.nn (default: 12.0) | ||||
101 | |||||
102 | The score threshold above which a mail has to score, to be fed into | ||||
103 | SpamAssassin's learning systems automatically as a spam message. | ||||
104 | |||||
105 | Note: SpamAssassin requires at least 3 points from the header, and 3 | ||||
106 | points from the body to auto-learn as spam. Therefore, the minimum | ||||
107 | working value for this option is 6. | ||||
108 | |||||
109 | If the test option autolearn_force is set, the minimum value will | ||||
110 | remain at 6 points but there is no requirement that the points come | ||||
111 | from body and header rules. This option is useful for autolearning | ||||
112 | with rules that are considered to be extremely safe indicators of | ||||
113 | the spaminess of a message. | ||||
114 | |||||
115 | =cut | ||||
116 | |||||
117 | 1 | 4µs | push (@cmds, { | ||
118 | setting => 'bayes_auto_learn_threshold_spam', | ||||
119 | default => 12.0, | ||||
120 | type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC | ||||
121 | }); | ||||
122 | |||||
123 | =item bayes_auto_learn_on_error (0 | 1) (default: 0) | ||||
124 | |||||
125 | With C<bayes_auto_learn_on_error> off, autolearning will be performed | ||||
126 | even if bayes classifier already agrees with the new classification (i.e. | ||||
127 | yielded BAYES_00 for what we are now trying to teach it as ham, or yielded | ||||
128 | BAYES_99 for spam). This is the traditional setting; the default was chosen | ||||
129 | to retain backward compatibility. | ||||
130 | |||||
131 | With C<bayes_auto_learn_on_error> turned on, autolearning will be performed | ||||
132 | only when a bayes classifier had a different opinion from what the autolearner | ||||
133 | is now trying to teach it (i.e. it made an error in judgement). This strategy | ||||
134 | may or may not produce better future classifications, but usually works | ||||
135 | very well, while also preventing unnecessary overlearning and slowing down | ||||
136 | database growth. | ||||
137 | |||||
138 | =cut | ||||
139 | |||||
140 | 1 | 6µs | push (@cmds, { | ||
141 | setting => 'bayes_auto_learn_on_error', | ||||
142 | default => 0, | ||||
143 | type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL | ||||
144 | }); | ||||
145 | |||||
146 | 1 | 16µs | 1 | 147µs | $conf->{parser}->register_commands(\@cmds); # spent 147µs making 1 call to Mail::SpamAssassin::Conf::Parser::register_commands |
147 | } | ||||
148 | |||||
# autolearn_discriminator($self, $params)
#
# Decides whether the message currently being scanned should be fed to the
# learner automatically, and if so whether as spam or as ham.
#
# Parameters:
#   $params->{permsgstatus}  the per-message scan object; its {conf} hash
#                            supplies bayes_auto_learn_threshold_nonspam,
#                            bayes_auto_learn_threshold_spam and
#                            bayes_auto_learn_on_error.
#
# Returns:
#   nothing (bare return) when the message must not be auto-learned;
#   otherwise an array reference
#     [ $isspam, $force_autolearn, $force_autolearn_names ]
#   where $isspam is 1 to learn as spam and 0 to learn as ham.
sub autolearn_discriminator {
  my ($self, $params) = @_;

  my $scan = $params->{permsgstatus};
  my $conf = $scan->{conf};

  # Figure out min/max for autolearning.
  # Default to specified auto_learn_threshold settings
  my $min = $conf->{bayes_auto_learn_threshold_nonspam};
  my $max = $conf->{bayes_auto_learn_threshold_spam};

  # Find out what score we should consider this message to have ...
  my $score = $scan->get_autolearn_points();
  my $body_only_points = $scan->get_body_only_points();
  my $head_only_points = $scan->get_head_only_points();
  my $learned_points = $scan->get_learned_points();

  # find out if any of the tests added an autolearn_force status
  my $force_autolearn = $scan->get_autolearn_force_status();
  my $force_autolearn_names = $scan->get_autolearn_force_names();

  dbg("learn: auto-learn? ham=$min, spam=$max, ".
      "body-points=".$body_only_points.", ".
      "head-points=".$head_only_points.", ".
      "learned-points=".$learned_points);

  # Classify by threshold: strictly below $min is ham, at or above $max is
  # spam, anything in between is left alone.
  my $isspam;
  if ($score < $min) {
    $isspam = 0;
  } elsif ($score >= $max) {
    $isspam = 1;
  } else {
    dbg("learn: auto-learn? no: inside auto-learn thresholds, not considered ham or spam");
    return;
  }

  # Limits on the learner's own contribution: beyond these the learner is
  # taken to disagree with the classification chosen above.
  my $learner_said_ham_points = -1.0;
  my $learner_said_spam_points = 1.0;

  if ($isspam) {
    # By default a spam candidate needs points from both body and header rules.
    my $required_body_points = 3;
    my $required_head_points = 3;

    #Set a lower threshold of "just has to be spam" if autolearn_force was set on a rule
    if ($force_autolearn) {
      # Drop the separate body/header floors to -99 for forced rules.
      $required_body_points = -99;
      $required_head_points = -99;
      dbg("learn: auto-learn: autolearn_force flagged for a rule. Removing seperate body and head point threshold.  Body Only Points: $body_only_points ($required_body_points req'd) / Head Only Points: $head_only_points ($required_head_points req'd)");
      dbg("learn: auto-learn: autolearn_force flagged because of rule(s): $force_autolearn_names");
    } else {
      dbg("learn: auto-learn: autolearn_force not flagged for a rule. Body Only Points: $body_only_points ($required_body_points req'd) / Head Only Points: $head_only_points ($required_head_points req'd)");
    }

    if ($body_only_points < $required_body_points) {
      dbg("learn: auto-learn? no: scored as spam but too few body points (".
          $body_only_points." < ".$required_body_points.")");
      return;
    }
    if ($head_only_points < $required_head_points) {
      dbg("learn: auto-learn? no: scored as spam but too few head points (".
          $head_only_points." < ".$required_head_points.")");
      return;
    }
    if ($learned_points < $learner_said_ham_points) {
      dbg("learn: auto-learn? no: scored as spam but learner indicated ham (".
          $learned_points." < ".$learner_said_ham_points.")");
      return;
    }

    # The auto-learn score and the scan verdict can differ; refuse to learn
    # as spam a message that the scan itself did not mark as spam.
    if (!$scan->is_spam()) {
      dbg("learn: auto-learn? no: scored as ham but autolearn wanted spam");
      return;
    }

  } else {
    if ($learned_points > $learner_said_spam_points) {
      dbg("learn: auto-learn? no: scored as ham but learner indicated spam (".
          $learned_points." > ".$learner_said_spam_points.")");
      return;
    }

    # Mirror of the check above: never learn as ham what the scan marked spam.
    if ($scan->is_spam()) {
      dbg("learn: auto-learn? no: scored as spam but autolearn wanted ham");
      return;
    }
  }

  if ($conf->{bayes_auto_learn_on_error}) {
    # learn-on-error strategy chosen:
    # only allow learning if the autolearning classifier was unsure or
    # had a different opinion from what we are trying to make it learn
    #
    my $tests = $scan->get_tag('TESTS');
    if (defined $tests && $tests ne 'none') {
      # Turn the comma-separated list of hit rule names into a lookup set.
      my %t = map { ($_,1) } split(/,/, $tests);
      if ($isspam && $t{'BAYES_99'} || !$isspam && $t{'BAYES_00'}) {
        dbg("learn: auto-learn? no: learn-on-error, %s, already classified ".
            "as such",  $isspam ? 'spam' : 'ham');
        return;
      }
    }
  }

  # Report the comparison that was actually applied: spam is selected with
  # ">=" above, so the message must not claim a strict ">".
  dbg("learn: auto-learn? yes, ".($isspam?"spam ($score >= $max)":"ham ($score < $min)")." autolearn_force=".($force_autolearn?"yes":"no"));

  #Return an array reference because call_plugins only carries one return value
  return [$isspam, $force_autolearn, $force_autolearn_names];
}
257 | |||||
258 | 1 | 8µs | 1; | ||
259 | |||||
260 | =back | ||||
261 | |||||
262 | =cut |