Filename | /usr/local/lib/perl5/site_perl/Mail/SpamAssassin/BayesStore.pm |
Statements | Executed 2136 statements in 17.1ms |
Calls | P | F | Exclusive Time | Inclusive Time | Subroutine |
---|---|---|---|---|---|
236 | 2 | 1 | 9.77ms | 12.1ms | read_db_configs | Mail::SpamAssassin::BayesStore::
234 | 2 | 1 | 2.34ms | 2.34ms | DB_VERSION | Mail::SpamAssassin::BayesStore::
1 | 1 | 1 | 56µs | 64µs | BEGIN@31 | Mail::SpamAssassin::BayesStore::
1 | 1 | 1 | 25µs | 153µs | BEGIN@35 | Mail::SpamAssassin::BayesStore::
1 | 1 | 1 | 24µs | 29µs | BEGIN@33 | Mail::SpamAssassin::BayesStore::
1 | 1 | 1 | 21µs | 44µs | BEGIN@32 | Mail::SpamAssassin::BayesStore::
1 | 1 | 1 | 21µs | 21µs | new | Mail::SpamAssassin::BayesStore::
1 | 1 | 1 | 19µs | 68µs | BEGIN@34 | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | backup_database | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | calculate_expire_delta | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | cleanup | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | clear_database | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | db_readable | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | db_writable | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | dump_db_toks | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | expire_old_tokens | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | expire_old_tokens_trapped | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | expiry_due | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | get_magic_re | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | get_running_expire_tok | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | get_storage_variables | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | multi_tok_count_change | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | nspam_nham_change | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | nspam_nham_get | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | perform_upgrade | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | remove_running_expire_tok | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | restore_database | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | sa_die | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | seen_delete | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | seen_get | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | seen_put | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | set_last_expire | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | set_running_expire_tok | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | sync | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | sync_due | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | tie_db_readonly | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | tie_db_writable | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | tok_count_change | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | tok_get | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | tok_get_all | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | tok_touch | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | tok_touch_all | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | token_expiration | Mail::SpamAssassin::BayesStore::
0 | 0 | 0 | 0s | 0s | untie_db | Mail::SpamAssassin::BayesStore::
Line | Statements | Time on line | Calls | Time in subs | Code |
---|---|---|---|---|---|
1 | # <@LICENSE> | ||||
2 | # Licensed to the Apache Software Foundation (ASF) under one or more | ||||
3 | # contributor license agreements. See the NOTICE file distributed with | ||||
4 | # this work for additional information regarding copyright ownership. | ||||
5 | # The ASF licenses this file to you under the Apache License, Version 2.0 | ||||
6 | # (the "License"); you may not use this file except in compliance with | ||||
7 | # the License. You may obtain a copy of the License at: | ||||
8 | # | ||||
9 | # http://www.apache.org/licenses/LICENSE-2.0 | ||||
10 | # | ||||
11 | # Unless required by applicable law or agreed to in writing, software | ||||
12 | # distributed under the License is distributed on an "AS IS" BASIS, | ||||
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
14 | # See the License for the specific language governing permissions and | ||||
15 | # limitations under the License. | ||||
16 | # </@LICENSE> | ||||
17 | |||||
18 | =head1 NAME | ||||
19 | |||||
20 | Mail::SpamAssassin::BayesStore - Storage Module for default Bayes classifier | ||||
21 | |||||
22 | =head1 DESCRIPTION | ||||
23 | |||||
24 | This is the public API for the Bayesian store methods. Any implementation of | ||||
25 | the storage module for the default Bayes classifier must implement these methods. | ||||
26 | |||||
27 | =cut | ||||
28 | |||||
29 | package Mail::SpamAssassin::BayesStore; | ||||
30 | |||||
31 | 2 | 60µs | 2 | 73µs | # spent 64µs (56+8) within Mail::SpamAssassin::BayesStore::BEGIN@31 which was called:
# once (56µs+8µs) by Mail::SpamAssassin::BayesStore::DBM::BEGIN@38 at line 31 # spent 64µs making 1 call to Mail::SpamAssassin::BayesStore::BEGIN@31
# spent 8µs making 1 call to strict::import |
32 | 2 | 53µs | 2 | 68µs | # spent 44µs (21+23) within Mail::SpamAssassin::BayesStore::BEGIN@32 which was called:
# once (21µs+23µs) by Mail::SpamAssassin::BayesStore::DBM::BEGIN@38 at line 32 # spent 44µs making 1 call to Mail::SpamAssassin::BayesStore::BEGIN@32
# spent 23µs making 1 call to warnings::import |
33 | 2 | 54µs | 2 | 34µs | # spent 29µs (24+5) within Mail::SpamAssassin::BayesStore::BEGIN@33 which was called:
# once (24µs+5µs) by Mail::SpamAssassin::BayesStore::DBM::BEGIN@38 at line 33 # spent 29µs making 1 call to Mail::SpamAssassin::BayesStore::BEGIN@33
# spent 5µs making 1 call to bytes::import |
34 | 2 | 58µs | 2 | 116µs | # spent 68µs (19+49) within Mail::SpamAssassin::BayesStore::BEGIN@34 which was called:
# once (19µs+49µs) by Mail::SpamAssassin::BayesStore::DBM::BEGIN@38 at line 34 # spent 68µs making 1 call to Mail::SpamAssassin::BayesStore::BEGIN@34
# spent 49µs making 1 call to re::import |
35 | 2 | 4.73ms | 2 | 282µs | # spent 153µs (25+128) within Mail::SpamAssassin::BayesStore::BEGIN@35 which was called:
# once (25µs+128µs) by Mail::SpamAssassin::BayesStore::DBM::BEGIN@38 at line 35 # spent 153µs making 1 call to Mail::SpamAssassin::BayesStore::BEGIN@35
# spent 128µs making 1 call to Exporter::import |
36 | |||||
37 | # TODO: if we ever get tuits, it'd be good to make these POD | ||||
38 | # method docs more perlish... hardly a biggie. | ||||
39 | |||||
40 | =head1 METHODS | ||||
41 | |||||
42 | =over 4 | ||||
43 | |||||
44 | =item new | ||||
45 | |||||
46 | public class (Mail::SpamAssassin::BayesStore) new (Mail::SpamAssassin::Plugin::Bayes $bayes) | ||||
47 | |||||
48 | Description: | ||||
49 | This method creates a new instance of the Mail::SpamAssassin::BayesStore | ||||
50 | object. You must pass in an instance of the Mail::SpamAssassin::Plugin::Bayes | ||||
51 | object, which is stashed for use throughout the module. | ||||
52 | |||||
53 | =cut | ||||
54 | |||||
55 | # spent 21µs within Mail::SpamAssassin::BayesStore::new which was called:
# once (21µs+0s) by Mail::SpamAssassin::BayesStore::DBM::new at line 125 of Mail/SpamAssassin/BayesStore/DBM.pm | ||||
56 | 1 | 3µs | my ($class, $bayes) = @_; | ||
57 | |||||
58 | 1 | 2µs | $class = ref($class) || $class; | ||
59 | |||||
60 | 1 | 6µs | my $self = { | ||
61 | 'bayes' => $bayes, | ||||
62 | 'supported_db_version' => 0, | ||||
63 | 'db_version' => undef, | ||||
64 | }; | ||||
65 | |||||
66 | 1 | 3µs | bless ($self, $class); | ||
67 | |||||
68 | 1 | 14µs | $self; | ||
69 | } | ||||
70 | |||||
71 | =item DB_VERSION | ||||
72 | |||||
73 | public instance (Integer) DB_VERSION () | ||||
74 | |||||
75 | Description: | ||||
76 | This method returns the currently supported database version for the | ||||
77 | implementation. | ||||
78 | |||||
79 | =cut | ||||
80 | |||||
81 | # spent 2.34ms within Mail::SpamAssassin::BayesStore::DB_VERSION which was called 234 times, avg 10µs/call:
# 233 times (2.33ms+0s) by Mail::SpamAssassin::BayesStore::DBM::_check_db_version at line 352 of Mail/SpamAssassin/BayesStore/DBM.pm, avg 10µs/call
# once (9µs+0s) by Mail::SpamAssassin::BayesStore::DBM::tie_db_writable at line 320 of Mail/SpamAssassin/BayesStore/DBM.pm | ||||
82 | 234 | 561µs | my ($self) = @_; | ||
83 | 234 | 2.08ms | return $self->{supported_db_version}; | ||
84 | } | ||||
85 | |||||
86 | =item read_db_configs | ||||
87 | |||||
88 | public instance () read_db_configs () | ||||
89 | |||||
90 | Description: | ||||
91 | This method reads any needed config variables from the configuration object | ||||
92 | and then calls the Mail::SpamAssassin::Plugin::Bayes read_db_configs method. | ||||
93 | |||||
94 | =cut | ||||
95 | |||||
96 | # spent 12.1ms (9.77+2.35) within Mail::SpamAssassin::BayesStore::read_db_configs which was called 236 times, avg 51µs/call:
# 235 times (9.74ms+2.33ms) by Mail::SpamAssassin::BayesStore::DBM::tie_db_readonly at line 156 of Mail/SpamAssassin/BayesStore/DBM.pm, avg 51µs/call
# once (33µs+14µs) by Mail::SpamAssassin::BayesStore::DBM::tie_db_writable at line 254 of Mail/SpamAssassin/BayesStore/DBM.pm | ||||
97 | 236 | 563µs | my ($self) = @_; | ||
98 | |||||
99 | # TODO: at some stage, this may be useful to read config items which | ||||
100 | # control database bloat, like | ||||
101 | # | ||||
102 | # - use of hapaxes | ||||
103 | # - use of case-sensitivity | ||||
104 | # - more midrange-hapax-avoidance tactics when parsing headers (future) | ||||
105 | # | ||||
106 | # for now, we just set these settings statically. | ||||
107 | 236 | 976µs | my $conf = $self->{bayes}->{main}->{conf}; | ||
108 | |||||
109 | # Minimum desired database size? Expiry will not shrink the | ||||
110 | # database below this number of entries. 100k entries is roughly | ||||
111 | # equivalent to a 5Mb database file. | ||||
112 | 236 | 966µs | $self->{expiry_max_db_size} = $conf->{bayes_expiry_max_db_size}; | ||
113 | 236 | 1.19ms | $self->{expiry_pct} = $conf->{bayes_expiry_pct}; | ||
114 | 236 | 975µs | $self->{expiry_period} = $conf->{bayes_expiry_period}; | ||
115 | 236 | 832µs | $self->{expiry_max_exponent} = $conf->{bayes_expiry_max_exponent}; | ||
116 | |||||
117 | 236 | 3.92ms | 236 | 2.35ms | $self->{bayes}->read_db_configs(); # spent 2.35ms making 236 calls to Mail::SpamAssassin::Plugin::Bayes::read_db_configs, avg 10µs/call |
118 | } | ||||
119 | |||||
120 | =item prefork_init | ||||
121 | |||||
122 | public instance (Boolean) prefork_init () | ||||
123 | |||||
124 | Description: | ||||
125 | This optional method is called in the parent process shortly before | ||||
126 | forking off child processes. | ||||
127 | |||||
128 | =cut | ||||
129 | |||||
130 | # sub prefork_init { | ||||
131 | # my ($self) = @_; | ||||
132 | # } | ||||
133 | |||||
134 | =item spamd_child_init | ||||
135 | |||||
136 | public instance (Boolean) spamd_child_init () | ||||
137 | |||||
138 | Description: | ||||
139 | This optional method is called in a child process shortly after being spawned. | ||||
140 | |||||
141 | =cut | ||||
142 | |||||
143 | # sub spamd_child_init { | ||||
144 | # my ($self) = @_; | ||||
145 | # } | ||||
146 | |||||
147 | =item tie_db_readonly | ||||
148 | |||||
149 | public instance (Boolean) tie_db_readonly () | ||||
150 | |||||
151 | Description: | ||||
152 | This method opens up the database in readonly mode. | ||||
153 | |||||
154 | =cut | ||||
155 | |||||
156 | sub tie_db_readonly { | ||||
157 | my ($self) = @_; | ||||
158 | die "bayes: tie_db_readonly: not implemented\n"; | ||||
159 | } | ||||
160 | |||||
161 | =item tie_db_writable | ||||
162 | |||||
163 | public instance (Boolean) tie_db_writable () | ||||
164 | |||||
165 | Description: | ||||
166 | This method opens up the database in writable mode. | ||||
167 | |||||
168 | Any callers of this method should ensure that they call untie_db() | ||||
169 | afterwards. | ||||
170 | |||||
171 | =cut | ||||
172 | |||||
173 | sub tie_db_writable { | ||||
174 | my ($self) = @_; | ||||
175 | die "bayes: tie_db_writable: not implemented\n"; | ||||
176 | } | ||||
177 | |||||
178 | =item untie_db | ||||
179 | |||||
180 | public instance () untie_db () | ||||
181 | |||||
182 | Description: | ||||
183 | This method unties the database. | ||||
184 | |||||
185 | =cut | ||||
186 | |||||
187 | sub untie_db { | ||||
188 | my $self = shift; | ||||
189 | die "bayes: untie_db: not implemented\n"; | ||||
190 | } | ||||
191 | |||||
192 | =item calculate_expire_delta | ||||
193 | |||||
194 | public instance (%) calculate_expire_delta (Integer $newest_atime, | ||||
195 | Integer $start, | ||||
196 | Integer $max_expire_mult) | ||||
197 | |||||
198 | Description: | ||||
199 | This method performs a calculation on the data to determine the optimum | ||||
200 | atime for token expiration. | ||||
201 | |||||
202 | =cut | ||||
203 | |||||
204 | sub calculate_expire_delta { | ||||
205 | my ($self, $newest_atime, $start, $max_expire_mult) = @_; | ||||
206 | die "bayes: calculate_expire_delta: not implemented\n"; | ||||
207 | } | ||||
208 | |||||
209 | =item token_expiration | ||||
210 | |||||
211 | public instance (Integer, Integer, | ||||
212 | Integer, Integer) token_expiration(\% $opts, | ||||
213 | Integer $newest_atime, | ||||
214 | Integer $newdelta) | ||||
215 | |||||
216 | Description: | ||||
217 | This method performs the database specific expiration of tokens based on | ||||
218 | the passed in C<$newest_atime> and C<$newdelta>. | ||||
219 | |||||
220 | =cut | ||||
221 | |||||
222 | sub token_expiration { | ||||
223 | my ($self, $opts, $newest_atime, $newdelta) = @_; | ||||
224 | die "bayes: token_expiration: not implemented\n"; | ||||
225 | } | ||||
226 | |||||
227 | =item expire_old_tokens | ||||
228 | |||||
229 | public instance (Boolean) expire_old_tokens (\% hashref) | ||||
230 | |||||
231 | Description: | ||||
232 | This method expires old tokens from the database. | ||||
233 | |||||
234 | =cut | ||||
235 | |||||
236 | sub expire_old_tokens { | ||||
237 | my ($self, $opts) = @_; | ||||
238 | my $ret; | ||||
239 | |||||
240 | my $eval_stat; | ||||
241 | eval { | ||||
242 | local $SIG{'__DIE__'}; # do not run user die() traps in here | ||||
243 | if ($self->tie_db_writable()) { | ||||
244 | $ret = $self->expire_old_tokens_trapped ($opts); | ||||
245 | } | ||||
246 | 1; | ||||
247 | } or do { | ||||
248 | $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; | ||||
249 | }; | ||||
250 | |||||
251 | if (!$self->{bayes}->{main}->{learn_caller_will_untie}) { | ||||
252 | $self->untie_db(); | ||||
253 | } | ||||
254 | |||||
255 | if (defined $eval_stat) { # if we died, untie the dbs. | ||||
256 | warn "bayes: expire_old_tokens: $eval_stat\n"; | ||||
257 | return 0; | ||||
258 | } | ||||
259 | $ret; | ||||
260 | } | ||||
261 | |||||
262 | =item expire_old_tokens_trapped | ||||
263 | |||||
264 | public instance (Boolean) expire_old_tokens_trapped (\% $opts) | ||||
265 | |||||
266 | Description: | ||||
267 | This method does the actual token expiration. | ||||
268 | |||||
269 | XXX More docs here about the methodology and what not | ||||
270 | |||||
271 | =cut | ||||
272 | |||||
273 | sub expire_old_tokens_trapped { | ||||
274 | my ($self, $opts) = @_; | ||||
275 | |||||
276 | # Flag that we're doing work | ||||
277 | $self->set_running_expire_tok(); | ||||
278 | |||||
279 | # We don't need to do an expire, so why were we called? Oh well. | ||||
280 | if (!$self->expiry_due()) { | ||||
281 | $self->remove_running_expire_tok(); | ||||
282 | return 0; | ||||
283 | } | ||||
284 | |||||
285 | my $started = time(); | ||||
286 | my @vars = $self->get_storage_variables(); | ||||
287 | |||||
288 | if ( $vars[10] > time ) { | ||||
289 | dbg("bayes: expiry found newest atime in the future, resetting to current time"); | ||||
290 | $vars[10] = time; | ||||
291 | } | ||||
292 | |||||
293 | # How many tokens do we want to keep? | ||||
294 | my $goal_reduction = int($self->{expiry_max_db_size} * $self->{expiry_pct}); | ||||
295 | dbg("bayes: expiry check keep size, ".$self->{expiry_pct}." * max: $goal_reduction"); | ||||
296 | # Make sure we keep at least 100000 tokens in the DB | ||||
297 | if ( $goal_reduction < 100000 ) { | ||||
298 | $goal_reduction = 100000; | ||||
299 | dbg("bayes: expiry keep size too small, resetting to 100,000 tokens"); | ||||
300 | } | ||||
301 | # Now turn goal_reduction into how many to expire. | ||||
302 | $goal_reduction = $vars[3] - $goal_reduction; | ||||
303 | dbg("bayes: token count: ".$vars[3].", final goal reduction size: $goal_reduction"); | ||||
304 | |||||
305 | if ( $goal_reduction < 1000 ) { # too few tokens to expire, abort. | ||||
306 | dbg("bayes: reduction goal of $goal_reduction is under 1,000 tokens, skipping expire"); | ||||
307 | $self->set_last_expire(time()); | ||||
308 | $self->remove_running_expire_tok(); # this won't be cleaned up, so do it now. | ||||
309 | return 1; # we want to indicate things ran as expected | ||||
310 | } | ||||
311 | |||||
312 | # Estimate new atime delta based on the last atime delta | ||||
313 | my $newdelta = 0; | ||||
314 | if ( $vars[9] > 0 ) { | ||||
315 | # newdelta = olddelta * old / goal; | ||||
316 | # this may seem backwards, but since we're talking delta here, | ||||
317 | # not actual atime, we want smaller atimes to expire more tokens, | ||||
318 | # and vice versa. | ||||
319 | # | ||||
320 | $newdelta = int($vars[8] * $vars[9] / $goal_reduction); | ||||
321 | } | ||||
322 | |||||
323 | # Calculate size difference between last expiration token removal | ||||
324 | # count and the current goal removal count. | ||||
325 | my $ratio = ($vars[9] == 0 || $vars[9] > $goal_reduction) ? $vars[9]/$goal_reduction : $goal_reduction/$vars[9]; | ||||
326 | |||||
327 | dbg("bayes: first pass? current: ".time().", Last: ".$vars[4].", atime: ".$vars[8].", count: ".$vars[9].", newdelta: $newdelta, ratio: $ratio, period: ".$self->{expiry_period}); | ||||
328 | |||||
329 | ## ESTIMATION PHASE | ||||
330 | # | ||||
331 | # Do this for the first expire or "odd" looking results cause a first pass to determine atime: | ||||
332 | # | ||||
333 | # - last expire was more than 30 days ago | ||||
334 | # assume mail flow stays roughly the same month to month, recompute if it's > 1 month | ||||
335 | # - last atime delta was under expiry period | ||||
336 | # if we're expiring often max_db_size should go up, but let's recompute just to check | ||||
337 | # - last reduction count was < 1000 tokens | ||||
338 | # ditto | ||||
339 | # - new estimated atime delta is under expiry period | ||||
340 | # ditto | ||||
341 | # - difference of last reduction to current goal reduction is > 50% | ||||
342 | # if the two values are out of balance, estimating atime is going to be funky, recompute | ||||
343 | # | ||||
344 | if ( (time() - $vars[4] > 86400*30) || ($vars[8] < $self->{expiry_period}) || ($vars[9] < 1000) | ||||
345 | || ($newdelta < $self->{expiry_period}) || ($ratio > 1.5) ) { | ||||
346 | dbg("bayes: can't use estimation method for expiry, unexpected result, calculating optimal atime delta (first pass)"); | ||||
347 | |||||
348 | my $start = $self->{expiry_period}; # exponential search starting at ...? 1/2 day, 1, 2, 4, 8, 16, ... | ||||
349 | my $max_expire_mult = 2**$self->{expiry_max_exponent}; # $max_expire_mult * $start = max expire time (256 days), power of 2. | ||||
350 | |||||
351 | dbg("bayes: expiry max exponent: ".$self->{expiry_max_exponent}); | ||||
352 | |||||
353 | my %delta = $self->calculate_expire_delta($vars[10], $start, $max_expire_mult); | ||||
354 | |||||
355 | return 0 unless (%delta); | ||||
356 | |||||
357 | # This will skip the for loop if debugging isn't enabled ... | ||||
358 | if (would_log('dbg', 'bayes')) { | ||||
359 | dbg("bayes: atime\ttoken reduction"); | ||||
360 | dbg("bayes: ========\t==============="); | ||||
361 | for(my $i = 1; $i<=$max_expire_mult; $i <<= 1) { | ||||
362 | dbg("bayes: ".$start*$i."\t".(exists $delta{$i} ? $delta{$i} : 0)); | ||||
363 | } | ||||
364 | } | ||||
365 | |||||
366 | # Now figure out which max_expire_mult value gives the closest results to goal_reduction, without | ||||
367 | # going over ... Go from the largest delta backwards so the reduction size increases | ||||
368 | # (tokens that expire at 4 also expire at 3, 2, and 1, so 1 will always be the largest expiry...) | ||||
369 | # | ||||
370 | for( ; $max_expire_mult > 0; $max_expire_mult>>=1 ) { | ||||
371 | next unless exists $delta{$max_expire_mult}; | ||||
372 | if ($delta{$max_expire_mult} > $goal_reduction) { | ||||
373 | $max_expire_mult<<=1; # the max expire is actually the next power of 2 out | ||||
374 | last; | ||||
375 | } | ||||
376 | } | ||||
377 | |||||
378 | # if max_expire_mult gets to 0, either we can't expire anything, or 1 is <= $goal_reduction | ||||
379 | $max_expire_mult ||= 1; | ||||
380 | |||||
381 | # $max_expire_mult is now equal to the value we should use ... | ||||
382 | # Check to see if the atime value we found is really good. | ||||
383 | # It's not good if: | ||||
384 | # - $max_expire_mult would not expire any tokens. This means that the majority of | ||||
385 | # tokens are old or new, and more activity is required before an expiry can occur. | ||||
386 | # - reduction count < 1000, not enough tokens to be worth doing an expire. | ||||
387 | # | ||||
388 | if ( !exists $delta{$max_expire_mult} || $delta{$max_expire_mult} < 1000 ) { | ||||
389 | dbg("bayes: couldn't find a good delta atime, need more token difference, skipping expire"); | ||||
390 | $self->set_last_expire(time()); | ||||
391 | $self->remove_running_expire_tok(); # this won't be cleaned up, so do it now. | ||||
392 | return 1; # we want to indicate things ran as expected | ||||
393 | } | ||||
394 | |||||
395 | $newdelta = $start * $max_expire_mult; | ||||
396 | dbg("bayes: first pass decided on $newdelta for atime delta"); | ||||
397 | } | ||||
398 | else { # use the estimation method | ||||
399 | dbg("bayes: can do estimation method for expiry, skipping first pass"); | ||||
400 | } | ||||
401 | |||||
402 | my ($kept, $deleted, $num_hapaxes, $num_lowfreq) = $self->token_expiration($opts, $newdelta, @vars); | ||||
403 | |||||
404 | my $done = time(); | ||||
405 | |||||
406 | my $msg = "expired old bayes database entries in ".($done - $started)." seconds"; | ||||
407 | my $msg2 = "$kept entries kept, $deleted deleted"; | ||||
408 | |||||
409 | if ($opts->{verbose}) { | ||||
410 | my $hapax_pc = ($num_hapaxes * 100) / $kept; | ||||
411 | my $lowfreq_pc = ($num_lowfreq * 100) / $kept; | ||||
412 | print "$msg\n$msg2\n" or die "Error writing: $!"; | ||||
413 | printf "token frequency: 1-occurrence tokens: %3.2f%%\n", $hapax_pc | ||||
414 | or die "Error writing: $!"; | ||||
415 | printf "token frequency: less than 8 occurrences: %3.2f%%\n", $lowfreq_pc | ||||
416 | or die "Error writing: $!"; | ||||
417 | } | ||||
418 | else { | ||||
419 | dbg("bayes: $msg: $msg2"); | ||||
420 | } | ||||
421 | |||||
422 | return 1; | ||||
423 | } | ||||
424 | |||||
425 | =item sync_due | ||||
426 | |||||
427 | public instance (Boolean) sync_due () | ||||
428 | |||||
429 | Description: | ||||
430 | This method determines if a sync is due. | ||||
431 | |||||
432 | =cut | ||||
433 | |||||
434 | sub sync_due { | ||||
435 | my ($self) = @_; | ||||
436 | die "bayes: sync_due: not implemented\n"; | ||||
437 | } | ||||
438 | |||||
439 | =item expiry_due | ||||
440 | |||||
441 | public instance (Boolean) expiry_due () | ||||
442 | |||||
443 | Description: | ||||
444 | This method determines if an expire is due. | ||||
445 | |||||
446 | =cut | ||||
447 | |||||
448 | sub expiry_due { | ||||
449 | my ($self) = @_; | ||||
450 | |||||
451 | $self->read_db_configs(); # make sure this has happened here | ||||
452 | |||||
453 | # If force expire was called, do the expire no matter what. | ||||
454 | return 1 if ($self->{bayes}->{main}->{learn_force_expire}); | ||||
455 | |||||
456 | # if config says not to auto expire then no need to continue | ||||
457 | return 0 if ($self->{bayes}->{main}->{conf}->{bayes_auto_expire} == 0); | ||||
458 | |||||
459 | # is the database too small for expiry? (Do *not* use "scalar keys", | ||||
460 | # as this will iterate through the entire db counting them!) | ||||
461 | my @vars = $self->get_storage_variables(); | ||||
462 | my $ntoks = $vars[3]; | ||||
463 | |||||
464 | my $last_expire = time() - $vars[4]; | ||||
465 | if (!$self->{bayes}->{main}->{ignore_safety_expire_timeout}) { | ||||
466 | # if we're not ignoring the safety timeout, don't run an expire more | ||||
467 | # than once every 12 hours. | ||||
468 | return 0 if ($last_expire < 43200); | ||||
469 | } | ||||
470 | else { | ||||
471 | # if we are ignoring the safety timeout (e.g.: mass-check), still | ||||
472 | # limit the expiry to only one every 5 minutes. | ||||
473 | return 0 if ($last_expire < 300); | ||||
474 | } | ||||
475 | |||||
476 | dbg("bayes: DB expiry: tokens in DB: $ntoks, Expiry max size: ".$self->{expiry_max_db_size}.", Oldest atime: ".$vars[5].", Newest atime: ".$vars[10].", Last expire: ".$vars[4].", Current time: ".time()); | ||||
477 | |||||
478 | my $conf = $self->{bayes}->{main}->{conf}; | ||||
479 | if ($ntoks <= 100000 || # keep at least 100k tokens | ||||
480 | $self->{expiry_max_db_size} > $ntoks || # not enough tokens to cause an expire | ||||
481 | $vars[10]-$vars[5] < 43200 || # delta between oldest and newest < 12h | ||||
482 | $self->{db_version} < $self->DB_VERSION # ignore old db formats | ||||
483 | ) { | ||||
484 | return 0; | ||||
485 | } | ||||
486 | |||||
487 | return 1; | ||||
488 | } | ||||
489 | |||||
490 | =item seen_get | ||||
491 | |||||
492 | public instance (Char) seen_get (String $msgid) | ||||
493 | |||||
494 | Description: | ||||
495 | This method retrieves the stored value, if any, for C<$msgid>. The return | ||||
496 | value is the stored string ('s' for spam and 'h' for ham) or undef if | ||||
497 | C<$msgid> is not found. | ||||
498 | |||||
499 | =cut | ||||
500 | |||||
501 | sub seen_get { | ||||
502 | my ($self, $msgid) = @_; | ||||
503 | die "bayes: seen_get: not implemented\n"; | ||||
504 | } | ||||
505 | |||||
506 | =item seen_put | ||||
507 | |||||
508 | public instance (Boolean) seen_put (String $msgid, Char $flag) | ||||
509 | |||||
510 | Description: | ||||
511 | This method records C<$msgid> as the type given by C<$flag>. C<$flag> is | ||||
512 | one of two values 's' for spam and 'h' for ham. | ||||
513 | |||||
514 | =cut | ||||
515 | |||||
516 | sub seen_put { | ||||
517 | my ($self, $msgid, $flag) = @_; | ||||
518 | die "bayes: seen_put: not implemented\n"; | ||||
519 | } | ||||
520 | |||||
521 | =item seen_delete | ||||
522 | |||||
523 | public instance (Boolean) seen_delete (String $msgid) | ||||
524 | |||||
525 | Description: | ||||
526 | This method removes C<$msgid> from storage. | ||||
527 | |||||
528 | =cut | ||||
529 | |||||
530 | sub seen_delete { | ||||
531 | my ($self, $msgid) = @_; | ||||
532 | die "bayes: seen_delete: not implemented\n"; | ||||
533 | } | ||||
534 | |||||
535 | =item get_storage_variables | ||||
536 | |||||
537 | public instance (@) get_storage_variables () | ||||
538 | |||||
539 | Description: | ||||
540 | This method retrieves the various administrative variables used by | ||||
541 | the Bayes storage implementation. | ||||
542 | |||||
543 | The values returned in the array are in the following order: | ||||
544 | |||||
545 | 0: scan count base | ||||
546 | |||||
547 | 1: number of spam | ||||
548 | |||||
549 | 2: number of ham | ||||
550 | |||||
551 | 3: number of tokens in db | ||||
552 | |||||
553 | 4: last expire atime | ||||
554 | |||||
555 | 5: oldest token in db atime | ||||
556 | |||||
557 | 6: db version value | ||||
558 | |||||
559 | 7: last journal sync | ||||
560 | |||||
561 | 8: last atime delta | ||||
562 | |||||
563 | 9: last expire reduction count | ||||
564 | |||||
565 | 10: newest token in db atime | ||||
566 | |||||
567 | =cut | ||||
568 | |||||
569 | sub get_storage_variables { | ||||
570 | my ($self) = @_; | ||||
571 | die "bayes: get_storage_variables: not implemented\n"; | ||||
572 | } | ||||
573 | |||||
574 | =item dump_db_toks | ||||
575 | |||||
576 | public instance () dump_db_toks (String $template, String $regex, @ @vars) | ||||
577 | |||||
578 | Description: | ||||
579 | This method loops over all tokens, computing the probability for the token | ||||
580 | and then printing it out according to the passed in template. | ||||
581 | |||||
582 | =cut | ||||
583 | |||||
584 | sub dump_db_toks { | ||||
585 | my ($self, $template, $regex, @vars) = @_; | ||||
586 | die "bayes: dump_db_toks: not implemented\n"; | ||||
587 | } | ||||
588 | |||||
589 | =item set_last_expire | ||||
590 | |||||
591 | public instance (Boolean) set_last_expire (Integer $time) | ||||
592 | |||||
593 | Description: | ||||
594 | This method sets the last expire time. | ||||
595 | |||||
596 | =cut | ||||
597 | |||||
598 | sub set_last_expire { | ||||
599 | my ($self, $time) = @_; | ||||
600 | die "bayes: set_last_expire: not implemented\n"; | ||||
601 | } | ||||
602 | |||||
603 | =item get_running_expire_tok | ||||
604 | |||||
605 | public instance (Time) get_running_expire_tok () | ||||
606 | |||||
607 | Description: | ||||
608 | This method determines if an expire is currently running and returns the time | ||||
609 | the expire started. | ||||
610 | |||||
611 | =cut | ||||
612 | |||||
613 | sub get_running_expire_tok { | ||||
614 | my ($self) = @_; | ||||
615 | die "bayes: get_running_expire_tok: not implemented\n"; | ||||
616 | } | ||||
617 | |||||
618 | =item set_running_expire_tok | ||||
619 | |||||
620 | public instance (Time) set_running_expire_tok () | ||||
621 | |||||
622 | Description: | ||||
623 | This method sets the running expire time to the current time. | ||||
624 | |||||
625 | =cut | ||||
626 | |||||
627 | sub set_running_expire_tok { | ||||
628 | my ($self) = @_; | ||||
629 | die "bayes: set_running_expire_tok: not implemented\n"; | ||||
630 | } | ||||
631 | |||||
632 | =item remove_running_expire_tok | ||||
633 | |||||
634 | public instance (Boolean) remove_running_expire_tok () | ||||
635 | |||||
636 | Description: | ||||
637 | This method removes a currently set running expire time. | ||||
638 | |||||
639 | =cut | ||||
640 | |||||
641 | sub remove_running_expire_tok { | ||||
642 | my ($self) = @_; | ||||
643 | die "bayes: remove_running_expire_tok: not implemented\n"; | ||||
644 | } | ||||
645 | |||||
646 | =item tok_get | ||||
647 | |||||
648 | public instance (Integer, Integer, Time) tok_get (String $token) | ||||
649 | |||||
650 | Description: | ||||
651 | This method retrieves the specified token (C<$token>) from storage and returns | ||||
652 | its spam count, ham count and last access time. | ||||
653 | |||||
654 | =cut | ||||
655 | |||||
656 | sub tok_get { | ||||
657 | my ($self, $token) = @_; | ||||
658 | die "bayes: tok_get: not implemented\n"; | ||||
659 | } | ||||
660 | |||||
661 | =item tok_get_all | ||||
662 | |||||
663 | public instance (\@) tok_get_all (@ @tokens) | ||||
664 | |||||
665 | Description: | ||||
666 | This method retrieves the specified tokens (C<@tokens>) from storage and | ||||
667 | returns an array ref of arrays spam count, ham count and last access time. | ||||
668 | |||||
669 | =cut | ||||
670 | |||||
671 | sub tok_get_all { | ||||
672 | my ($self, $tokens) = @_; | ||||
673 | die "bayes: tok_get_all: not implemented\n"; | ||||
674 | } | ||||
675 | |||||
676 | =item tok_count_change | ||||
677 | |||||
678 | public instance (Boolean) tok_count_change (Integer $spam_count, | ||||
679 | Integer $ham_count, | ||||
680 | String $token, | ||||
681 | Time $atime) | ||||
682 | |||||
683 | Description: | ||||
684 | This method takes a C<$spam_count> and C<$ham_count> and adds them to | ||||
685 | C<$token> along with updating C<$token>'s atime with C<$atime>. | ||||
686 | |||||
687 | =cut | ||||
688 | |||||
689 | sub tok_count_change { | ||||
690 | my ($self, $spam_count, $ham_count, $token, $atime) = @_; | ||||
691 | die "bayes: tok_count_change: not implemented\n"; | ||||
692 | } | ||||
693 | |||||
694 | =item multi_tok_count_change | ||||
695 | |||||
696 | public instance (Boolean) multi_tok_count_change (Integer $spam_count, | ||||
697 | Integer $ham_count, | ||||
698 | \% $tokens, | ||||
699 | Time $atime) | ||||
700 | |||||
701 | Description: | ||||
702 | This method takes a C<$spam_count> and C<$ham_count> and adds them to all | ||||
703 | of the tokens in the C<$tokens> hash ref along with updating each token's | ||||
704 | atime with C<$atime>. | ||||
705 | |||||
706 | =cut | ||||
707 | |||||
708 | sub multi_tok_count_change { | ||||
709 | my ($self, $spam_count, $ham_count, $tokens, $atime) = @_; | ||||
710 | die "bayes: multi_tok_count_change: not implemented\n"; | ||||
711 | } | ||||
712 | |||||
713 | =item nspam_nham_get | ||||
714 | |||||
715 | public instance (Integer, Integer) nspam_nham_get () | ||||
716 | |||||
717 | Description: | ||||
718 | This method retrieves the total number of spam and the total number of ham | ||||
719 | currently under storage. | ||||
720 | |||||
721 | =cut | ||||
722 | |||||
723 | sub nspam_nham_get { | ||||
724 | my ($self) = @_; | ||||
725 | die "bayes: nspam_nham_get: not implemented\n"; | ||||
726 | } | ||||
727 | |||||
728 | =item nspam_nham_change | ||||
729 | |||||
730 | public instance (Boolean) nspam_nham_change (Integer $num_spam, | ||||
731 | Integer $num_ham) | ||||
732 | |||||
733 | Description: | ||||
734 | This method updates the number of spam and the number of ham in the database. | ||||
735 | |||||
736 | =cut | ||||
737 | |||||
738 | sub nspam_nham_change { | ||||
739 | my ($self, $num_spam, $num_ham) = @_; | ||||
740 | die "bayes: nspam_nham_change: not implemented\n"; | ||||
741 | } | ||||
742 | |||||
743 | =item tok_touch | ||||
744 | |||||
745 | public instance (Boolean) tok_touch (String $token, | ||||
746 | Time $atime) | ||||
747 | |||||
748 | Description: | ||||
749 | This method updates the given token's (C<$token>) access time. | ||||
750 | |||||
751 | =cut | ||||
752 | |||||
753 | sub tok_touch { | ||||
754 | my ($self, $token, $atime) = @_; | ||||
755 | die "bayes: tok_touch: not implemented\n"; | ||||
756 | } | ||||
757 | |||||
758 | =item tok_touch_all | ||||
759 | |||||
760 | public instance (Boolean) tok_touch_all (\@ $tokens, | ||||
761 | Time $atime) | ||||
762 | |||||
763 | Description: | ||||
764 | This method does a mass update of the access time of the given list of tokens C<$tokens>, if the existing token | ||||
765 | atime is < C<$atime>. | ||||
766 | |||||
767 | =cut | ||||
768 | |||||
769 | sub tok_touch_all { | ||||
770 | my ($self, $tokens, $atime) = @_; | ||||
771 | die "bayes: tok_touch_all: not implemented\n"; | ||||
772 | } | ||||
773 | |||||
774 | =item cleanup | ||||
775 | |||||
776 | public instance (Boolean) cleanup () | ||||
777 | |||||
778 | Description: | ||||
779 | This method performs any cleanup necessary before moving on to the next | ||||
780 | operation. | ||||
781 | |||||
782 | =cut | ||||
783 | |||||
784 | sub cleanup { | ||||
785 | my ($self) = @_; | ||||
786 | die "bayes: cleanup: not implemented\n"; | ||||
787 | } | ||||
788 | |||||
789 | =item get_magic_re | ||||
790 | |||||
791 | public instance (String) get_magic_re () | ||||
792 | |||||
793 | Description: | ||||
794 | This method returns a regexp which indicates a magic token. | ||||
795 | |||||
796 | =cut | ||||
797 | |||||
798 | sub get_magic_re { | ||||
799 | my ($self) = @_; | ||||
800 | die "bayes: get_magic_re: not implemented\n"; | ||||
801 | } | ||||
802 | |||||
803 | =item sync | ||||
804 | |||||
805 | public instance (Boolean) sync (\% $opts) | ||||
806 | |||||
807 | Description: | ||||
808 | This method performs a sync of the database. | ||||
809 | |||||
810 | =cut | ||||
811 | |||||
812 | sub sync { | ||||
813 | my ($self, $opts) = @_; | ||||
814 | die "bayes: sync: not implemented\n"; | ||||
815 | } | ||||
816 | |||||
817 | =item perform_upgrade | ||||
818 | |||||
819 | public instance (Boolean) perform_upgrade (\% $opts) | ||||
820 | |||||
821 | Description: | ||||
822 | This method is a utility method that performs any necessary upgrades | ||||
823 | between versions. It should know how to handle previous versions and | ||||
824 | what needs to happen to upgrade them. | ||||
825 | |||||
826 | A true return value indicates success. | ||||
827 | |||||
828 | =cut | ||||
829 | |||||
830 | sub perform_upgrade { | ||||
831 | my ($self, $opts) = @_; | ||||
832 | die "bayes: perform_upgrade: not implemented\n"; | ||||
833 | } | ||||
834 | |||||
835 | =item clear_database | ||||
836 | |||||
837 | public instance (Boolean) clear_database () | ||||
838 | |||||
839 | Description: | ||||
840 | This method deletes all records for a particular user. | ||||
841 | |||||
842 | Callers should be aware that any errors returned by this method | ||||
843 | could cause the database to be inconsistent for the given user. | ||||
844 | |||||
845 | =cut | ||||
846 | |||||
847 | sub clear_database { | ||||
848 | my ($self) = @_; | ||||
849 | die "bayes: clear_database: not implemented\n"; | ||||
850 | } | ||||
851 | |||||
852 | =item backup_database | ||||
853 | |||||
854 | public instance (Boolean) backup_database () | ||||
855 | |||||
856 | Description: | ||||
857 | This method will dump the user's database in a machine-readable format. | ||||
858 | |||||
859 | =cut | ||||
860 | |||||
861 | sub backup_database { | ||||
862 | my ($self) = @_; | ||||
863 | die "bayes: backup_database: not implemented\n"; | ||||
864 | } | ||||
865 | |||||
866 | =item restore_database | ||||
867 | |||||
868 | public instance (Boolean) restore_database (String $filename, Boolean $showdots) | ||||
869 | |||||
870 | Description: | ||||
871 | This method restores a database from the given filename, C<$filename>. | ||||
872 | |||||
873 | Callers should be aware that any errors returned by this method | ||||
874 | could cause the database to be inconsistent for the given user. | ||||
875 | |||||
876 | =cut | ||||
877 | |||||
878 | sub restore_database { | ||||
879 | my ($self, $filename, $showdots) = @_; | ||||
880 | die "bayes: restore_database: not implemented\n"; | ||||
881 | } | ||||
882 | |||||
883 | =item db_readable | ||||
884 | |||||
885 | public instance (Boolean) db_readable () | ||||
886 | |||||
887 | Description: | ||||
888 | This method returns whether or not the Bayes DB is available in a | ||||
889 | readable state. | ||||
890 | |||||
891 | =cut | ||||
892 | |||||
893 | sub db_readable { | ||||
894 | my ($self) = @_; | ||||
895 | die "bayes: db_readable: not implemented\n"; | ||||
896 | } | ||||
897 | |||||
898 | =item db_writable | ||||
899 | |||||
900 | public instance (Boolean) db_writable () | ||||
901 | |||||
902 | Description: | ||||
903 | This method returns whether or not the Bayes DB is available in a | ||||
904 | writable state. | ||||
905 | |||||
906 | =cut | ||||
907 | |||||
908 | sub db_writable { | ||||
909 | my ($self) = @_; | ||||
910 | die "bayes: db_writable: not implemented\n"; | ||||
911 | } | ||||
912 | |||||
913 | |||||
914 | sub sa_die { Mail::SpamAssassin::sa_die(@_); } | ||||
915 | |||||
916 | 1 | 8µs | 1; | ||
917 | |||||
918 | =back | ||||
919 | |||||
920 | =cut |