Filename | /usr/local/lib/perl5/site_perl/Mail/SpamAssassin/Bayes/CombineChi.pm |
Statements | Executed 19 statements in 1.29ms |
Calls | P | F | Exclusive Time | Inclusive Time | Subroutine |
---|---|---|---|---|---|
1 | 1 | 1 | 49µs | 127µs | BEGIN@34 | Mail::SpamAssassin::Bayes::Combine::
1 | 1 | 1 | 44µs | 57µs | BEGIN@20 | Mail::SpamAssassin::Plugin::Bayes::
1 | 1 | 1 | 30µs | 36µs | BEGIN@31 | Mail::SpamAssassin::Bayes::Combine::
1 | 1 | 1 | 29µs | 34µs | BEGIN@33 | Mail::SpamAssassin::Bayes::Combine::
1 | 1 | 1 | 26µs | 205µs | BEGIN@37 | Mail::SpamAssassin::Bayes::Combine::
1 | 1 | 1 | 25µs | 196µs | BEGIN@36 | Mail::SpamAssassin::Bayes::Combine::
1 | 1 | 1 | 22µs | 49µs | BEGIN@32 | Mail::SpamAssassin::Bayes::Combine::
0 | 0 | 0 | 0s | 0s | chi2q | Mail::SpamAssassin::Bayes::Combine::
0 | 0 | 0 | 0s | 0s | combine | Mail::SpamAssassin::Bayes::Combine::
Line | Statements | Time on line | Calls | Time in subs | Code |
---|---|---|---|---|---|
1 | # Chi-square probability combining and related constants. | ||||
2 | # | ||||
3 | # <@LICENSE> | ||||
4 | # Licensed to the Apache Software Foundation (ASF) under one or more | ||||
5 | # contributor license agreements. See the NOTICE file distributed with | ||||
6 | # this work for additional information regarding copyright ownership. | ||||
7 | # The ASF licenses this file to you under the Apache License, Version 2.0 | ||||
8 | # (the "License"); you may not use this file except in compliance with | ||||
9 | # the License. You may obtain a copy of the License at: | ||||
10 | # | ||||
11 | # http://www.apache.org/licenses/LICENSE-2.0 | ||||
12 | # | ||||
13 | # Unless required by applicable law or agreed to in writing, software | ||||
14 | # distributed under the License is distributed on an "AS IS" BASIS, | ||||
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
16 | # See the License for the specific language governing permissions and | ||||
17 | # limitations under the License. | ||||
18 | # </@LICENSE> | ||||
19 | |||||
20 | 2 | 126µs | 2 | 70µs | # spent 57µs (44+13) within Mail::SpamAssassin::Plugin::Bayes::BEGIN@20 which was called:
# once (44µs+13µs) by Mail::SpamAssassin::Plugin::Bayes::BEGIN@63 at line 20 # spent 57µs making 1 call to Mail::SpamAssassin::Plugin::Bayes::BEGIN@20
# spent 13µs making 1 call to strict::import |
21 | |||||
22 | # this package is a no-op; the real impl code is in another pkg. | ||||
23 | package Mail::SpamAssassin::Bayes::CombineChi; 1; | ||||
24 | |||||
25 | # Force into another package, so our symbols will appear in that namespace with | ||||
26 | # no indirection, for speed. Other combiners must do the same, since Bayes.pm | ||||
27 | # uses this namespace directly. This means only one combiner can be loaded at | ||||
28 | # any time. | ||||
29 | package Mail::SpamAssassin::Bayes::Combine; | ||||
30 | |||||
31 | 2 | 69µs | 2 | 43µs | # spent 36µs (30+7) within Mail::SpamAssassin::Bayes::Combine::BEGIN@31 which was called:
# once (30µs+7µs) by Mail::SpamAssassin::Plugin::Bayes::BEGIN@63 at line 31 # spent 36µs making 1 call to Mail::SpamAssassin::Bayes::Combine::BEGIN@31
# spent 6µs making 1 call to strict::import |
32 | 2 | 78µs | 2 | 75µs | # spent 49µs (22+27) within Mail::SpamAssassin::Bayes::Combine::BEGIN@32 which was called:
# once (22µs+27µs) by Mail::SpamAssassin::Plugin::Bayes::BEGIN@63 at line 32 # spent 49µs making 1 call to Mail::SpamAssassin::Bayes::Combine::BEGIN@32
# spent 27µs making 1 call to warnings::import |
33 | 2 | 55µs | 2 | 39µs | # spent 34µs (29+5) within Mail::SpamAssassin::Bayes::Combine::BEGIN@33 which was called:
# once (29µs+5µs) by Mail::SpamAssassin::Plugin::Bayes::BEGIN@63 at line 33 # spent 34µs making 1 call to Mail::SpamAssassin::Bayes::Combine::BEGIN@33
# spent 5µs making 1 call to bytes::import |
34 | 2 | 74µs | 2 | 204µs | # spent 127µs (49+78) within Mail::SpamAssassin::Bayes::Combine::BEGIN@34 which was called:
# once (49µs+78µs) by Mail::SpamAssassin::Plugin::Bayes::BEGIN@63 at line 34 # spent 127µs making 1 call to Mail::SpamAssassin::Bayes::Combine::BEGIN@34
# spent 78µs making 1 call to re::import |
35 | |||||
36 | 2 | 98µs | 2 | 366µs | # spent 196µs (25+170) within Mail::SpamAssassin::Bayes::Combine::BEGIN@36 which was called:
# once (25µs+170µs) by Mail::SpamAssassin::Plugin::Bayes::BEGIN@63 at line 36 # spent 196µs making 1 call to Mail::SpamAssassin::Bayes::Combine::BEGIN@36
# spent 170µs making 1 call to POSIX::import |
37 | 2 | 751µs | 2 | 384µs | # spent 205µs (26+179) within Mail::SpamAssassin::Bayes::Combine::BEGIN@37 which was called:
# once (26µs+179µs) by Mail::SpamAssassin::Plugin::Bayes::BEGIN@63 at line 37 # spent 205µs making 1 call to Mail::SpamAssassin::Bayes::Combine::BEGIN@37
# spent 179µs making 1 call to constant::import |
38 | |||||
39 | # Value for 'x' in Gary Robinson's f(w) equation. | ||||
40 | # "Let x = the number used when n [hits] is 0." | ||||
41 | 1 | 2µs | our $FW_X_CONSTANT = 0.538; | ||
42 | |||||
43 | # Value for 's' in the f(w) equation. "We can see s as the 'strength' (hence | ||||
44 | # the use of 's') of an original assumed expectation ... relative to how | ||||
45 | # strongly we want to consider our actual collected data." A low 's' means | ||||
46 | # the collected data is trusted more strongly. | ||||
47 | 1 | 2µs | our $FW_S_CONSTANT = 0.030; | ||
48 | |||||
49 | # Precomputed product (s * x) for the f(w) equation. | ||||
50 | 1 | 12µs | our $FW_S_DOT_X = ($FW_X_CONSTANT * $FW_S_CONSTANT); | ||
51 | |||||
52 | # Tokens with probs very close to the middle ground (.5) are ignored: | ||||
53 | # tokens need to be outside the [ .5-MPS, .5+MPS ] range to be used. | ||||
54 | 1 | 2µs | our $MIN_PROB_STRENGTH = 0.346; | ||
55 | |||||
56 | ########################################################################### | ||||
57 | |||||
# Chi-squared combiner.  Folds a list of per-token probabilities into one
# score; the result is mostly boolean (close to 0 or close to 1), with a
# grey area in between.
#
# Arguments:
#   $ns, $nn    - two message counts; only their shares of ($ns + $nn) are
#                 used, as the starting values of the two running products
#                 (see bug 3118).  Presumably the spam and non-spam message
#                 counts -- confirm against the caller.
#   $sortedref  - reference to an array of probabilities.
#
# Returns nothing (bare return) when the array is empty or when
# $ns + $nn == 0; otherwise returns (($S - $H) + 1) / 2.
#
# NOTE(review): log() dies if either running product is exactly zero, which
# happens when $ns or $nn is 0 or when a probability is exactly 0 or 1.
sub combine {
  my ($ns, $nn, $sortedref) = @_;

  # number of probabilities supplied
  my $wc = @{$sortedref};
  return if !$wc;

  # see bug 3118
  my $totmsgs = $ns + $nn;
  return if $totmsgs == 0;

  # Each running product is kept as a (mantissa, binary exponent) pair so
  # that a long run of small factors cannot underflow to zero.
  my $s_mant = $ns / $totmsgs;
  my $h_mant = $nn / $totmsgs;
  my $s_exp  = 0;
  my $h_exp  = 0;

  for my $prob (@{$sortedref}) {
    $s_mant *= 1.0 - $prob;
    $h_mant *= $prob;

    # Renormalise once a mantissa gets tiny: frexp() splits the value into
    # a fraction and a power-of-two exponent, which is banked separately.
    if ($s_mant < 1e-200) {
      ($s_mant, my $shift) = frexp($s_mant);
      $s_exp += $shift;
    }
    if ($h_mant < 1e-200) {
      ($h_mant, my $shift) = frexp($h_mant);
      $h_exp += $shift;
    }
  }

  # Natural log of each full product: ln(mantissa) + exponent * ln(2).
  my $ln_s = log($s_mant) + $s_exp * LN2;
  my $ln_h = log($h_mant) + $h_exp * LN2;

  # note: chi2q() takes half the degrees of freedom as its second argument,
  # so $wc is passed as-is rather than (2 * $wc).  Earlier versions doubled
  # here only for chi2q() to halve again.
  my $S = 1.0 - chi2q(-2.0 * $ln_s, $wc);
  my $H = 1.0 - chi2q(-2.0 * $ln_h, $wc);

  return (($S - $H) + 1.0) / 2.0;
}
105 | |||||
# Chi-squared function: upper-tail probability of a chi-squared statistic.
#
# Arguments:
#   $x2     - the chi-squared statistic.
#   $halfv  - HALF the degrees of freedom (i.e. $v / 2, not $v itself).
#
# Computes  sum_{i=0}^{$halfv-1} exp(-$m) * $m**$i / $i!  with $m = $x2 / 2,
# clamped so that the result never exceeds 1.0.
#
# For large $m the leading factor exp(-$m) underflows to 0, which would zero
# out every later term and make the whole sum 0 even when the true value is
# not small (e.g. $x2 = 2000, $halfv = 1000 is about 0.5).  In that range the
# terms are therefore tracked by their logarithm, so that terms which are
# representable still contribute.  For smaller $m the arithmetic is the
# plain running-product form.
sub chi2q {
  my ($x2, $halfv) = @_;

  my $m = $x2 / 2.0;

  # exp(-$m) remains a normal (full-precision) double up to about
  # $m = 708; switch to log space a little before that.
  my $log_space_threshold = 700.0;

  if ($m > $log_space_threshold) {
    my $log_term = 0 - $m;          # log of the i = 0 term
    my $sum = exp($log_term);       # may be 0 here; later terms need not be

    for (my $i = 1; $i < $halfv; $i++) {
      $log_term += log($m / $i);    # term_i = term_{i-1} * m / i
      $sum += exp($log_term);
    }
    return $sum < 1.0 ? $sum : 1.0;
  }

  my ($sum, $term);
  $sum = $term = exp(0 - $m);

  # plain C-style loop rather than 'for my $i (1 .. $halfv-1)'; it also
  # copes with a non-integer $halfv
  for (my $i = 1; $i < $halfv; $i++) {
    $term *= $m / $i;
    $sum += $term;
  }
  return $sum < 1.0 ? $sum : 1.0;
}
123 | |||||
124 | 1 | 21µs | 1; |