← Index
NYTProf Performance Profile   « line view »
For /usr/local/bin/sa-learn
  Run on Sun Nov 5 03:09:29 2017
Reported on Mon Nov 6 13:20:48 2017

Filename/usr/local/lib/perl5/site_perl/Mail/SpamAssassin/Bayes/CombineChi.pm
StatementsExecuted 19 statements in 1.18ms
Subroutines
Calls P F Exclusive
Time
Inclusive
Time
Subroutine
11147µs55µsMail::SpamAssassin::Plugin::Bayes::::BEGIN@20 Mail::SpamAssassin::Plugin::Bayes::BEGIN@20
11131µs36µsMail::SpamAssassin::Bayes::Combine::::BEGIN@33Mail::SpamAssassin::Bayes::Combine::BEGIN@33
11125µs32µsMail::SpamAssassin::Bayes::Combine::::BEGIN@31Mail::SpamAssassin::Bayes::Combine::BEGIN@31
11124µs177µsMail::SpamAssassin::Bayes::Combine::::BEGIN@36Mail::SpamAssassin::Bayes::Combine::BEGIN@36
11122µs219µsMail::SpamAssassin::Bayes::Combine::::BEGIN@37Mail::SpamAssassin::Bayes::Combine::BEGIN@37
11120µs42µsMail::SpamAssassin::Bayes::Combine::::BEGIN@32Mail::SpamAssassin::Bayes::Combine::BEGIN@32
11118µs66µsMail::SpamAssassin::Bayes::Combine::::BEGIN@34Mail::SpamAssassin::Bayes::Combine::BEGIN@34
0000s0sMail::SpamAssassin::Bayes::Combine::::chi2qMail::SpamAssassin::Bayes::Combine::chi2q
0000s0sMail::SpamAssassin::Bayes::Combine::::combineMail::SpamAssassin::Bayes::Combine::combine
Call graph for these subroutines as a Graphviz dot language file.
Line State
ments
Time
on line
Calls Time
in subs
Code
1# Chi-square probability combining and related constants.
2#
3# <@LICENSE>
4# Licensed to the Apache Software Foundation (ASF) under one or more
5# contributor license agreements. See the NOTICE file distributed with
6# this work for additional information regarding copyright ownership.
7# The ASF licenses this file to you under the Apache License, Version 2.0
8# (the "License"); you may not use this file except in compliance with
9# the License. You may obtain a copy of the License at:
10#
11# http://www.apache.org/licenses/LICENSE-2.0
12#
13# Unless required by applicable law or agreed to in writing, software
14# distributed under the License is distributed on an "AS IS" BASIS,
15# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16# See the License for the specific language governing permissions and
17# limitations under the License.
18# </@LICENSE>
19
202114µs263µs
# spent 55µs (47+8) within Mail::SpamAssassin::Plugin::Bayes::BEGIN@20 which was called: # once (47µs+8µs) by Mail::SpamAssassin::Plugin::Bayes::BEGIN@63 at line 20
use strict; # make Test::Perl::Critic happy
# spent 55µs making 1 call to Mail::SpamAssassin::Plugin::Bayes::BEGIN@20 # spent 8µs making 1 call to strict::import
21
22# this package is a no-op; the real impl code is in another pkg.
23package Mail::SpamAssassin::Bayes::CombineChi; 1;
24
25# Force into another package, so our symbols will appear in that namespace with
26# no indirection, for speed. Other combiners must do the same, since Bayes.pm
27# uses this namespace directly. This means only one combiner can be loaded at
28# any time.
29package Mail::SpamAssassin::Bayes::Combine;
30
31252µs238µs
# spent 32µs (25+7) within Mail::SpamAssassin::Bayes::Combine::BEGIN@31 which was called: # once (25µs+7µs) by Mail::SpamAssassin::Plugin::Bayes::BEGIN@63 at line 31
use strict;
# spent 32µs making 1 call to Mail::SpamAssassin::Bayes::Combine::BEGIN@31 # spent 7µs making 1 call to strict::import
32252µs265µs
# spent 42µs (20+22) within Mail::SpamAssassin::Bayes::Combine::BEGIN@32 which was called: # once (20µs+22µs) by Mail::SpamAssassin::Plugin::Bayes::BEGIN@63 at line 32
use warnings;
# spent 42µs making 1 call to Mail::SpamAssassin::Bayes::Combine::BEGIN@32 # spent 22µs making 1 call to warnings::import
33261µs241µs
# spent 36µs (31+5) within Mail::SpamAssassin::Bayes::Combine::BEGIN@33 which was called: # once (31µs+5µs) by Mail::SpamAssassin::Plugin::Bayes::BEGIN@63 at line 33
use bytes;
# spent 36µs making 1 call to Mail::SpamAssassin::Bayes::Combine::BEGIN@33 # spent 5µs making 1 call to bytes::import
34257µs2114µs
# spent 66µs (18+48) within Mail::SpamAssassin::Bayes::Combine::BEGIN@34 which was called: # once (18µs+48µs) by Mail::SpamAssassin::Plugin::Bayes::BEGIN@63 at line 34
use re 'taint';
# spent 66µs making 1 call to Mail::SpamAssassin::Bayes::Combine::BEGIN@34 # spent 48µs making 1 call to re::import
35
36283µs2329µs
# spent 177µs (24+153) within Mail::SpamAssassin::Bayes::Combine::BEGIN@36 which was called: # once (24µs+153µs) by Mail::SpamAssassin::Plugin::Bayes::BEGIN@63 at line 36
use POSIX qw(frexp);
# spent 177µs making 1 call to Mail::SpamAssassin::Bayes::Combine::BEGIN@36 # spent 152µs making 1 call to POSIX::import
372742µs2416µs
# spent 219µs (22+197) within Mail::SpamAssassin::Bayes::Combine::BEGIN@37 which was called: # once (22µs+197µs) by Mail::SpamAssassin::Plugin::Bayes::BEGIN@63 at line 37
use constant LN2 => log(2);
# spent 219µs making 1 call to Mail::SpamAssassin::Bayes::Combine::BEGIN@37 # spent 197µs making 1 call to constant::import
38
# Tuning constants for the chi-squared combiner. Each is declared with 'our',
# so it is a package variable of Mail::SpamAssassin::Bayes::Combine and can be
# read from outside this file by its fully-qualified name. None of them is
# referenced by combine() or chi2q() below.
# NOTE(review): presumably consumed by Bayes.pm when it computes per-token
# probabilities -- confirm against the caller.
39# Value for 'x' in Gary Robinson's f(w) equation.
40# "Let x = the number used when n [hits] is 0."
4112µsour $FW_X_CONSTANT = 0.538;
42
43# Value for 's' in the f(w) equation. "We can see s as the "strength" (hence
44# the use of "s") of an original assumed expectation ... relative to how
45# strongly we want to consider our actual collected data." Low 's' means
46# trust collected data more strongly.
4712µsour $FW_S_CONSTANT = 0.030;
48
49# (s . x) for the f(w) equation.
# Precomputed once at load time from the two constants above, so it must be
# recomputed by hand if either of them is changed afterwards.
5013µsour $FW_S_DOT_X = ($FW_X_CONSTANT * $FW_S_CONSTANT);
51
52# Should we ignore tokens with probs very close to the middle ground (.5)?
53# tokens need to be outside the [ .5-MPS, .5+MPS ] range to be used.
# With the value below that range works out to [ 0.154, 0.846 ].
5411µsour $MIN_PROB_STRENGTH = 0.346;
55
56###########################################################################
57
58# Chi-Squared method. Produces mostly boolean $result,
59# but with a grey area.
# Combine per-token probabilities into a single score using Fisher's
# chi-squared method.
#
# Arguments:
#   $ns        - number of spam messages learned
#   $nn        - number of non-spam (ham) messages learned
#   $sortedref - reference to an array of per-token probabilities
#
# Returns a number in [0, 1]: (($S - $H) + 1) / 2, where $S and $H are the
# two chi-squared indicators computed below.  Returns nothing (empty list /
# undef) when there are no token probabilities or when $ns + $nn is zero.
sub combine {
  my ($ns, $nn, $sortedref) = @_;

  # @$sortedref contains an array of the probabilities
  my $wc = scalar @$sortedref;
  return unless $wc;

  # see bug 3118: never divide by a zero message count
  my $totmsgs = $ns + $nn;
  return if $totmsgs == 0;

  # Running products, kept as mantissa ($S, $H) plus a separate base-2
  # exponent ($Sexp, $Hexp) so that long token lists cannot underflow.
  my $S = $ns / $totmsgs;
  my $H = $nn / $totmsgs;
  my ($Sexp, $Hexp) = (0, 0);

  foreach my $prob (@$sortedref) {
    $S *= 1.0 - $prob;
    $H *= $prob;

    # Renormalise whenever a product gets dangerously small: frexp() splits
    # it into a mantissa in [0.5, 1) and a power of two, which we bank.
    if ($S < 1e-200) {
      my $e;
      ($S, $e) = frexp($S);
      $Sexp += $e;
    }
    if ($H < 1e-200) {
      my $e;
      ($H, $e) = frexp($H);
      $Hexp += $e;
    }
  }

  # note: chi2q() takes half the degrees of freedom as its second argument,
  # so $wc is passed directly rather than (2 * $wc).
  #
  # A product of exactly zero (e.g. $ns or $nn is 0, or a probability of
  # exactly 0 or 1) cannot be rescued by frexp(), and Perl's log(0) is a
  # fatal error.  Use the mathematical limit instead: the log tends to
  # -infinity, chi2q() tends to 0, so the indicator is 1.
  $S = ($S == 0) ? 1.0
                 : 1.0 - chi2q(-2.0 * (log($S) + $Sexp * LN2), $wc);
  $H = ($H == 0) ? 1.0
                 : 1.0 - chi2q(-2.0 * (log($H) + $Hexp * LN2), $wc);

  return (($S - $H) + 1.0) / 2.0;
}
105
106# Chi-squared function (API changed; see comment above)
# Upper-tail probability of the chi-squared distribution for an even number
# of degrees of freedom.
#
# Arguments:
#   $x2    - the chi-squared statistic
#   $halfv - HALF the number of degrees of freedom (not the full count)
#
# Returns sum_{i=0}^{$halfv-1} exp(-m) * m**i / i!  with m = $x2 / 2,
# clamped so that the result never exceeds 1.0.
sub chi2q {
  my ($x2, $halfv) = @_;

  my $m = $x2 / 2.0;
  my $sum;

  if ($m <= 700.0) {
    # exp(-$m) is still a normal double here, so build each term from the
    # previous one with a single multiplication.
    my $term = exp(0 - $m);
    $sum = $term;
    for (my $i = 1; $i < $halfv; $i++) {
      $term *= $m / $i;
      $sum += $term;
    }
  }
  else {
    # exp(-$m) underflows to zero (or to a low-precision subnormal) for
    # large $m.  Every later term is a multiple of it, so the plain
    # recurrence would return 0 even when the true sum is close to 1.
    # Track the logarithm of each term instead and exponentiate only when
    # adding it to the sum.
    my $log_m    = log($m);
    my $log_term = 0 - $m;
    $sum = exp($log_term);
    for (my $i = 1; $i < $halfv; $i++) {
      $log_term += $log_m - log($i);
      $sum += exp($log_term);
    }
  }

  return $sum < 1.0 ? $sum : 1.0;
}
123
124111µs1;