← Index
NYTProf Performance Profile   « line view »
For /usr/local/bin/sa-learn
  Run on Sun Nov 5 03:09:29 2017
Reported on Mon Nov 6 13:20:49 2017

Filename/usr/local/lib/perl5/site_perl/Mail/SpamAssassin/Plugin/BodyEval.pm
StatementsExecuted 40 statements in 3.50ms
Subroutines
Calls P F Exclusive
Time
Inclusive
Time
Subroutine
111107µs253µsMail::SpamAssassin::Plugin::BodyEval::::newMail::SpamAssassin::Plugin::BodyEval::new
1011101µs101µsMail::SpamAssassin::Plugin::BodyEval::::has_check_body_lengthMail::SpamAssassin::Plugin::BodyEval::has_check_body_length
11134µs34µsMail::SpamAssassin::Plugin::BodyEval::::BEGIN@20Mail::SpamAssassin::Plugin::BodyEval::BEGIN@20
11123µs172µsMail::SpamAssassin::Plugin::BodyEval::::BEGIN@21Mail::SpamAssassin::Plugin::BodyEval::BEGIN@21
11122µs647µsMail::SpamAssassin::Plugin::BodyEval::::BEGIN@22Mail::SpamAssassin::Plugin::BodyEval::BEGIN@22
11122µs28µsMail::SpamAssassin::Plugin::BodyEval::::BEGIN@26Mail::SpamAssassin::Plugin::BodyEval::BEGIN@26
11121µs31µsMail::SpamAssassin::Plugin::BodyEval::::BEGIN@24Mail::SpamAssassin::Plugin::BodyEval::BEGIN@24
11121µs85µsMail::SpamAssassin::Plugin::BodyEval::::BEGIN@29Mail::SpamAssassin::Plugin::BodyEval::BEGIN@29
11121µs48µsMail::SpamAssassin::Plugin::BodyEval::::BEGIN@25Mail::SpamAssassin::Plugin::BodyEval::BEGIN@25
11120µs77µsMail::SpamAssassin::Plugin::BodyEval::::BEGIN@27Mail::SpamAssassin::Plugin::BodyEval::BEGIN@27
0000s0sMail::SpamAssassin::Plugin::BodyEval::::_check_stock_infoMail::SpamAssassin::Plugin::BodyEval::_check_stock_info
0000s0sMail::SpamAssassin::Plugin::BodyEval::::_multipart_alternative_differenceMail::SpamAssassin::Plugin::BodyEval::_multipart_alternative_difference
0000s0sMail::SpamAssassin::Plugin::BodyEval::::check_blank_line_ratioMail::SpamAssassin::Plugin::BodyEval::check_blank_line_ratio
0000s0sMail::SpamAssassin::Plugin::BodyEval::::check_body_lengthMail::SpamAssassin::Plugin::BodyEval::check_body_length
0000s0sMail::SpamAssassin::Plugin::BodyEval::::check_stock_infoMail::SpamAssassin::Plugin::BodyEval::check_stock_info
0000s0sMail::SpamAssassin::Plugin::BodyEval::::multipart_alternative_differenceMail::SpamAssassin::Plugin::BodyEval::multipart_alternative_difference
0000s0sMail::SpamAssassin::Plugin::BodyEval::::multipart_alternative_difference_countMail::SpamAssassin::Plugin::BodyEval::multipart_alternative_difference_count
0000s0sMail::SpamAssassin::Plugin::BodyEval::::tvd_vertical_wordsMail::SpamAssassin::Plugin::BodyEval::tvd_vertical_words
Call graph for these subroutines as a Graphviz dot language file.
Line State
ments
Time
on line
Calls Time
in subs
Code
1# <@LICENSE>
2# Licensed to the Apache Software Foundation (ASF) under one or more
3# contributor license agreements. See the NOTICE file distributed with
4# this work for additional information regarding copyright ownership.
5# The ASF licenses this file to you under the Apache License, Version 2.0
6# (the "License"); you may not use this file except in compliance with
7# the License. You may obtain a copy of the License at:
8#
9# http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16# </@LICENSE>
17
18package Mail::SpamAssassin::Plugin::BodyEval;
19
20260µs134µs
# spent 34µs within Mail::SpamAssassin::Plugin::BodyEval::BEGIN@20 which was called: # once (34µs+0s) by Mail::SpamAssassin::PluginHandler::load_plugin at line 20
use Mail::SpamAssassin::Plugin;
# spent 34µs making 1 call to Mail::SpamAssassin::Plugin::BodyEval::BEGIN@20
21265µs2320µs
# spent 172µs (23+148) within Mail::SpamAssassin::Plugin::BodyEval::BEGIN@21 which was called: # once (23µs+148µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 21
use Mail::SpamAssassin::Logger;
# spent 172µs making 1 call to Mail::SpamAssassin::Plugin::BodyEval::BEGIN@21 # spent 148µs making 1 call to Exporter::import
22262µs21.27ms
# spent 647µs (22+624) within Mail::SpamAssassin::Plugin::BodyEval::BEGIN@22 which was called: # once (22µs+624µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 22
use Mail::SpamAssassin::Constants qw(:sa);
# spent 647µs making 1 call to Mail::SpamAssassin::Plugin::BodyEval::BEGIN@22 # spent 624µs making 1 call to Exporter::import
23
24253µs240µs
# spent 31µs (21+10) within Mail::SpamAssassin::Plugin::BodyEval::BEGIN@24 which was called: # once (21µs+10µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 24
use strict;
# spent 31µs making 1 call to Mail::SpamAssassin::Plugin::BodyEval::BEGIN@24 # spent 10µs making 1 call to strict::import
25258µs276µs
# spent 48µs (21+28) within Mail::SpamAssassin::Plugin::BodyEval::BEGIN@25 which was called: # once (21µs+28µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 25
use warnings;
# spent 48µs making 1 call to Mail::SpamAssassin::Plugin::BodyEval::BEGIN@25 # spent 28µs making 1 call to warnings::import
26256µs235µs
# spent 28µs (22+7) within Mail::SpamAssassin::Plugin::BodyEval::BEGIN@26 which was called: # once (22µs+7µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 26
use bytes;
# spent 28µs making 1 call to Mail::SpamAssassin::Plugin::BodyEval::BEGIN@26 # spent 7µs making 1 call to bytes::import
27262µs2133µs
# spent 77µs (20+56) within Mail::SpamAssassin::Plugin::BodyEval::BEGIN@27 which was called: # once (20µs+56µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 27
use re 'taint';
# spent 77µs making 1 call to Mail::SpamAssassin::Plugin::BodyEval::BEGIN@27 # spent 56µs making 1 call to re::import
28
2922.91ms2148µs
# spent 85µs (21+64) within Mail::SpamAssassin::Plugin::BodyEval::BEGIN@29 which was called: # once (21µs+64µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 29
use vars qw(@ISA);
# spent 85µs making 1 call to Mail::SpamAssassin::Plugin::BodyEval::BEGIN@29 # spent 64µs making 1 call to vars::import
30121µs@ISA = qw(Mail::SpamAssassin::Plugin);
31
32# constructor: register the eval rule
33
# spent 253µs (107+146) within Mail::SpamAssassin::Plugin::BodyEval::new which was called: # once (107µs+146µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 1 of (eval 91)[Mail/SpamAssassin/PluginHandler.pm:129]
sub new {
3412µs my $class = shift;
3512µs my $mailsaobject = shift;
36
37 # some boilerplate...
3812µs $class = ref($class) || $class;
39111µs119µs my $self = $class->SUPER::new($mailsaobject);
# spent 19µs making 1 call to Mail::SpamAssassin::Plugin::new
4012µs bless ($self, $class);
41
42 # the important bit!
43112µs132µs $self->register_eval_rule("multipart_alternative_difference");
# spent 32µs making 1 call to Mail::SpamAssassin::Plugin::register_eval_rule
4416µs120µs $self->register_eval_rule("multipart_alternative_difference_count");
# spent 20µs making 1 call to Mail::SpamAssassin::Plugin::register_eval_rule
4516µs119µs $self->register_eval_rule("check_blank_line_ratio");
# spent 19µs making 1 call to Mail::SpamAssassin::Plugin::register_eval_rule
4616µs118µs $self->register_eval_rule("tvd_vertical_words");
# spent 18µs making 1 call to Mail::SpamAssassin::Plugin::register_eval_rule
4716µs118µs $self->register_eval_rule("check_stock_info");
# spent 18µs making 1 call to Mail::SpamAssassin::Plugin::register_eval_rule
4816µs118µs $self->register_eval_rule("check_body_length");
# spent 18µs making 1 call to Mail::SpamAssassin::Plugin::register_eval_rule
49
5019µs return $self;
51}
52
53sub multipart_alternative_difference {
54 my ($self, $pms, $fulltext, $min, $max) = @_;
55
56 $self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff});
57
58 if (($min == 0 || $pms->{madiff} > $min) &&
59 ($max eq "undef" || $pms->{madiff} <= $max)) {
60 return 1;
61 }
62 return 0;
63}
64
65sub multipart_alternative_difference_count {
66 my ($self, $pms, $fulltext, $ratio, $minhtml) = @_;
67 $self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff});
68 return 0 unless $pms->{madiff_html} > $minhtml;
69 return(($pms->{madiff_text} / $pms->{madiff_html}) > $ratio);
70}
71
72sub _multipart_alternative_difference {
73 my ($self, $pms) = @_;
74 $pms->{madiff} = 0;
75 $pms->{madiff_html} = 0;
76 $pms->{madiff_text} = 0;
77
78 my $msg = $pms->{msg};
79
80 # Find all multipart/alternative parts in the message
81 my @ma = $msg->find_parts(qr@^multipart/alternative\b@i);
82
83 # If there are no multipart/alternative sections, skip this test.
84 return if (!@ma);
85
86 # Figure out what the MIME content of the message looks like
87 my @content = $msg->content_summary();
88
89 # Exchange meeting requests come in as m/a text/html text/calendar,
90 # which we want to ignore because of the high FP rate it would cause.
91 #
92 if (@content == 3 && $content[2] eq 'text/calendar' &&
93 $content[1] eq 'text/html' &&
94 $content[0] eq 'multipart/alternative') {
95 return;
96 }
97
98 # Go through each of the multipart parts
99 foreach my $part (@ma) {
100 my %html;
101 my %text;
102
103 # limit our search to text-based parts
104 my @txt = $part->find_parts(qr@^text\b@i);
105 foreach my $text (@txt) {
106 # we only care about the rendered version of the part
107 my ($type, $rnd) = $text->rendered();
108 next unless defined $type;
109
110 # parse the rendered text into tokens. assume they are whitespace
111 # separated, and ignore anything that doesn't have a word-character
112 # in it (0-9a-zA-Z_) since those are probably things like bullet
113 # points, horizontal lines, etc. this assumes that punctuation
114 # in one part will be the same in other parts.
115 #
116 if ($type eq 'text/html') {
117 foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
118 #dbg("eval: HTML: $w");
119 $html{$w}++;
120 }
121
122 # If there are no words, mark if there's at least 1 image ...
123 if (!%html && exists $pms->{html}{inside}{img}) {
124 # Use "\n" as the mark since it can't ever occur normally
125 $html{"\n"}=1;
126 }
127 }
128 else {
129 foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
130 #dbg("eval: TEXT: $w");
131 $text{$w}++;
132 }
133 }
134 }
135
136 # How many HTML tokens do we have at the start?
137 my $orig = keys %html;
138 next if ($orig == 0);
139
140 $pms->{madiff_html} = $orig;
141 $pms->{madiff_text} = keys %text;
142 dbg('eval: text words: ' . $pms->{madiff_text} . ', html words: ' . $pms->{madiff_html});
143
144 # If the token appears at least as many times in the text part as
145 # in the html part, remove it from the list of html tokens.
146 while(my ($k,$v) = each %text) {
147 delete $html{$k} if (exists $html{$k} && $html{$k}-$text{$k} < 1);
148 }
149
150 #map { dbg("eval: LEFT: $_") } keys %html;
151
152 # In theory, the tokens should be the same in both text and html
153 # parts, so there would be 0 tokens left in the html token list, for
154 # a 0% difference rate. Calculate it here, and record the difference
155 # if it's been the highest so far in this message.
156 my $diff = scalar(keys %html)/$orig*100;
157 $pms->{madiff} = $diff if ($diff > $pms->{madiff});
158
159 dbg("eval: " . sprintf "madiff: left: %d, orig: %d, max-difference: %0.2f%%", scalar(keys %html), $orig, $pms->{madiff});
160 }
161
162 return;
163}
164
165sub check_blank_line_ratio {
166 my ($self, $pms, $fulltext, $min, $max, $minlines) = @_;
167
168 if (!defined $minlines || $minlines < 1) {
169 $minlines = 1;
170 }
171
172 my $blank_line_ratio_ref = $pms->{blank_line_ratio};
173
174 if (! exists $blank_line_ratio_ref->{$minlines}) {
175 $fulltext = $pms->get_decoded_body_text_array();
176
177 my $blank = 0;
178 my $nlines = 0;
179 foreach my $chunk (@$fulltext) {
180 foreach (split(/^/m, $chunk, -1)) {
181 $nlines++;
182 $blank++ if !/\S/;
183 }
184 }
185
186 # report -1 if it's a blank message ...
187 $blank_line_ratio_ref->{$minlines} =
188 $nlines < $minlines ? -1 : 100 * $blank / $nlines;
189 }
190
191 return (($min == 0 && $blank_line_ratio_ref->{$minlines} <= $max) ||
192 ($blank_line_ratio_ref->{$minlines} > $min &&
193 $blank_line_ratio_ref->{$minlines} <= $max));
194}
195
196sub tvd_vertical_words {
197 my ($self, $pms, $text, $min, $max) = @_;
198
199 # klugy
200 $max = 101 if ($max >= 100);
201
202 if (!defined $pms->{tvd_vertical_words}) {
203 $pms->{tvd_vertical_words} = -1;
204
205 foreach (@{$text}) {
206 my $l = length $_;
207 next unless ($l > 5);
208 my $spaces = tr/ / /;
209 my $nonspaces = $l - $spaces;
210 my $pct;
211 if ($spaces > $nonspaces || $nonspaces == 0) {
212 $pct = 100;
213 }
214 else {
215 $pct = int(100*$spaces/$nonspaces);
216 }
217 $pms->{tvd_vertical_words} = $pct if ($pct > $pms->{tvd_vertical_words});
218 }
219 }
220
221 dbg("eval: tvd_vertical_words value: $pms->{tvd_vertical_words} / min: $min / max: $max - value must be >= min and < max");
222 return 1 if ($pms->{tvd_vertical_words} >= $min && $pms->{tvd_vertical_words} < $max);
223}
224
225sub check_stock_info {
226 my ($self, $pms, $fulltext, $min) = @_;
227
228 $self->_check_stock_info($pms) unless (exists $pms->{stock_info});
229
230 if ($min == 0 || $pms->{stock_info} >= $min) {
231 return 1;
232 }
233 return 0;
234}
235
236sub _check_stock_info {
237 my ($self, $pms) = @_;
238 $pms->{stock_info} = 0;
239
240 # Find all multipart/alternative parts in the message
241 my @parts = $pms->{msg}->find_parts(qr@^text/plain$@i);
242 return if (!@parts);
243
244 # Go through each of the multipart parts
245 my %hits;
246 my $part = $parts[0];
247 my ($type, $rnd) = $part->rendered();
248 return unless $type;
249
250 # bug 5644,5717: avoid pathological cases where a regexp takes massive amount
251 # of time by applying the regexp to limited-size text chunks, one at a time
252
253 foreach my $rnd_chunk (
254 Mail::SpamAssassin::Message::split_into_array_of_short_paragraphs($rnd))
255 {
256 foreach ( $rnd_chunk =~ /^\s*([^:\s][^:\n]{2,29})\s*:\s*\S/mg ) {
257 my $str = lc $_;
258 $str =~ tr/a-z//cd;
259 #$str =~ s/([a-z])0([a-z])/$1o$2/g;
260
261 if ($str =~ /(
262 ^trad(?:e|ing)date|
263 company(?:name)?|
264 s\w?(?:t\w?o\w?c\w?k|y\w?m(?:\w?b\w?o\w?l)?)|
265 t(?:arget|icker)|
266 (?:opening|current)p(?:rice)?|
267 p(?:rojected|osition)|
268 expectations|
269 weeks?high|
270 marketperformance|
271 (?:year|week|month|day|price)(?:target|estimates?)|
272 sector|
273 r(?:ecommendation|ating)
274 )$/x) {
275 $hits{$1}++;
276 dbg("eval: stock info hit: $1");
277 }
278 }
279 }
280
281 $pms->{stock_info} = scalar keys %hits;
282 dbg("eval: stock info total: ".$pms->{stock_info});
283
284 return;
285}
286
287sub check_body_length {
288 my ($self, $pms, undef, $min) = @_;
289
290 my $body_length = $pms->{msg}->{pristine_body_length};
291 dbg("eval: body_length - %s - check for min of %s", $body_length, $min);
292
293 return (defined $body_length && $body_length <= $min) ? 1 : 0;
294}
295
296# ---------------------------------------------------------------------------
297
298# capability checks for "if can()":
299#
3001081µs
# spent 101µs within Mail::SpamAssassin::Plugin::BodyEval::has_check_body_length which was called 10 times, avg 10µs/call: # 10 times (101µs+0s) by Mail::SpamAssassin::Conf::Parser::cond_clause_can_or_has at line 595 of Mail/SpamAssassin/Conf/Parser.pm, avg 10µs/call
sub has_check_body_length { 1 }
301
30218µs1;