Filename | /usr/local/lib/perl5/site_perl/Mail/SpamAssassin/Plugin/BodyEval.pm |
Statements | Executed 40 statements in 3.94ms |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
1 | 1 | 1 | 127µs | 339µs | new | Mail::SpamAssassin::Plugin::BodyEval::
10 | 1 | 1 | 96µs | 96µs | has_check_body_length | Mail::SpamAssassin::Plugin::BodyEval::
1 | 1 | 1 | 42µs | 129µs | BEGIN@27 | Mail::SpamAssassin::Plugin::BodyEval::
1 | 1 | 1 | 42µs | 42µs | BEGIN@20 | Mail::SpamAssassin::Plugin::BodyEval::
1 | 1 | 1 | 42µs | 274µs | BEGIN@21 | Mail::SpamAssassin::Plugin::BodyEval::
1 | 1 | 1 | 36µs | 106µs | BEGIN@29 | Mail::SpamAssassin::Plugin::BodyEval::
1 | 1 | 1 | 31µs | 40µs | BEGIN@24 | Mail::SpamAssassin::Plugin::BodyEval::
1 | 1 | 1 | 30µs | 854µs | BEGIN@22 | Mail::SpamAssassin::Plugin::BodyEval::
1 | 1 | 1 | 28µs | 33µs | BEGIN@26 | Mail::SpamAssassin::Plugin::BodyEval::
1 | 1 | 1 | 28µs | 60µs | BEGIN@25 | Mail::SpamAssassin::Plugin::BodyEval::
0 | 0 | 0 | 0s | 0s | _check_stock_info | Mail::SpamAssassin::Plugin::BodyEval::
0 | 0 | 0 | 0s | 0s | _multipart_alternative_difference | Mail::SpamAssassin::Plugin::BodyEval::
0 | 0 | 0 | 0s | 0s | check_blank_line_ratio | Mail::SpamAssassin::Plugin::BodyEval::
0 | 0 | 0 | 0s | 0s | check_body_length | Mail::SpamAssassin::Plugin::BodyEval::
0 | 0 | 0 | 0s | 0s | check_stock_info | Mail::SpamAssassin::Plugin::BodyEval::
0 | 0 | 0 | 0s | 0s | multipart_alternative_difference | Mail::SpamAssassin::Plugin::BodyEval::
0 | 0 | 0 | 0s | 0s | multipart_alternative_difference_count | Mail::SpamAssassin::Plugin::BodyEval::
0 | 0 | 0 | 0s | 0s | tvd_vertical_words | Mail::SpamAssassin::Plugin::BodyEval::
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | # <@LICENSE> | ||||
2 | # Licensed to the Apache Software Foundation (ASF) under one or more | ||||
3 | # contributor license agreements. See the NOTICE file distributed with | ||||
4 | # this work for additional information regarding copyright ownership. | ||||
5 | # The ASF licenses this file to you under the Apache License, Version 2.0 | ||||
6 | # (the "License"); you may not use this file except in compliance with | ||||
7 | # the License. You may obtain a copy of the License at: | ||||
8 | # | ||||
9 | # http://www.apache.org/licenses/LICENSE-2.0 | ||||
10 | # | ||||
11 | # Unless required by applicable law or agreed to in writing, software | ||||
12 | # distributed under the License is distributed on an "AS IS" BASIS, | ||||
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
14 | # See the License for the specific language governing permissions and | ||||
15 | # limitations under the License. | ||||
16 | # </@LICENSE> | ||||
17 | |||||
18 | package Mail::SpamAssassin::Plugin::BodyEval; | ||||
19 | |||||
20 | 2 | 70µs | 1 | 42µs | # spent 42µs within Mail::SpamAssassin::Plugin::BodyEval::BEGIN@20 which was called:
# once (42µs+0s) by Mail::SpamAssassin::PluginHandler::load_plugin at line 20 # spent 42µs making 1 call to Mail::SpamAssassin::Plugin::BodyEval::BEGIN@20 |
21 | 2 | 96µs | 2 | 506µs | # spent 274µs (42+232) within Mail::SpamAssassin::Plugin::BodyEval::BEGIN@21 which was called:
# once (42µs+232µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 21 # spent 274µs making 1 call to Mail::SpamAssassin::Plugin::BodyEval::BEGIN@21
# spent 232µs making 1 call to Exporter::import |
22 | 2 | 80µs | 2 | 1.68ms | # spent 854µs (30+824) within Mail::SpamAssassin::Plugin::BodyEval::BEGIN@22 which was called:
# once (30µs+824µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 22 # spent 854µs making 1 call to Mail::SpamAssassin::Plugin::BodyEval::BEGIN@22
# spent 824µs making 1 call to Exporter::import |
23 | |||||
24 | 2 | 75µs | 2 | 48µs | # spent 40µs (31+8) within Mail::SpamAssassin::Plugin::BodyEval::BEGIN@24 which was called:
# once (31µs+8µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 24 # spent 40µs making 1 call to Mail::SpamAssassin::Plugin::BodyEval::BEGIN@24
# spent 8µs making 1 call to strict::import |
25 | 2 | 68µs | 2 | 93µs | # spent 60µs (28+33) within Mail::SpamAssassin::Plugin::BodyEval::BEGIN@25 which was called:
# once (28µs+33µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 25 # spent 60µs making 1 call to Mail::SpamAssassin::Plugin::BodyEval::BEGIN@25
# spent 33µs making 1 call to warnings::import |
26 | 2 | 82µs | 2 | 38µs | # spent 33µs (28+5) within Mail::SpamAssassin::Plugin::BodyEval::BEGIN@26 which was called:
# once (28µs+5µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 26 # spent 33µs making 1 call to Mail::SpamAssassin::Plugin::BodyEval::BEGIN@26
# spent 5µs making 1 call to bytes::import |
27 | 2 | 92µs | 2 | 215µs | # spent 129µs (42+86) within Mail::SpamAssassin::Plugin::BodyEval::BEGIN@27 which was called:
# once (42µs+86µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 27 # spent 129µs making 1 call to Mail::SpamAssassin::Plugin::BodyEval::BEGIN@27
# spent 86µs making 1 call to re::import |
28 | |||||
29 | 2 | 3.18ms | 2 | 176µs | # spent 106µs (36+70) within Mail::SpamAssassin::Plugin::BodyEval::BEGIN@29 which was called:
# once (36µs+70µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 29 # spent 106µs making 1 call to Mail::SpamAssassin::Plugin::BodyEval::BEGIN@29
# spent 70µs making 1 call to vars::import |
30 | 1 | 26µs | @ISA = qw(Mail::SpamAssassin::Plugin); | ||
31 | |||||
32 | # constructor: register the eval rule | ||||
33 | # spent 339µs (127+212) within Mail::SpamAssassin::Plugin::BodyEval::new which was called:
# once (127µs+212µs) by Mail::SpamAssassin::PluginHandler::load_plugin at line 1 of (eval 91)[Mail/SpamAssassin/PluginHandler.pm:129] | ||||
34 | 1 | 2µs | my $class = shift; | ||
35 | 1 | 2µs | my $mailsaobject = shift; | ||
36 | |||||
37 | # some boilerplate... | ||||
38 | 1 | 2µs | $class = ref($class) || $class; | ||
39 | 1 | 16µs | 1 | 26µs | my $self = $class->SUPER::new($mailsaobject); # spent 26µs making 1 call to Mail::SpamAssassin::Plugin::new |
40 | 1 | 2µs | bless ($self, $class); | ||
41 | |||||
42 | # the important bit! | ||||
43 | 1 | 22µs | 1 | 33µs | $self->register_eval_rule("multipart_alternative_difference"); # spent 33µs making 1 call to Mail::SpamAssassin::Plugin::register_eval_rule |
44 | 1 | 6µs | 1 | 30µs | $self->register_eval_rule("multipart_alternative_difference_count"); # spent 30µs making 1 call to Mail::SpamAssassin::Plugin::register_eval_rule |
45 | 1 | 6µs | 1 | 29µs | $self->register_eval_rule("check_blank_line_ratio"); # spent 29µs making 1 call to Mail::SpamAssassin::Plugin::register_eval_rule |
46 | 1 | 6µs | 1 | 35µs | $self->register_eval_rule("tvd_vertical_words"); # spent 35µs making 1 call to Mail::SpamAssassin::Plugin::register_eval_rule |
47 | 1 | 6µs | 1 | 28µs | $self->register_eval_rule("check_stock_info"); # spent 28µs making 1 call to Mail::SpamAssassin::Plugin::register_eval_rule |
48 | 1 | 6µs | 1 | 31µs | $self->register_eval_rule("check_body_length"); # spent 31µs making 1 call to Mail::SpamAssassin::Plugin::register_eval_rule |
49 | |||||
50 | 1 | 17µs | return $self; | ||
51 | } | ||||
52 | |||||
53 | sub multipart_alternative_difference { | ||||
54 | my ($self, $pms, $fulltext, $min, $max) = @_; | ||||
55 | |||||
56 | $self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff}); | ||||
57 | |||||
58 | if (($min == 0 || $pms->{madiff} > $min) && | ||||
59 | ($max eq "undef" || $pms->{madiff} <= $max)) { | ||||
60 | return 1; | ||||
61 | } | ||||
62 | return 0; | ||||
63 | } | ||||
64 | |||||
65 | sub multipart_alternative_difference_count { | ||||
66 | my ($self, $pms, $fulltext, $ratio, $minhtml) = @_; | ||||
67 | $self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff}); | ||||
68 | return 0 unless $pms->{madiff_html} > $minhtml; | ||||
69 | return(($pms->{madiff_text} / $pms->{madiff_html}) > $ratio); | ||||
70 | } | ||||
71 | |||||
72 | sub _multipart_alternative_difference { | ||||
73 | my ($self, $pms) = @_; | ||||
74 | $pms->{madiff} = 0; | ||||
75 | $pms->{madiff_html} = 0; | ||||
76 | $pms->{madiff_text} = 0; | ||||
77 | |||||
78 | my $msg = $pms->{msg}; | ||||
79 | |||||
80 | # Find all multipart/alternative parts in the message | ||||
81 | my @ma = $msg->find_parts(qr@^multipart/alternative\b@i); | ||||
82 | |||||
83 | # If there are no multipart/alternative sections, skip this test. | ||||
84 | return if (!@ma); | ||||
85 | |||||
86 | # Figure out what the MIME content of the message looks like | ||||
87 | my @content = $msg->content_summary(); | ||||
88 | |||||
89 | # Exchange meeting requests come in as m/a text/html text/calendar, | ||||
90 | # which we want to ignore because of the high FP rate it would cause. | ||||
91 | # | ||||
92 | if (@content == 3 && $content[2] eq 'text/calendar' && | ||||
93 | $content[1] eq 'text/html' && | ||||
94 | $content[0] eq 'multipart/alternative') { | ||||
95 | return; | ||||
96 | } | ||||
97 | |||||
98 | # Go through each of the multipart parts | ||||
99 | foreach my $part (@ma) { | ||||
100 | my %html; | ||||
101 | my %text; | ||||
102 | |||||
103 | # limit our search to text-based parts | ||||
104 | my @txt = $part->find_parts(qr@^text\b@i); | ||||
105 | foreach my $text (@txt) { | ||||
106 | # we only care about the rendered version of the part | ||||
107 | my ($type, $rnd) = $text->rendered(); | ||||
108 | next unless defined $type; | ||||
109 | |||||
110 | # parse the rendered text into tokens. assume they are whitespace | ||||
111 | # separated, and ignore anything that doesn't have a word-character | ||||
112 | # in it (0-9a-zA-Z_) since those are probably things like bullet | ||||
113 | # points, horizontal lines, etc. this assumes that punctuation | ||||
114 | # in one part will be the same in other parts. | ||||
115 | # | ||||
116 | if ($type eq 'text/html') { | ||||
117 | foreach my $w (grep(/\w/, split(/\s+/, $rnd))) { | ||||
118 | #dbg("eval: HTML: $w"); | ||||
119 | $html{$w}++; | ||||
120 | } | ||||
121 | |||||
122 | # If there are no words, mark if there's at least 1 image ... | ||||
123 | if (!%html && exists $pms->{html}{inside}{img}) { | ||||
124 | # Use "\n" as the mark since it can't ever occur normally | ||||
125 | $html{"\n"}=1; | ||||
126 | } | ||||
127 | } | ||||
128 | else { | ||||
129 | foreach my $w (grep(/\w/, split(/\s+/, $rnd))) { | ||||
130 | #dbg("eval: TEXT: $w"); | ||||
131 | $text{$w}++; | ||||
132 | } | ||||
133 | } | ||||
134 | } | ||||
135 | |||||
136 | # How many HTML tokens do we have at the start? | ||||
137 | my $orig = keys %html; | ||||
138 | next if ($orig == 0); | ||||
139 | |||||
140 | $pms->{madiff_html} = $orig; | ||||
141 | $pms->{madiff_text} = keys %text; | ||||
142 | dbg('eval: text words: ' . $pms->{madiff_text} . ', html words: ' . $pms->{madiff_html}); | ||||
143 | |||||
144 | # If the token appears at least as many times in the text part as | ||||
145 | # in the html part, remove it from the list of html tokens. | ||||
146 | while(my ($k,$v) = each %text) { | ||||
147 | delete $html{$k} if (exists $html{$k} && $html{$k}-$text{$k} < 1); | ||||
148 | } | ||||
149 | |||||
150 | #map { dbg("eval: LEFT: $_") } keys %html; | ||||
151 | |||||
152 | # In theory, the tokens should be the same in both text and html | ||||
153 | # parts, so there would be 0 tokens left in the html token list, for | ||||
154 | # a 0% difference rate. Calculate it here, and record the difference | ||||
155 | # if it's been the highest so far in this message. | ||||
156 | my $diff = scalar(keys %html)/$orig*100; | ||||
157 | $pms->{madiff} = $diff if ($diff > $pms->{madiff}); | ||||
158 | |||||
159 | dbg("eval: " . sprintf "madiff: left: %d, orig: %d, max-difference: %0.2f%%", scalar(keys %html), $orig, $pms->{madiff}); | ||||
160 | } | ||||
161 | |||||
162 | return; | ||||
163 | } | ||||
164 | |||||
165 | sub check_blank_line_ratio { | ||||
166 | my ($self, $pms, $fulltext, $min, $max, $minlines) = @_; | ||||
167 | |||||
168 | if (!defined $minlines || $minlines < 1) { | ||||
169 | $minlines = 1; | ||||
170 | } | ||||
171 | |||||
172 | my $blank_line_ratio_ref = $pms->{blank_line_ratio}; | ||||
173 | |||||
174 | if (! exists $blank_line_ratio_ref->{$minlines}) { | ||||
175 | $fulltext = $pms->get_decoded_body_text_array(); | ||||
176 | |||||
177 | my $blank = 0; | ||||
178 | my $nlines = 0; | ||||
179 | foreach my $chunk (@$fulltext) { | ||||
180 | foreach (split(/^/m, $chunk, -1)) { | ||||
181 | $nlines++; | ||||
182 | $blank++ if !/\S/; | ||||
183 | } | ||||
184 | } | ||||
185 | |||||
186 | # report -1 if it's a blank message ... | ||||
187 | $blank_line_ratio_ref->{$minlines} = | ||||
188 | $nlines < $minlines ? -1 : 100 * $blank / $nlines; | ||||
189 | } | ||||
190 | |||||
191 | return (($min == 0 && $blank_line_ratio_ref->{$minlines} <= $max) || | ||||
192 | ($blank_line_ratio_ref->{$minlines} > $min && | ||||
193 | $blank_line_ratio_ref->{$minlines} <= $max)); | ||||
194 | } | ||||
195 | |||||
196 | sub tvd_vertical_words { | ||||
197 | my ($self, $pms, $text, $min, $max) = @_; | ||||
198 | |||||
199 | # klugy | ||||
200 | $max = 101 if ($max >= 100); | ||||
201 | |||||
202 | if (!defined $pms->{tvd_vertical_words}) { | ||||
203 | $pms->{tvd_vertical_words} = -1; | ||||
204 | |||||
205 | foreach (@{$text}) { | ||||
206 | my $l = length $_; | ||||
207 | next unless ($l > 5); | ||||
208 | my $spaces = tr/ / /; | ||||
209 | my $nonspaces = $l - $spaces; | ||||
210 | my $pct; | ||||
211 | if ($spaces > $nonspaces || $nonspaces == 0) { | ||||
212 | $pct = 100; | ||||
213 | } | ||||
214 | else { | ||||
215 | $pct = int(100*$spaces/$nonspaces); | ||||
216 | } | ||||
217 | $pms->{tvd_vertical_words} = $pct if ($pct > $pms->{tvd_vertical_words}); | ||||
218 | } | ||||
219 | } | ||||
220 | |||||
221 | dbg("eval: tvd_vertical_words value: $pms->{tvd_vertical_words} / min: $min / max: $max - value must be >= min and < max"); | ||||
222 | return 1 if ($pms->{tvd_vertical_words} >= $min && $pms->{tvd_vertical_words} < $max); | ||||
223 | } | ||||
224 | |||||
225 | sub check_stock_info { | ||||
226 | my ($self, $pms, $fulltext, $min) = @_; | ||||
227 | |||||
228 | $self->_check_stock_info($pms) unless (exists $pms->{stock_info}); | ||||
229 | |||||
230 | if ($min == 0 || $pms->{stock_info} >= $min) { | ||||
231 | return 1; | ||||
232 | } | ||||
233 | return 0; | ||||
234 | } | ||||
235 | |||||
236 | sub _check_stock_info { | ||||
237 | my ($self, $pms) = @_; | ||||
238 | $pms->{stock_info} = 0; | ||||
239 | |||||
240 | # Find all multipart/alternative parts in the message | ||||
241 | my @parts = $pms->{msg}->find_parts(qr@^text/plain$@i); | ||||
242 | return if (!@parts); | ||||
243 | |||||
244 | # Go through each of the multipart parts | ||||
245 | my %hits; | ||||
246 | my $part = $parts[0]; | ||||
247 | my ($type, $rnd) = $part->rendered(); | ||||
248 | return unless $type; | ||||
249 | |||||
250 | # bug 5644,5717: avoid pathological cases where a regexp takes massive amount | ||||
251 | # of time by applying the regexp to limited-size text chunks, one at a time | ||||
252 | |||||
253 | foreach my $rnd_chunk ( | ||||
254 | Mail::SpamAssassin::Message::split_into_array_of_short_paragraphs($rnd)) | ||||
255 | { | ||||
256 | foreach ( $rnd_chunk =~ /^\s*([^:\s][^:\n]{2,29})\s*:\s*\S/mg ) { | ||||
257 | my $str = lc $_; | ||||
258 | $str =~ tr/a-z//cd; | ||||
259 | #$str =~ s/([a-z])0([a-z])/$1o$2/g; | ||||
260 | |||||
261 | if ($str =~ /( | ||||
262 | ^trad(?:e|ing)date| | ||||
263 | company(?:name)?| | ||||
264 | s\w?(?:t\w?o\w?c\w?k|y\w?m(?:\w?b\w?o\w?l)?)| | ||||
265 | t(?:arget|icker)| | ||||
266 | (?:opening|current)p(?:rice)?| | ||||
267 | p(?:rojected|osition)| | ||||
268 | expectations| | ||||
269 | weeks?high| | ||||
270 | marketperformance| | ||||
271 | (?:year|week|month|day|price)(?:target|estimates?)| | ||||
272 | sector| | ||||
273 | r(?:ecommendation|ating) | ||||
274 | )$/x) { | ||||
275 | $hits{$1}++; | ||||
276 | dbg("eval: stock info hit: $1"); | ||||
277 | } | ||||
278 | } | ||||
279 | } | ||||
280 | |||||
281 | $pms->{stock_info} = scalar keys %hits; | ||||
282 | dbg("eval: stock info total: ".$pms->{stock_info}); | ||||
283 | |||||
284 | return; | ||||
285 | } | ||||
286 | |||||
287 | sub check_body_length { | ||||
288 | my ($self, $pms, undef, $min) = @_; | ||||
289 | |||||
290 | my $body_length = $pms->{msg}->{pristine_body_length}; | ||||
291 | dbg("eval: body_length - %s - check for min of %s", $body_length, $min); | ||||
292 | |||||
293 | return (defined $body_length && $body_length <= $min) ? 1 : 0; | ||||
294 | } | ||||
295 | |||||
296 | # --------------------------------------------------------------------------- | ||||
297 | |||||
298 | # capability checks for "if can()": | ||||
299 | # | ||||
300 | 10 | 71µs | # spent 96µs within Mail::SpamAssassin::Plugin::BodyEval::has_check_body_length which was called 10 times, avg 10µs/call:
# 10 times (96µs+0s) by Mail::SpamAssassin::Conf::Parser::cond_clause_can_or_has at line 595 of Mail/SpamAssassin/Conf/Parser.pm, avg 10µs/call | ||
301 | |||||
302 | 1 | 8µs | 1; |