← Index
NYTProf Performance Profile   « line view »
For /usr/local/bin/sa-learn
  Run on Sun Nov 5 03:09:29 2017
Reported on Mon Nov 6 13:20:48 2017

Filename/usr/local/lib/perl5/site_perl/mach/5.24/Razor2/Signature/Whiplash.pm
StatementsExecuted 3 statements in 3.44ms
Subroutines
Calls P F Exclusive
Time
Inclusive
Time
Subroutine
11143µs107µsRazor2::Signature::Whiplash::::BEGIN@10Razor2::Signature::Whiplash::BEGIN@10
0000s0sRazor2::Signature::Whiplash::::canonifyRazor2::Signature::Whiplash::canonify
0000s0sRazor2::Signature::Whiplash::::debugRazor2::Signature::Whiplash::debug
0000s0sRazor2::Signature::Whiplash::::extract_hostsRazor2::Signature::Whiplash::extract_hosts
0000s0sRazor2::Signature::Whiplash::::newRazor2::Signature::Whiplash::new
0000s0sRazor2::Signature::Whiplash::::next_hostRazor2::Signature::Whiplash::next_host
0000s0sRazor2::Signature::Whiplash::::whiplashRazor2::Signature::Whiplash::whiplash
Call graph for these subroutines as a Graphviz dot language file.
Line State
ments
Time
on line
Calls Time
in subs
Code
1#!/usr/local/bin/perl -sw
2##
3## Whiplash
4##
5## Author: Vipul Ved Prakash <mail@vipul.net>.
6## $Id: Whiplash.pm,v 1.7 2007/05/08 22:22:36 rsoderberg Exp $
7
8package Razor2::Signature::Whiplash;
9
1023.43ms2171µs
# spent 107µs (43+64) within Razor2::Signature::Whiplash::BEGIN@10 which was called: # once (43µs+64µs) by Razor2::Engine::VR8::BEGIN@2 at line 10
use Digest::SHA1;
# spent 107µs making 1 call to Razor2::Signature::Whiplash::BEGIN@10 # spent 64µs making 1 call to Exporter::import
11
12sub new {
13
14 my ($class, %args) = @_;
15 my %self = (
16 uri_terminators => "/><\"",
17 length_error => 100,
18 al_terminators => " /><\"\r\n",
19 );
20
21 my @DPL = qw(
22
23 .com
24 .net
25 .org
26 .info
27 .biz
28 .edu
29
30 .gov.ar
31 .int.ar
32 .net.ar
33 .com.ar
34 .mil.ar
35 .ar
36
37 .com.au
38 .org.au
39 .gov.au
40 .org.au
41 .id.au
42 .oz.au
43 .info.au
44 .net.au
45 .asn.au
46 .csiro.au
47 .telememo.au
48 .conf.au
49 edu.au
50 .au
51
52 .com.az
53 .net.az
54 .org.az
55 .az
56
57 .art.br
58 .com.br
59 .esp.br
60 .etc.br
61 .g12.br
62 .gov.br
63 .ind.br
64 .inf.br
65 .mil.br
66 .net.br
67 .org.br
68 .pro.br
69 .psi.br
70 .rec.br
71 .tmp.br
72 .br
73
74 .ab.ca
75 .bc.ca
76 .gc.ca
77 .mb.ca
78 .nf.ca
79 .ns.ca
80 .nt.ca
81 .on.ca
82 .pe.ca
83 .qc.ca
84 .sk.ca
85 .yk.ca
86 .ca
87
88 .ac.cn
89 .com.cn
90 .edu.cn
91 .gov.cn
92 .net.cn
93 .org.cn
94 .bj.cn
95 .sh.cn
96 .tj.cn
97 .cq.cn
98 .he.cn
99 .sx.cn
100 .nm.cn
101 .ln.cn
102 .jl.cn
103 .hl.cn
104 .js.cn
105 .zj.cn
106 .ah.cn
107 .hb.cn
108 .hn.cn
109 .gd.cn
110 .gx.cn
111 .hi.cn
112 .sc.cn
113 .gz.cn
114 .yn.cn
115 .xz.cn
116 .sn.cn
117 .gs.cn
118 .qh.cn
119 .nx.cn
120 .xj.cn
121 .tw.cn
122 .hk.cn
123 .mo.cn
124 .cn
125
126 .arts.co
127 .com.co
128 .edu.co
129 .firm.co
130 .gov.co
131 .info.co
132 .int.co
133 .nom.co
134 .mil.co
135 .org.co
136 .rec.co
137 .store.co
138 .web.co
139 .co
140
141 .ac.cr
142 .co.cr
143 .ed.cr
144 .fi.cr
145 .go.cr
146 .or.cr
147 .sa.cr
148 .cr
149
150 .com.cu
151 .net.cu
152 .org.cu
153 .cu
154
155 .ac.cy
156 .com.cy
157 .gov.cy
158 .net.cy
159 .org.cy
160 .cy
161
162 .cz
163
164 .de
165
166 .com.ec
167 .k12.ec
168 .edu.ec
169 .fin.ec
170 .med.ec
171 .gov.ec
172 .mil.ec
173 .org.ec
174 .net.ec
175 .ec
176
177 .com.eg
178 .edu.eg
179 .eun.eg
180 .gov.eg
181 .net.eg
182 .org.eg
183 .sci.eg
184 .eg
185
186 .ac.fj
187 .com.fj
188 .gov.fj
189 .id.fj
190 .org.fj
191 .school.fj
192 .fj
193
194 .site.voila.fr
195 .fr
196
197 .com.ge
198 .edu.ge
199 .gov.ge
200 .mil.ge
201 .net.ge
202 .org.ge
203 .pvt.ge
204 .ge
205
206 .co.gg
207 .org.gg
208 .sch.gg
209 .ac.gg
210 .gov.gg
211 .ltd.gg
212 .ind.gg
213 .net.gg
214 .alderney.gg
215 .guernsey.gg
216 .sark.gg
217 .gg
218
219 .edu.gu
220 .com.gu
221 .mil.gu
222 .gov.gu
223 .net.gu
224 .org.gu
225 .gu
226
227 .com.hk
228 .edu.hk
229 .gov.hk
230 .idv.hk
231 .net.hk
232 .org.hk
233 .hk
234
235 .co.hu
236 .org.hu
237 .priv.hu
238 .info.hu
239 .tm.hu
240 .nui.hu
241 .hu
242
243 .ac.id
244 .co.id
245 .go.id
246 .mil.id
247 .net.id
248 .or.id
249 .id
250
251 .k12.il
252 .org.il
253 .ac.il
254 .gov.il
255 .muni.il
256 .co.il
257 .net.il
258 .il
259
260 .co.im
261 .lkd.co.im
262 .plc.co.im
263 .net.im
264 .gov.im
265 .org.im
266 .nic.im
267 .ac.im
268 .im
269
270 .ernet.in
271 .nic.in
272 .ac.in
273 .co.in
274 .gov.in
275 .net.in
276 .res.in
277 .in
278
279 .com.jo
280 .gov.jo
281 .edu.jo
282 .net.jo
283 .jo
284
285 .co.jp
286 .ne.jp
287 .or.jp
288 .lg.jp
289 .ne.jp
290 .ad.jp
291 .ac.jp
292 .go.jp
293 .gr.jp
294 .jp
295
296 .ac.kr
297 .co.kr
298 .go.kr
299 .ne.kr
300 .or.kr
301 .re.kr
302 .pe.kr
303 .seoul.kr
304 .kyonggi.kr
305
306 .com.la
307 .net.la
308 .org.la
309 .la
310
311 .com.lb
312 .org.lb
313 .net.lb
314 .gov.lb
315 .mil.lb
316 .lb
317
318 .com.lc
319 .edu.lc
320 .gov.lc
321 .net.lc
322 .org.lc
323 .lc
324
325 .com.lv
326 .edu.lv
327 .gov.lv
328 .org.lv
329 .mil.lv
330 .id.lv
331 .net.lv
332 .asn.lv
333 .conf.lv
334 .lv
335
336 .com.ly
337 .net.ly
338 .org.ly
339 .ly
340
341 .edu.mm
342 .com.mm
343 .gov.mm
344 .net.mm
345 .org.mm
346 .mm
347
348 .com.mo
349 .edu.mo
350 .gov.mo
351 .net.mo
352 .org.mo
353 .mo
354
355 .com.mt
356 .net.mt
357 .org.mt
358 .mt
359
360 .com.mx
361 .net.mx
362 .org.mx
363 .mx
364
365 .com.my
366 .org.my
367 .gov.my
368 .edu.my
369 .net.my
370 .my
371
372 .com.na
373 .org.na
374 .net.na
375 .na
376
377 .com.nc
378 .net.nc
379 .org.nc
380 .nc
381
382 .ne
383
384 .nf
385
386 .ng
387
388 .com.ni
389 .ni
390
391 .com.np
392 .net.np
393 .ort.np
394 .np
395
396 .co.nz
397 .net.nz
398 .govt.nz
399 .ac.nz
400 .nz
401
402 .ac.pa
403 .com.pa
404 .net.pa
405 .org.pa
406 .edu.pa
407 .gob.pa
408 .sld.pa
409 .pa
410
411 .com.pe
412 .net.pe
413 .org.pe
414 .pe
415
416 .com.ph
417 .net.ph
418 .org.ph
419 .mil.ph
420 .ngo.ph
421 .ph
422
423 .com.pl
424 .net.pl
425 .org.pl
426 .pl
427
428 .com.py
429 .net.py
430 .org.py
431 .edu.py
432 .py
433
434 .org.ru
435 .net.ru
436 .pp.ru
437 .com.ru
438 .ru
439
440 .com.sg
441 .net.sg
442 .org.sg
443 .edu.sg
444 .gov.sg
445 .sg
446
447 .com.sh
448 .edu.sh
449 .gov.sh
450 .net.sh
451 .mil.sh
452 .org.sh
453 .sh
454
455 .co.sv
456 .sv
457
458 .com.sy
459 .net.sy
460 .org.sy
461 .sy
462
463 .ac.th
464 .co.th
465 .go.th
466 .net.th
467 .or.th
468 .in.th
469 .th
470
471 .com.tn
472 .ind.tn
473 .tourism.tn
474 .fin.tn
475 .net.tn
476 .gov.tn
477 .nat.tn
478 .org.tn
479 .info.tn
480 .ens.tn
481 .intl.tn
482 .rnrt.tn
483 .rnu.tn
484 .rns.tn
485 .edunet.tn
486 .tn
487
488 .bbs.tr
489 .com.tr
490 .edu.tr
491 .gov.tr
492 .k12.tr
493 .mil.tr
494 .net.tr
495 .org.tr
496 .tr
497
498 .com.tw
499 .net.tw
500 .org.tw
501 .gove.tw
502 .tw
503
504 .com.ua
505 .net.ua
506 .gov.ua
507 .ua
508
509 .ac.ug
510 .co.ug
511 .or.ug
512 .go.ug
513 .ug
514
515 .ac.uk
516 .co.uk
517 .gov.uk
518 .ltd.uk
519 .me.uk
520 .mod.uk
521 .net.uk
522 .nic.uk
523 .nhs.uk
524 .org.uk
525 .plc.uk
526 .police.uk
527 .sch.uk
528 .uk
529
530 .ak.us
531 .al.us
532 .ar.us
533 .az.us
534 .sf.ca.us
535 .ca.us
536 .co.us
537 .ct.us
538 .dc.us
539 .de.us
540 .fed.us
541 .fl.us
542 .ga.us
543 .hi.us
544 .ia.us
545 .id.us
546 .il.us
547 .in.us
548 .isa.us
549 .kids.us
550 .ks.us
551 .ky.us
552 .la.us
553 .ma.us
554 .md.us
555 .me.us
556 .mi.us
557 .mn.us
558 .mo.us
559 .ms.us
560 .mt.us
561 .nc.us
562 .nd.us
563 .ne.us
564 .nh.us
565 .nj.us
566 .nm.us
567 .nsn.us
568 .nv.us
569 .ny.us
570 .oh.us
571 .ok.us
572 .or.us
573 .pa.us
574 .ri.us
575 .sc.us
576 .sd.us
577 .tn.us
578 .tx.us
579 .ut.us
580 .vt.us
581 .va.us
582 .wa.us
583 .wi.us
584 .wv.us
585 .wy.us
586 .us
587
588 .com.uy
589 .edu.uy
590 .net.uy
591 .org.uy
592 .uy
593
594 .com.ve
595 .edu.ve
596 .gov.ve
597 .net.ve
598 .co.ve
599 .bib.ve
600 .tec.ve
601 .int.ve
602 .org.ve
603 .firm.ve
604 .store.ve
605 .web.ve
606 .arts.ve
607 .rec.ve
608 .info.ve
609 .nom.ve
610 .mil.ve
611 .ve
612
613 .co.vi
614 .net.vi
615 .org.vi
616 .vi
617
618 .ac.yu
619 .co.yu
620 .edu.yu
621 .org.yu
622 .yu
623
624 .ws
625
626 .ac.za
627 .alt.za
628 .co.za
629 .edu.za
630 .gov.za
631 .mil.za
632 .net.za
633 .ngo.za
634 .nom.za
635 .org.za
636 .school.za
637 .tm.za
638 .web.za
639 .za
640
641 );
642
643 $self{dpl} = [@DPL];
644 return bless \%self, $class;
645
646}
647
648
649sub whiplash {
650
651 my ($self, $text) = @_;
652
653 # Wrap all the text in case the URL is broken up on multiple lines.
654
655 # $text =~ s/[\r\n]//g;
656
657 return unless $text;
658
659 my @hosts = $self->extract_hosts($text);
660
661 unless (scalar @hosts) {
662
663 # No hostnames were found in the text,
664 # return undef;
665
666 debug("No hosts found in the message.");
667
668 return;
669
670 }
671
672 # We have one or more hosts. Generate one signature for each host.
673
674 my $length = length($text);
675 my $corrected_length = $length - ($length % $$self{length_error});
676
677 my @sigs;
678 my %sig_meta;
679
680 for my $host (@hosts) {
681
682 # Compute a SHA1 of host and corrected length. The corrected length is
683 # the value of length to the nearest multiple of ``length_error''.
684 # Take the first 20 hex chars from SHA1 and call it the signature.
685
686 my $sha1 = Digest::SHA1->new();
687
688 $sha1->add($host);
689 $sig = substr $sha1->hexdigest, 0, 12;
690
691 $sha1->add($corrected_length);
692 $sig .= substr $sha1->hexdigest, 0, 4;
693
694 push @sigs, $sig;
695 $sig_meta{$sig} = [$host, $corrected_length];
696
697 debug("$sig ($host + $corrected_length)");
698
699 }
700
701 return (\@sigs, \%sig_meta);
702
703}
704
705
706sub extract_hosts {
707
708 my ($self, $text) = @_;
709
710 #
711 # Test Vectors:
712 #
713 # 1. http://www.nodg.com@www.geocities.com/nxcisdsfdfdsy/off
714 # 2. http://www.ksleybiuh.com@213.171.60.74/getoff/
715 # 3. <http://links.verotel.com/cgi-bin/showsite.verotel?vercode=12372:9804000000374206>
716 # 4. http://217.12.4.7/rmi/http://definethis.net/526/index.html
717 # 5. http://magalygr8sex.free-host.com/h.html
718 # 6. http://%3CVenkatrs%3E@218.80.74.102/thecard/4index.htm
719 # 7. http://EBCDVKIGURGGCEOKXHINOCANVQOIDOXJWTWGPC@218.80.74.102/thecard/5in
720 # 8. http://g.india2.bag.gs/remove_page.htm
721 # 9. https://220.97.40.149
722 # 10. http://&#109;j&#97;k&#101;d.b&#105;z/u&#110;&#115;&#117;bscr&#105;&#98;e&#46;d&#100;d?leaving
723 # 11. http://g5j99m8@it.rd.yahoo.com/bassi/*http://www.lekobas.com/c/index.php
724 # 12. <a href="http://Chettxuydyhv vwyyrcmgbxzj n as ecq kkurxtrvaug nfsygjjjwhfkpaklh t a qsc exinscfjtxr
725 # jobg @www.mmv9.org?affil=19">look great / feel great</a>
726 # 13. <A
727 # HREF="http://href=www.churchwomen.comhref=www.cairn.nethref=www.teeter.orghr
728 # ef=www.lefty.bizhref=wwwbehold.pitfall@www.mmstong5f.com/host/index.asp?ID=0
729 # 1910?href=www.corrode.comhref=www.ode.nethref=www.clergy.orghref=www.aberrat
730 # e.biz" >
731 # 14. www.pillzthatwork.com # anything that starts with www.
732 #
733
734 # Decode Hex URI encoding (TV #6) (SPEC-REF: UNESCAPE)
735 $text =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
736
737 # Decode Decimal URI encoding (TV #10) (SPEC-REF: UNESCAPE)
738 $text =~ s/\&\#([0-9]{2,3})\;/chr($1)/eg;
739
740 debug("host_tokens(): will attempt to extract host names");
741
742 my @hosts;
743 my @autolinks = $text =~ m|\s+(www.[^$$self{al_terminators}]+)|ig; # Outlook with autolink these URLs
744 push @hosts, @autolinks;
745
746 #
747 # We extract host portions from all HTTP/HTTPS URIs found on the text.
748 # URIs are decoded if they are encoded, usernames (usually random) are
749 # thrown away and all unique hosts are extracted.
750 #
751
752 if ($text =~ m|^.*?href\s*=\s*"?https?://?(.*)$|si) {
753 $text = "a href = http://$1";
754 } elsif ($text =~ m|^.*?https?://?(.*)$|si) {
755 $text = "http://$1";
756 } else {
757 return;
758 }
759
760 while ($host = next_host($text)) {
761
762 last unless $host;
763
764 # Strip to the domain or IP
765
766 my $canonical_domain;
767
768 if ($host =~ /^[\d\.]+$/) {
769
770 # This is an IP address, just use it.
771 $canonical_domain = $host;
772
773 } else {
774
775 # See if it's a non country domain. If so,
776 # we'll extract the hostname. (SPEC-REF: NORMALIZE)
777
778 $canonical_domain = $self->canonify($host);
779
780 }
781
782 # Ensure the hostname is not already in the list and that it is
783 # potentially a routable hostname: length > 1 and contains
784 # atleast one "."
785
786 unless (grep { /^\Q$canonical_domain\E$/ } @hosts) {
787 if ((length($canonical_domain) > 1) and ($canonical_domain =~ /\./)) {
788 push @hosts, $canonical_domain;
789 }
790 }
791
792 last unless $text =~ m"http://(.*)$";
793 $text = $1;
794
795 }
796
797 return @hosts;
798
799}
800
801
802sub next_host {
803
804 ($_) = @_;
805
806 my ($host, $authority);
807
808 # Algorithm:
809 # 1. Find http://
810 # 2. Find [@"></]
811 # 3. If found @, ignore everything before it and look for ["></]
812 # 4. Everything from @ to [">/?] is the host.
813 # 5. If @ was not found, the whole thing is the host
814 #
815
816 my $inside_href = 0;
817 if (/^a href/) {
818 $inside_href = 1;
819 s|^a href\s*=\s*||;
820 }
821
822 # Remove the protocol name
823 s|^http://||i;
824
825 # Find a terminator
826 if (( $inside_href and m|(.*?)[>\"\/\?\<]|s) or
827 (!$inside_href and m|(.*?)[>\"\/\?\<\n\r]|s)) {
828 $_ = $1;
829 }
830
831 # Remove the authority section if the URL has one
832 s/^[^@]*@//si;
833
834 # The host name is everything after the last `='
835 s/\S+=//si;
836 $host = $_;
837
838 # The host part cannot contains whitespace or linefeeds.
839 # Everything including and beyond these characters should be
840 # throw away.
841
842 $host =~ s/[\r\n\s].*$//s;
843
844 # />
845
846 # Lowercase the hostname and remove ``='' chars (which can be part
847 # of the hostname sometimes when deQP didn't work correctly.
848
849 $host = lc($host);
850 $host =~ s/=//g;
851 $host =~ s/\s*$//g;
852
853 # Throw away the TCP port spec
854
855 $host =~ s/:.*$//;
856
857 # Throw away ``.'' at the end
858
859 $host =~ s/\.$//;
860
861 return $host;
862
863}
864
865
866sub canonify {
867
868 my ($self, $host) = @_;
869
870 # Extract canonical domain name. See the section on
871 # Domain Part List in the Whiplash spec for details on
872 # how this works.
873
874 for my $pattern (@{$$self{dpl}}) {
875
876 if ($pattern =~ /^\./) {
877 if ($host =~ /([^\.]+\Q$pattern\E)$/) {
878 return $1;
879 }
880 } else {
881 if ($host =~ /\Q$pattern\E$/) {
882 return $pattern;
883 }
884 }
885
886 }
887
888 return $host;
889
890}
891
892
893sub debug {
894 my $message = shift;
895 # print "debug: $message\n";
896}
897
898
89917µs1;
900