← Index
NYTProf Performance Profile   « line view »
For /usr/local/bin/sa-learn
  Run on Sun Nov 5 03:09:29 2017
Reported on Mon Nov 6 13:20:48 2017

Filename/usr/local/lib/perl5/site_perl/mach/5.24/Razor2/Preproc/deHTML.pm
StatementsExecuted 1 statements in 6µs
Subroutines
Calls P F Exclusive
Time
Inclusive
Time
Subroutine
0000s0sRazor2::Preproc::deHTML::::doitRazor2::Preproc::deHTML::doit
0000s0sRazor2::Preproc::deHTML::::html_xlatRazor2::Preproc::deHTML::html_xlat
0000s0sRazor2::Preproc::deHTML::::html_xlat_oldRazor2::Preproc::deHTML::html_xlat_old
0000s0sRazor2::Preproc::deHTML::::isitRazor2::Preproc::deHTML::isit
0000s0sRazor2::Preproc::deHTML::::newRazor2::Preproc::deHTML::new
Call graph for these subroutines as a Graphviz dot language file.
Line State
ments
Time
on line
Calls Time
in subs
Code
1package Razor2::Preproc::deHTML;
2
3
4sub new {
5
6 my $class = shift;
7
8 my %html_tags = (
9 "lt" => '<', "gt" => '>', "amp" => '&',
10 "quot" => '"', "nbsp" => ' ', "iexcl" => chr(161),
11 "cent" => chr(162), "pound" => chr(163), "curren" => chr(164),
12 "yen" => chr(165), "brvbar" => chr(166), "sect" => chr(167),
13 "uml" => chr(168), "copy" => chr(169), "ordf" => chr(170),
14 "laquo" => chr(171), "not" => chr(172), "shy" => chr(173),
15 "reg" => chr(174), "macr" => chr(175), "deg" => chr(176),
16 "plusmn" => chr(177), "sup2" => chr(178), "sup3" => chr(179),
17 "acute" => chr(180), "micro" => chr(181), "para" => chr(182),
18 "middot" => chr(183), "cedil" => chr(184), "sup1" => chr(185),
19 "ordm" => chr(186), "raquo" => chr(187), "frac14" => chr(188),
20 "frac12" => chr(189), "frac34" => chr(190), "iquest" => chr(191),
21 "Agrave" => chr(192), "Aacute" => chr(193), "Acirc" => chr(194),
22 "Atilde" => chr(195), "Auml" => chr(196), "Aring" => chr(197),
23 "AElig" => chr(198), "Ccedil" => chr(199), "Egrave" => chr(200),
24 "Eacute" => chr(201), "Ecirc" => chr(202), "Euml" => chr(203),
25 "Igrave" => chr(204), "Iacute" => chr(205), "Icirc" => chr(206),
26 "Iuml" => chr(207), "ETH" => chr(208), "Ntilde" => chr(209),
27 "Ograve" => chr(210), "Oacute" => chr(211), "Ocirc" => chr(212),
28 "Otilde" => chr(213), "Ouml" => chr(214), "times" => chr(215),
29 "Oslash" => chr(216), "Ugrave" => chr(217), "Uacute" => chr(218),
30 "Ucirc" => chr(219), "Uuml" => chr(220), "Yacute" => chr(221),
31 "THORN" => chr(222), "szlig" => chr(223), "agrave" => chr(224),
32 "aacute" => chr(225), "acirc" => chr(226), "atilde" => chr(227),
33 "auml" => chr(228), "aring" => chr(229), "aelig" => chr(230),
34 "ccedil" => chr(231), "egrave" => chr(232), "eacute" => chr(233),
35 "ecirc" => chr(234), "euml" => chr(235), "igrave" => chr(236),
36 "iacute" => chr(237), "icirc" => chr(238), "iuml" => chr(239),
37 "eth" => chr(240), "ntilde" => chr(241), "ograve" => chr(242),
38 "oacute" => chr(243), "ocirc" => chr(244), "otilde" => chr(245),
39 "ouml" => chr(246), "divide" => chr(247), "oslash" => chr(248),
40 "ugrave" => chr(249), "uacute" => chr(250), "ucirc" => chr(251),
41 "uuml" => chr(252), "yacute" => chr(253), "thorn" => chr(254),
42 "yuml" => chr(255)
43 );
44
45 return bless {
46 html_tags => \%html_tags,
47 }, $class;
48
49}
50
51
52sub isit {
53
54 my ($self, $text) = @_;
55 my $isit = 0;
56 my ($hdr, $body) = split /\n\r*\n/, $$text, 2;
57
58 return 0 unless $body;
59
60 $isit = $body =~ /(?:<HTML>|<BODY|<FONT|<A HREF)/ism;
61 return $isit if $isit;
62
63 $isit = $hdr =~ m"^Content-Type: text/html"ism;
64 return $isit;
65
66}
67
68# NOTE: Tag designations _are_ case sensitive.
69# Returns: (length of tag detected, char)
70# So the caller can basically do ``$i += $length_detected'', above.
71#
72# The big difference between this function and its C equivalent is
73# that we're not modifying the char array that was passed in, but
74# rather instead advising the caller on how much to skip/eat.
75
76sub html_xlat_old {
77
78 my($self, $chars, $i) = @_;
79 my($tag, $val);
80 my($r_val);
81 my $r_tag = "";
82
83 return 0 if ($$chars[$i] !~ /[a-zA-Z]/);
84
85 # first figure out which is shorter, $i->EOS, or $i+10, and use
86 # that to build a compare string for use with substr. Otherwise,
87 # requesting ``lengths'' from char arrays that exceeds the actual
88 # array produce additional 'undef's. 10 is an arbitrary #, but at
89 # least greater than the max length (+ 1) of any html &tag.
90
91 my($offset) = (scalar @{$chars} - $i > 10 ? 10 : scalar @{$chars} );
92 my($s) = join ('', @{$chars}[$i .. $i + $offset]);
93
94 while ( ($tag, $val) = each %{$self->{html_tags}} ) {
95
96 if (substr($s, 0, length($tag)) eq $tag) {
97 ($r_tag, $r_val) = ($tag => $val);
98 $r_tag .= ';' # so the length($r_tag) consumes the ``;''
99 if (substr($s, length($tag), 1) eq ';');
100 }
101 }
102
103 return (length($r_tag), $r_val);
104
105}
106
107sub html_xlat {
108 my($self, $chars, $i) = @_;
109
110 #print "html_xlat($r_tag) start\n";
111 return 0 if ($$chars[$i] !~ /[a-zA-Z]/);
112
113 my $r_tag;
114 # we used to walk till we got a ';', but to be compatible
115 # with c, we won't check for ';'
116 while ($$chars[$i] && $$chars[$i] =~ /[a-zA-Z]/) {
117 $r_tag .= $$chars[$i++];
118 }
119 my $len = length($r_tag); # do not include ;
120 $len++ if ($$chars[$i] eq ';');
121
122 my $val = $self->{html_tags}->{$r_tag};
123 #print "html_xlat($r_tag) = ($len,$val)\n";
124
125 return 0 unless $val; # not found
126 return ( $len, $val );
127}
128
129sub doit {
130
131 my ($self, $text) = @_;
132 my ($hdr, $body) = split /\n\r*\n/, $$text, 2;
133
134 my(@chars) = split //, $body;
135 my($len) = scalar @chars;
136
137 my($last, $quote, $sgml, $tag) = ("", "", "", "");
138 my(@out);
139 my($i) = 0;
140
141 while ($i < $len) {
142 my($c) = $chars[$i++];
143
144 if ($c eq $quote) {
145
146 if ($c eq '-' && $last ne '-') {
147 $last = $c;
148 next;
149 } else {
150 $last = 0;
151 }
152
153 $quote = "";
154
155 } elsif (!$quote) {
156
157 if ($c eq '<') {
158
159 $tag = 1;
160 if ($chars[$i++] eq '!') {
161 my($s) = join('', @chars[$i .. $i + 10]);
162 $sgml = 1;
163 }
164
165 } elsif ($c eq '>') {
166
167 if ($tag) {
168
169 $sgml = 0;
170 $tag = 0;
171 }
172
173 } elsif ($c eq '-') {
174
175 if ($sgml and $last eq '-') {
176
177 $quote = '-';
178 } else {
179 push @out, $c if (! $tag);
180 }
181
182 } elsif ($c eq "\"" or $c eq "\'") {
183
184 if ($tag) {
185 $quote = $c ;
186 } else {
187 push @out, $c if (! $tag);
188 }
189
190 } elsif ($c eq "&") {
191
192 my($len, $char) = $self->html_xlat(\@chars, $i);
193 if ($len) {
194 push @out, $char;
195 $i += $len;
196 } else {
197 push @out, $c;
198 }
199
200 } else {
201
202 push @out, $c if (! $tag);
203
204 }
205
206 }
207
208 $last = $c;
209 }
210
211 $$text = "$hdr\n\n". join('', @out);
212
213}
214
215
21616µs1;
217