Status: URL rewrite feature has converted $h_init to " . &he($url) . ".
\n"; } } return $url; }

# check_regex($pattern)
#
# Validates a regular expression before it is used elsewhere.
#
#   $pattern - the regular expression source text to check.
#
# Returns '' when the pattern is acceptable.  Otherwise returns a message
# built by pstr():
#   string 50 - the pattern contains the sequence "?{" (the opening of
#               Perl's embedded-code constructs);
#   string 51 - the pattern failed to compile; the message receives the
#               pattern and Perl's error text.
# Values are passed through he() before being placed in the message
# (presumably HTML-escaping; he() is defined elsewhere).
sub check_regex {
    my ($pattern) = @_;

    if ($pattern =~ m!\?\{!) {
        my $err = &pstr(50, &he($pattern));
        return $err;
    }

    # The eval text is single-quoted on purpose: $pattern is interpolated by
    # the regex engine when the eval runs, it is never spliced into the
    # source text, so a "!" inside the pattern cannot end the match operator.
    eval '"foo" =~ m!$pattern!;';
    if ($@) {
        my $err = &pstr(51, &he($pattern, $@));
        undef($@);
        return $err;
    }

    return '';
}

# pstr($index, @values)
#
# Returns the template $::str[$index] with every placeholder "$sN" replaced
# by the Nth entry of @values (N counts from 1).
#
# The template is scanned once, left to right:
#   * "$s1" can no longer eat the front of "$s10", "$s11", ... when ten or
#     more values are supplied (the previous per-index loop replaced the
#     "$s1" prefix first and left a stray digit behind);
#   * placeholder-like text inside an inserted value is not expanded again.
# A placeholder whose number exceeds the number of values is left untouched.
# An undefined value is inserted as the empty string.  An undefined template
# is returned as undef.
sub pstr {
    my ($index, @values) = @_;

    my $text = $::str[$index];
    return $text unless defined($text);

    $text =~ s{\$s([1-9]\d*)}{
        ($1 <= scalar(@values))
            ? (defined($values[$1 - 1]) ? $values[$1 - 1] : '')
            : "\$s$1"
    }ge;

    return $text;
}

# ppstr($index, @values)
#
# Same expansion as pstr(), but prints the result instead of returning it.
sub ppstr {
    print &pstr(@_);
}

# pppstr($index, @values)
#
# Same expansion as pstr(), printed as a separate paragraph.  When
# $::const{'is_cmd'} is true the text is wrapped in newlines; otherwise the
# second form below is used.
#
# NOTE(review): the empty leading string and the line break inside the
# trailing literal are reproduced exactly as found in this file; they look
# like the remains of markup that was lost -- confirm against a clean copy.
sub pppstr {
    my $text = &pstr(@_);

    if ($::const{'is_cmd'}) {
        print "\n$text\n";
    }
    else {
        print "" . $text . "
\n";
    }
}

# CompressStrip($text)
#
# Normalises a string: an undefined argument is treated as ''.  The text is
# padded with spaces, passed through RawTranslate() (defined elsewhere), and
# whitespace runs are collapsed to single spaces.  The Perl code held in
# $::private{'code_strip_ignored_words'} is then evaluated against $_
# (going by the key name it removes ignored words -- the code itself is not
# visible here).  Whitespace is collapsed again, one leading and one
# trailing space are dropped, and the result is returned wrapped in exactly
# one space on each side.
#
# Dies with the eval error if the stored code fails.
sub CompressStrip {
    # $_ is used deliberately: the eval'd code below operates on it.
    local $_ = defined($_[0]) ? $_[0] : '';
    $_ = &RawTranslate(" $_ ");
    s'\s+' 'og;
    eval($::private{'code_strip_ignored_words'});
    die $@ if $@;
    s'\s+' 'og;
    s'^ '';
    s' $'';
    return " $_ ";
}

# entity_decode($string, $b_return_only_ch, $p_ilen)
#
# Decodes one HTML character reference.
#
#   $string           - candidate text, expected to start with "&".
#   $b_return_only_ch - when true, only the decoded character is returned
#                       and the matched length is stored through $p_ilen.
#   $p_ilen           - scalar reference; written only when
#                       $b_return_only_ch is true.  Receives the number of
#                       characters of $string that formed the entity, or 0
#                       when nothing matched.
#
# Returns the decoded value, or $string unchanged when no entity matched.
#
# Recognised forms (numeric values must be below 256):
#   &#ddd;  &#ddd      decimal
#   &#xHH;  &#xHH      hexadecimal
#   &name;             a key of $::private{'p_entity_value_by_name'}
#   &name              no semicolon: the longest leading part of the name
#                      that is a known key is decoded; unless
#                      $b_return_only_ch is set, the rest of the name is
#                      appended to the result.
#
# NOTE(review): keys such as '#338' in the entity table cannot be reached
# through the patterns below ("#" is not matched by \w, and the decimal
# branch only accepts values below 256) -- confirm whether callers look
# those up themselves.
sub entity_decode {
    my ($string, $b_return_only_ch, $p_ilen) = @_;
    my $elen = 0;    # assume no entity match

    # decimal:
    if (($string =~ m!^\&\#(\d+)\;?$!) and ($1 < 256)) {
        $elen = length($string);
        $string = chr($1);
    }
    # hexadecimal; HTML accepts "x"/"X" and hex digits in either case,
    # hence the /i:
    elsif (($string =~ m!^\&\#x([0-9a-f]+)\;?$!i) and (hex($1) < 256)) {
        $elen = length($string);
        $string = chr(hex($1));
    }
    # named entity, with explicit closing semicolon:
    elsif (($string =~ m!^\&(\w{2,8})\;$!) and (exists $::private{'p_entity_value_by_name'}->{$1})) {
        $elen = length($string);
        $string = $::private{'p_entity_value_by_name'}->{$1};
    }
    # named entity without closing semicolon:
    # try to match the longest possible leading part of the name
    elsif ($string =~ m!^\&(\w{2,8})$!) {
        my $test = $1;
        my $len = length($test);
        while ($len > 1) {
            if (exists($::private{'p_entity_value_by_name'}->{ substr($test, 0, $len) })) {
                $elen = 1 + $len;    # the 1 accounts for the leading "&"
                $string = $::private{'p_entity_value_by_name'}->{ substr($test, 0, $len) };
                $string .= substr($test, $len) unless ($b_return_only_ch);
                last;
            }
            $len--;
        }
    }

    if ($b_return_only_ch) {
        $$p_ilen = $elen;
    }
    return $string;
}

sub create_conversion_code {
    my ($b_verbose) = @_;
    my $code = '';

    # Format of %charset is { char_number => [ @values, $name ] }
    # where @values represents what the character should be converted to under 4 circumstances
    # -1 means "strip, is non-word"
    # 0 means "leave as is"
    # any other string value is the value to be converted to
    my %base_charset = (
        9 => [ -1, -1, -1, -1, 'Horizontal tab'],
        10 => [ -1, -1, -1, -1, 'Line feed'],
        13 => [ -1, -1, -1, -1, 'Carriage Return'],
        32 => [ -1, -1, -1, -1, 'Space'],
        33 => [ -1, -1, -1, -1, 'Exclamation mark'],
        34 => [ -1, -1, -1, -1, 'Quotation mark'],
        35 => [ -1, -1, -1, -1, 'Number sign'],
        36 => [ -1, -1, -1, -1, 'Dollar sign'],
        37 => [ -1, -1, -1, 
-1, 'Percent sign'], 38 => [ -1, -1, -1, -1, 'Ampersand'], 39 => [ -1, -1, -1, -1, 'Apostrophe'], 40 => [ -1, -1, -1, -1, 'Left parenthesis'], 41 => [ -1, -1, -1, -1, 'Right parenthesis'], 42 => [ -1, -1, -1, -1, 'Asterisk'], 43 => [ -1, -1, -1, -1, 'Plus sign'], 44 => [ -1, -1, -1, -1, 'Comma'], 45 => [ -1, -1, -1, -1, 'Hyphen'], 46 => [ -1, -1, -1, -1, 'Period (fullstop)'], 47 => [ -1, -1, -1, -1, 'Solidus (slash)'], 48 => [ 0, 0, 0, 0, 'Digit 0'], 49 => [ 0, 0, 0, 0, 'Digit 1'], 50 => [ 0, 0, 0, 0, 'Digit 2'], 51 => [ 0, 0, 0, 0, 'Digit 3'], 52 => [ 0, 0, 0, 0, 'Digit 4'], 53 => [ 0, 0, 0, 0, 'Digit 5'], 54 => [ 0, 0, 0, 0, 'Digit 6'], 55 => [ 0, 0, 0, 0, 'Digit 7'], 56 => [ 0, 0, 0, 0, 'Digit 8'], 57 => [ 0, 0, 0, 0, 'Digit 9'], 58 => [ -1, -1, -1, -1, 'Colon'], 59 => [ -1, -1, -1, -1, 'Semicolon'], 60 => [ -1, -1, -1, -1, 'Less than'], 61 => [ -1, -1, -1, -1, 'Equals sign'], 62 => [ -1, -1, -1, -1, 'Greater than'], 63 => [ -1, -1, -1, -1, 'Question mark'], 64 => [ -1, -1, -1, -1, 'Commercial at'], 65 => [ 'a', 0, 'a', 0, 'Capital A'], 66 => [ 'b', 0, 'b', 0, 'Capital B'], 67 => [ 'c', 0, 'c', 0, 'Capital C'], 68 => [ 'd', 0, 'd', 0, 'Capital D'], 69 => [ 'e', 0, 'e', 0, 'Capital E'], 70 => [ 'f', 0, 'f', 0, 'Capital F'], 71 => [ 'g', 0, 'g', 0, 'Capital G'], 72 => [ 'h', 0, 'h', 0, 'Capital H'], 73 => [ 'i', 0, 'i', 0, 'Capital I'], 74 => [ 'j', 0, 'j', 0, 'Capital J'], 75 => [ 'k', 0, 'k', 0, 'Capital K'], 76 => [ 'l', 0, 'l', 0, 'Capital L'], 77 => [ 'm', 0, 'm', 0, 'Capital M'], 78 => [ 'n', 0, 'n', 0, 'Capital N'], 79 => [ 'o', 0, 'o', 0, 'Capital O'], 80 => [ 'p', 0, 'p', 0, 'Capital P'], 81 => [ 'q', 0, 'q', 0, 'Capital Q'], 82 => [ 'r', 0, 'r', 0, 'Capital R'], 83 => [ 's', 0, 's', 0, 'Capital S'], 84 => [ 't', 0, 't', 0, 'Capital T'], 85 => [ 'u', 0, 'u', 0, 'Capital U'], 86 => [ 'v', 0, 'v', 0, 'Capital V'], 87 => [ 'w', 0, 'w', 0, 'Capital W'], 88 => [ 'x', 0, 'x', 0, 'Capital X'], 89 => [ 'y', 0, 'y', 0, 'Capital Y'], 90 => [ 'z', 0, 'z', 0, 
'Capital Z'], 91 => [ -1, -1, -1, -1, 'Left square bracket'], 92 => [ -1, -1, -1, -1, 'Reverse solidus (backslash)'], 93 => [ -1, -1, -1, -1, 'Right square bracket'], 94 => [ -1, -1, -1, -1, 'Caret'], 95 => [ -1, -1, -1, -1, 'Horizontal bar (underscore)'], 96 => [ -1, -1, -1, -1, 'Acute accent'], 97 => [ 0, 0, 0, 0, 'Small a'], 98 => [ 0, 0, 0, 0, 'Small b'], 99 => [ 0, 0, 0, 0, 'Small c'], 100 => [ 0, 0, 0, 0, 'Small d'], 101 => [ 0, 0, 0, 0, 'Small e'], 102 => [ 0, 0, 0, 0, 'Small f'], 103 => [ 0, 0, 0, 0, 'Small g'], 104 => [ 0, 0, 0, 0, 'Small h'], 105 => [ 0, 0, 0, 0, 'Small i'], 106 => [ 0, 0, 0, 0, 'Small j'], 107 => [ 0, 0, 0, 0, 'Small k'], 108 => [ 0, 0, 0, 0, 'Small l'], 109 => [ 0, 0, 0, 0, 'Small m'], 110 => [ 0, 0, 0, 0, 'Small n'], 111 => [ 0, 0, 0, 0, 'Small o'], 112 => [ 0, 0, 0, 0, 'Small p'], 113 => [ 0, 0, 0, 0, 'Small q'], 114 => [ 0, 0, 0, 0, 'Small r'], 115 => [ 0, 0, 0, 0, 'Small s'], 116 => [ 0, 0, 0, 0, 'Small t'], 117 => [ 0, 0, 0, 0, 'Small u'], 118 => [ 0, 0, 0, 0, 'Small v'], 119 => [ 0, 0, 0, 0, 'Small w'], 120 => [ 0, 0, 0, 0, 'Small x'], 121 => [ 0, 0, 0, 0, 'Small y'], 122 => [ 0, 0, 0, 0, 'Small z'], 123 => [ -1, -1, -1, -1, 'Left curly brace'], 124 => [ -1, -1, -1, -1, 'Vertical bar'], 125 => [ -1, -1, -1, -1, 'Right curly brace'], 126 => [ -1, -1, -1, -1, 'Tilde'], ); my %extended_charset = ( 138 => [ 's', 'S', chr(154), 0, 'Scaron'], 140 => [ 'oe', 'OE', chr(156), 0, 'OE ligature'], 142 => [ 'z', 'Z', chr(158), 0, ''], 154 => [ 's', 's', 0, 0, 'scaron'], 156 => [ 'oe', 'oe', 0, 0, 'oe ligature'], 158 => [ 'z', 'z', 0, 0, ''], 159 => [ 'y', 'Y', chr(255), 0, ''], 160 => [ -1, -1, -1, -1, 'Nonbreaking space'], 161 => [ -1, -1, -1, -1, 'Inverted exclamation'], 162 => [ -1, -1, -1, -1, 'Cent sign'], 163 => [ -1, -1, -1, -1, 'Pound sterling'], 164 => [ -1, -1, -1, -1, 'General currency sign'], 165 => [ -1, -1, -1, -1, 'Yen sign'], 166 => [ -1, -1, -1, -1, 'Broken vertical bar'], 167 => [ -1, -1, -1, -1, 'Section sign'], 168 => [ -1, 
-1, -1, -1, 'Diæresis / Umlaut'], 169 => [ -1, -1, -1, -1, 'Copyright'], 170 => [ -1, -1, -1, -1, 'Feminine ordinal'], 171 => [ -1, -1, -1, -1, 'Left angle quote, guillemet left'], 172 => [ -1, -1, -1, -1, 'Not sign'], 173 => [ -1, -1, -1, -1, 'Soft hyphen'], 174 => [ -1, -1, -1, -1, 'Registered trademark'], 175 => [ -1, -1, -1, -1, 'Macron accent'], 176 => [ -1, -1, -1, -1, 'Degree sign'], 177 => [ -1, -1, -1, -1, 'Plus or minus'], 178 => [ -1, -1, -1, -1, 'Superscript 2'], 179 => [ -1, -1, -1, -1, 'Superscript 3'], 180 => [ -1, -1, -1, -1, 'Acute accent'], 181 => [ -1, -1, -1, -1, 'Micro sign'], 182 => [ -1, -1, -1, -1, 'Paragraph sign'], 183 => [ -1, -1, -1, -1, 'Middle dot'], 184 => [ -1, -1, -1, -1, 'Cedilla'], 185 => [ -1, -1, -1, -1, 'Superscript 1'], 186 => [ -1, -1, -1, -1, 'Masculine ordinal'], 187 => [ -1, -1, -1, -1, 'Right angle quote, guillemet right'], 188 => [ -1, -1, -1, -1, 'Fraction one-fourth'], 189 => [ -1, -1, -1, -1, 'Fraction one-half'], 190 => [ -1, -1, -1, -1, 'Fraction three-fourths'], 191 => [ -1, -1, -1, -1, 'Inverted question mark'], 192 => [ 'a', 'A', chr(224), 0, 'Capital A, grave accent'], 193 => [ 'a', 'A', chr(225), 0, 'Capital A, acute accent'], 194 => [ 'a', 'A', chr(226), 0, 'Capital A, circumflex'], 195 => [ 'a', 'A', chr(227), 0, 'Capital A, tilde'], 196 => [ 'ae', 'Ae', chr(228), 0, 'Capital A, diaeresis / umlaut'], 197 => [ 'a', 'A', chr(229), 0, 'Capital A, ring'], 198 => [ 'ae', 'AE', chr(230), 0, 'Capital AE ligature'], 199 => [ 'c', 'c', chr(231), 0, 'Capital C, cedilla'], 200 => [ 'e', 'E', chr(232), 0, 'Capital E, grave accent'], 201 => [ 'e', 'E', chr(233), 0, 'Capital E, acute accent'], 202 => [ 'e', 'E', chr(234), 0, 'Capital E, circumflex'], 203 => [ 'e', 'E', chr(235), 0, 'Capital E, diaeresis / umlaut'], 204 => [ 'i', 'I', chr(236), 0, 'Capital I, grave accent'], 205 => [ 'i', 'I', chr(237), 0, 'Capital I, acute accent'], 206 => [ 'i', 'I', chr(238), 0, 'Capital I, circumflex'], 207 => [ 'i', 'I', chr(239), 0, 
'Capital I, diaeresis / umlaut'], 208 => [ 'd', 'D', chr(240), 0, 'Capital Eth, Icelandic'], 209 => [ 'n', 'N', chr(241), 0, 'Capital N, tilde'], 210 => [ 'o', 'O', chr(242), 0, 'Capital O, grave accent'], 211 => [ 'o', 'O', chr(243), 0, 'Capital O, acute accent'], 212 => [ 'o', 'O', chr(244), 0, 'Capital O, circumflex'], 213 => [ 'o', 'O', chr(245), 0, 'Capital O, tilde'], 214 => [ 'oe', 'Oe', chr(246), 0, 'Capital O, diaeresis / umlaut'], 215 => [ -1, -1, -1, -1, 'Multiply sign'], 216 => [ 'o', 'O', chr(248), 0, 'Capital O, slash'], 217 => [ 'u', 'U', chr(249), 0, 'Capital U, grave accent'], 218 => [ 'u', 'U', chr(250), 0, 'Capital U, acute accent'], 219 => [ 'u', 'U', chr(251), 0, 'Capital U, circumflex'], 220 => [ 'ue', 'Ue', chr(252), 0, 'Capital U, diaeresis / umlaut'], 221 => [ 'y', 'Y', chr(253), 0, 'Capital Y, acute accent'], 222 => [ 'p', 'P', chr(254), 0, 'Capital Thorn, Icelandic'], 223 => [ 'ss', 'ss', 0, 0, 'Small sharp s, German sz'], 224 => [ 'a', 'a', 0, 0, 'Small a, grave accent'], 225 => [ 'a', 'a', 0, 0, 'Small a, acute accent'], 226 => [ 'a', 'a', 0, 0, 'Small a, circumflex'], 227 => [ 'a', 'a', 0, 0, 'Small a, tilde'], 228 => [ 'ae', 'ae', 0, 0, 'Small a, diaeresis / umlaut'], 229 => [ 'a', 'a', 0, 0, 'Small a, ring'], 230 => [ 'ae', 'ae', 0, 0, 'Small ae ligature'], 231 => [ 'c', 'c', 0, 0, 'Small c, cedilla'], 232 => [ 'e', 'e', 0, 0, 'Small e, grave accent'], 233 => [ 'e', 'e', 0, 0, 'Small e, acute accent'], 234 => [ 'e', 'e', 0, 0, 'Small e, circumflex'], 235 => [ 'e', 'e', 0, 0, 'Small e, diaeresis / umlaut'], 236 => [ 'i', 'i', 0, 0, 'Small i, grave accent'], 237 => [ 'i', 'i', 0, 0, 'Small i, acute accent'], 238 => [ 'i', 'i', 0, 0, 'Small i, circumflex'], 239 => [ 'i', 'i', 0, 0, 'Small i, diaeresis / umlaut'], 240 => [ 'o', 'o', 0, 0, 'Small eth, Icelandic'], 241 => [ 'n', 'n', 0, 0, 'Small n, tilde'], 242 => [ 'o', 'o', 0, 0, 'Small o, grave accent'], 243 => [ 'o', 'o', 0, 0, 'Small o, acute accent'], 244 => [ 'o', 'o', 0, 0, 'Small 
o, circumflex'], 245 => [ 'o', 'o', 0, 0, 'Small o, tilde'], 246 => [ 'oe', 'oe', 0, 0, 'Small o, diaeresis / umlaut'], 247 => [ -1, -1, -1, -1, 'Division sign'], 248 => [ 'o', 'o', 0, 0, 'Small o, slash'], 249 => [ 'u', 'u', 0, 0, 'Small u, grave accent'], 250 => [ 'u', 'u', 0, 0, 'Small u, acute accent'], 251 => [ 'u', 'u', 0, 0, 'Small u, circumflex'], 252 => [ 'ue', 'ue', 0, 0, 'Small u, diaeresis / umlaut'], 253 => [ 'y', 'y', 0, 0, 'Small y, acute accent'], 254 => [ 'p', 'p', 0, 0, 'Small thorn, Icelandic'], 255 => [ 'y', 'y', 0, 0, 'Small y, diaeresis / umlaut'], ); =item reserved The %reserved hash contains the Latin character index of characters that FDSE uses internally to delimit data, including newlines, whitespace, and the equals sign. These characters are *always* stripped from incoming data regardless of locale settings. =cut my %reserved = ( 34 => 1, 38 => 1, 60 => 1, 62 => 1, 9 => 1, 95 => 1, 10 => 1, 13 => 1, 32 => 1, 61 => 1, ); =item named_entities The %named_entities hash maps HTML entities to their Latin character index. Numeric formats like "#ddd" and "xHH" are programmatically added to the hash -- there is no need to manually add them. Named entities which do not map to alphanumeric "word" characters, like "amp", are omitted as an optimization, since those characters are never included in the index. 
=cut my %named_entities = ( '#338' => 140, '#339' => 156, '#352' => 138, '#353' => 154, 'AElig' => 198, 'Aacute' => 193, 'Acirc' => 194, 'Agrave' => 192, 'Aring' => 197, 'Atilde' => 195, 'Auml' => 196, 'Ccedil' => 199, 'ETH' => 208, 'Eacute' => 201, 'Ecirc' => 202, 'Egrave' => 200, 'Euml' => 203, 'Iacute' => 205, 'Icirc' => 206, 'Igrave' => 204, 'Iuml' => 207, 'Ntilde' => 209, 'OElig' => 140, 'Oacute' => 211, 'Ocirc' => 212, 'Ograve' => 210, 'Oslash' => 216, 'Otilde' => 213, 'Ouml' => 214, 'Scaron' => 138, 'THORN' => 222, 'Uacute' => 218, 'Ucirc' => 219, 'Ugrave' => 217, 'Uuml' => 220, 'Yacute' => 221, 'aacute' => 225, 'acirc' => 226, 'aelig' => 230, 'agrave' => 224, 'aring' => 229, 'atilde' => 227, 'auml' => 228, 'ccedil' => 231, 'eacute' => 233, 'ecirc' => 234, 'egrave' => 232, 'eth' => 240, 'euml' => 235, 'iacute' => 237, 'icirc' => 238, 'igrave' => 236, 'iquest' => 191, 'iuml' => 239, 'ntilde' => 241, 'oacute' => 243, 'ocirc' => 244, 'oelig' => 156, 'ograve' => 242, 'oslash' => 248, 'otilde' => 245, 'ouml' => 246, 'scaron' => 154, 'sup1' => 185, 'sup2' => 178, 'sup3' => 179, 'szlig' => 223, 'thorn' => 254, 'uacute' => 250, 'ucirc' => 251, 'ugrave' => 249, 'uuml' => 252, 'yacute' => 253, 'yuml' => 255, ); my @non_word_entities = qw! 
Alpha Beta Chi Dagger Delta Epsilon Eta Gamma Iota Kappa Lambda Mu Nu OElig Omega Omicron Phi Pi Prime Psi Rho Scaron Sigma Tau Theta Upsilon Xi Yuml Zeta acute alefsym alpha amp and ang apos asymp bdquo beta brvbar bull cap cedil cent chi circ clubs cong copy crarr cup curren dArr dagger darr deg delta diams divide empty emsp ensp epsilon equiv eta euro exist fnof forall frac12 frac14 frac34 frasl gamma ge gt hArr harr hearts hellip iexcl image infin int iota iquest isin kappa lArr lambda lang laquo larr lceil ldquo le lfloor lowast loz lrm lsaquo lsquo lt macr mdash micro middot minus mu nabla nbsp ndash ne ni not notin nsub nu oelig oline omega omicron oplus or ordf ordm otimes para part permil perp phi pi piv plusmn pound prime prod prop psi quot rArr radic rang raquo rarr rceil rdquo real reg rfloor rho rlm rsaquo rsquo sbquo scaron sdot sect shy sigma sigmaf sim spades sube sum sup sup1 sup2 sup3 supe tau there4 theta thetasym thinsp tilde times trade uArr uarr uml upsih upsilon weierp xi yen zeta zwj zwnj sub !; $::private{'p_entity_value_by_name'} = {}; foreach (@non_word_entities) { $::private{'p_entity_value_by_name'}->{ $_ } = ' '; } my %entity_name_by_num = (); my ($name, $number) = ('', 0); while (($name, $number) = each %named_entities) { $entity_name_by_num{ $number } .= "$name "; $::private{'p_entity_value_by_name'}->{ $name } = chr( $number ); } $::private{'p_single_char_map'} = []; my %ac_map_cs = (); my @nonword = (); my $focus = (2 + (-2 * $::Rules{'character conversion: accent insensitive'})) + (1 + (-1 * $::Rules{'character conversion: case insensitive'})); my $chx = 0; if (not $b_verbose) { for (my $chx = 255; $chx > 0; $chx--) { my $ch = chr($chx); my $value = -1; if (defined($base_charset{$chx})) { $value = $base_charset{$chx}[$focus]; } elsif (defined($extended_charset{$chx})) { $value = $extended_charset{$chx}[$focus]; } if ($value eq '-1') { $nonword[$chx] = 1; $::private{'p_single_char_map'}->[$chx] = ' '; } elsif ($value ne '0') { 
$ac_map_cs{$value} .= $ch; $::private{'p_single_char_map'}->[$chx] = $value; } else { $::private{'p_single_char_map'}->[$chx] = $ch; } } } else { print <<"EOM";$::str[62] | $::str[45] | $::str[61] | $::str[60] | $::str[59] $::str[57] |
$::str[59] $::str[56] |
$::str[58] $::str[57] |
$::str[58] $::str[56] |
||
---|---|---|---|---|---|---|---|---|---|
! . substr(1000 + $chx, 1, 3) . qq! | $data[4] | !;
if ($entity_name_by_num{$chx}) {
my @list = split(m!\s+!, $entity_name_by_num{$chx});
my $en;
foreach $en (@list) {
next unless ($en);
print '&' . "amp;$en; - &$en; "; } } else { print " "; } print qq! | ! . &he($ch) . " | ";
my $zz = 0;
for $zz (0..3) {
if ($zz == $focus) {
if ($data[$zz] eq '-1') {
print qq!--- | \n!; $nonword[$chx] = 1; } elsif ($data[$zz] eq '0') { print qq!$ch | \n!; } else { print qq!$data[$zz] | \n!; # format {dest} = {orig orig orig} $ac_map_cs{$data[$zz]} .= $ch; } } else { if ($data[$zz] eq '-1') { print qq!$ch | \n!; } else { print qq!$data[$zz] | \n!; } } } print "