use strict;
use warnings;

use HTML::Entities qw(decode_entities encode_entities encode_entities_numeric);

use Test::More tests => 13;

# Round-trip a Latin-1 sentence.  When called in void context the
# HTML::Entities functions rewrite their argument in place.  Expected
# values are spelled with \x escapes and literal entity text so that the
# assertions do not depend on the encoding this file is saved in.
my $text = "V&aring;re norske tegn b&oslash;r &aelig;res";

decode_entities($text);
is($text, "V\xE5re norske tegn b\xF8r \xE6res",
   'named entities decode to Latin-1 characters');

encode_entities($text);
is($text, "V&aring;re norske tegn b&oslash;r &aelig;res",
   'Latin-1 characters encode back to named entities');

decode_entities($text);
encode_entities_numeric($text);
is($text, "V&#xE5;re norske tegn b&#xF8;r &#xE6;res",
   'numeric encoding emits hexadecimal character references');

# Markup-significant ASCII characters.
$text = q{<&>"'};
is(encode_entities($text), "&lt;&amp;&gt;&quot;&#39;",
   'markup characters are encoded by default');
is(encode_entities_numeric($text), "&#x3C;&#x26;&#x3E;&#x22;&#x27;",
   'markup characters are encoded numerically');

# An explicit set of unsafe characters limits what gets encoded.
$text = "abcdef";
is(encode_entities($text, 'a-c'), "&#97;&#98;&#99;def",
   'only characters in the requested class are encoded');

# See how well it does against rfc1866...
# Each entity declaration in the data section contributes "&name;" to $ent
# and the character with the declared code point to $plain.
my $ent   = "";
my $plain = "";
while (my $line = <DATA>) {
    next unless $line =~ /^\s*<!ENTITY\s+(\w+)\s*CDATA\s*\"&\#(\d+)/;
    $ent   .= "&$1;";
    $plain .= chr($2);
}

$text = $ent;
decode_entities($text);
is($text, $plain, 'every rfc1866 entity decodes to its character');

# Try decoding when the ";" are left out
$text = $ent;
$text =~ s/;//g;
decode_entities($text);
is($text, $plain, 'rfc1866 entities decode without the trailing ";"');

$text = $plain;
encode_entities($text);
is($text, $ent, 'every rfc1866 character encodes to its entity');

# From: Bill Simpson-Young <bill.simpson-young@cmis.csiro.au>
# Subject: HTML entities problem with 5.11
# To: libwww-perl@ics.uci.edu
# Date: Fri, 05 Sep 1997 16:56:55 +1000
# Message-Id: <199709050657.QAA10089@snowy.nsw.cmis.CSIRO.AU>
#
# Hi. I've got a problem that has surfaced with the changes to
# HTML::Entities.pm for 5.11 (it doesn't happen with 5.08). It's happening
# in the process of encoding then decoding special entities. Eg, what goes
# in as "abc&def&ghi" comes out as "abc&def;&ghi;".

is(decode_entities("abc&def&ghi&abc;&def;"), "abc&def&ghi&abc;&def;",
   'unknown entity names are left untouched');

# Decoding of &apos;
is(decode_entities("&apos;"), "'", '&apos; decodes to an apostrophe');
is(encode_entities("'", "'"), "&#39;",
   'an apostrophe encodes to a numeric reference');

# Numeric references above the Latin-1 range must decode to wide characters.
is(decode_entities("Attention Home&#x3BF;&#x3C9;n&#x4E9;rs...1&#x455;t T&#x456;&#x43C;e E&#x3BD;&#x4E9;&#x433;"),
   "Attention Home\x{3BF}\x{3C9}n\x{4E9}rs...1\x{455}t T\x{456}\x{43C}e E\x{3BD}\x{4E9}\x{433}",
   'numeric references beyond Latin-1 decode to wide characters');

__END__
# Quoted from rfc1866.txt
The HTML DTD references the "Added Latin 1" entity set, which only
supplies named entities for a subset of the non-ASCII characters in
[ISO-8859-1], namely the accented characters. The following entities
should be supported so that all ISO 8859-1 characters may only be
referenced symbolically. The names for these entities are taken from
the appendixes of [SGML].
<!ENTITY nbsp CDATA "&#160;" -- no-break space -->
<!ENTITY iexcl CDATA "&#161;" -- inverted exclamation mark -->
<!ENTITY cent CDATA "&#162;" -- cent sign -->
<!ENTITY pound CDATA "&#163;" -- pound sterling sign -->
<!ENTITY curren CDATA "&#164;" -- general currency sign -->
<!ENTITY yen CDATA "&#165;" -- yen sign -->
<!ENTITY brvbar CDATA "&#166;" -- broken (vertical) bar -->
<!ENTITY sect CDATA "&#167;" -- section sign -->
<!ENTITY uml CDATA "&#168;" -- umlaut (dieresis) -->
<!ENTITY copy CDATA "&#169;" -- copyright sign -->
<!ENTITY ordf CDATA "&#170;" -- ordinal indicator, feminine -->
<!ENTITY laquo CDATA "&#171;" -- angle quotation mark, left -->
<!ENTITY not CDATA "&#172;" -- not sign -->
<!ENTITY shy CDATA "&#173;" -- soft hyphen -->
<!ENTITY reg CDATA "&#174;" -- registered sign -->
<!ENTITY macr CDATA "&#175;" -- macron -->
<!ENTITY deg CDATA "&#176;" -- degree sign -->
<!ENTITY plusmn CDATA "&#177;" -- plus-or-minus sign -->
<!ENTITY sup2 CDATA "&#178;" -- superscript two -->
<!ENTITY sup3 CDATA "&#179;" -- superscript three -->
<!ENTITY acute CDATA "&#180;" -- acute accent -->
<!ENTITY micro CDATA "&#181;" -- micro sign -->
<!ENTITY para CDATA "&#182;" -- pilcrow (paragraph sign) -->
<!ENTITY middot CDATA "&#183;" -- middle dot -->
<!ENTITY cedil CDATA "&#184;" -- cedilla -->
<!ENTITY sup1 CDATA "&#185;" -- superscript one -->
<!ENTITY ordm CDATA "&#186;" -- ordinal indicator, masculine -->
<!ENTITY raquo CDATA "&#187;" -- angle quotation mark, right -->
<!ENTITY frac14 CDATA "&#188;" -- fraction one-quarter -->
<!ENTITY frac12 CDATA "&#189;" -- fraction one-half -->
<!ENTITY frac34 CDATA "&#190;" -- fraction three-quarters -->
<!ENTITY iquest CDATA "&#191;" -- inverted question mark -->
<!ENTITY Agrave CDATA "&#192;" -- capital A, grave accent -->
<!ENTITY Aacute CDATA "&#193;" -- capital A, acute accent -->
<!ENTITY Acirc CDATA "&#194;" -- capital A, circumflex accent -->
Berners-Lee & Connolly Standards Track [Page 75]
RFC 1866 Hypertext Markup Language - 2.0 November 1995
<!ENTITY Atilde CDATA "&#195;" -- capital A, tilde -->
<!ENTITY Auml CDATA "&#196;" -- capital A, dieresis or umlaut mark -->
<!ENTITY Aring CDATA "&#197;" -- capital A, ring -->
<!ENTITY AElig CDATA "&#198;" -- capital AE diphthong (ligature) -->
<!ENTITY Ccedil CDATA "&#199;" -- capital C, cedilla -->
<!ENTITY Egrave CDATA "&#200;" -- capital E, grave accent -->
<!ENTITY Eacute CDATA "&#201;" -- capital E, acute accent -->
<!ENTITY Ecirc CDATA "&#202;" -- capital E, circumflex accent -->
<!ENTITY Euml CDATA "&#203;" -- capital E, dieresis or umlaut mark -->
<!ENTITY Igrave CDATA "&#204;" -- capital I, grave accent -->
<!ENTITY Iacute CDATA "&#205;" -- capital I, acute accent -->
<!ENTITY Icirc CDATA "&#206;" -- capital I, circumflex accent -->
<!ENTITY Iuml CDATA "&#207;" -- capital I, dieresis or umlaut mark -->
<!ENTITY ETH CDATA "&#208;" -- capital Eth, Icelandic -->
<!ENTITY Ntilde CDATA "&#209;" -- capital N, tilde -->
<!ENTITY Ograve CDATA "&#210;" -- capital O, grave accent -->
<!ENTITY Oacute CDATA "&#211;" -- capital O, acute accent -->
<!ENTITY Ocirc CDATA "&#212;" -- capital O, circumflex accent -->
<!ENTITY Otilde CDATA "&#213;" -- capital O, tilde -->
<!ENTITY Ouml CDATA "&#214;" -- capital O, dieresis or umlaut mark -->
<!ENTITY times CDATA "&#215;" -- multiply sign -->
<!ENTITY Oslash CDATA "&#216;" -- capital O, slash -->
<!ENTITY Ugrave CDATA "&#217;" -- capital U, grave accent -->
<!ENTITY Uacute CDATA "&#218;" -- capital U, acute accent -->
<!ENTITY Ucirc CDATA "&#219;" -- capital U, circumflex accent -->
<!ENTITY Uuml CDATA "&#220;" -- capital U, dieresis or umlaut mark -->
<!ENTITY Yacute CDATA "&#221;" -- capital Y, acute accent -->
<!ENTITY THORN CDATA "&#222;" -- capital THORN, Icelandic -->
<!ENTITY szlig CDATA "&#223;" -- small sharp s, German (sz ligature) -->
<!ENTITY agrave CDATA "&#224;" -- small a, grave accent -->
<!ENTITY aacute CDATA "&#225;" -- small a, acute accent -->
<!ENTITY acirc CDATA "&#226;" -- small a, circumflex accent -->
<!ENTITY atilde CDATA "&#227;" -- small a, tilde -->
<!ENTITY auml CDATA "&#228;" -- small a, dieresis or umlaut mark -->
<!ENTITY aring CDATA "&#229;" -- small a, ring -->
<!ENTITY aelig CDATA "&#230;" -- small ae diphthong (ligature) -->
<!ENTITY ccedil CDATA "&#231;" -- small c, cedilla -->
<!ENTITY egrave CDATA "&#232;" -- small e, grave accent -->
<!ENTITY eacute CDATA "&#233;" -- small e, acute accent -->
<!ENTITY ecirc CDATA "&#234;" -- small e, circumflex accent -->
<!ENTITY euml CDATA "&#235;" -- small e, dieresis or umlaut mark -->
<!ENTITY igrave CDATA "&#236;" -- small i, grave accent -->
<!ENTITY iacute CDATA "&#237;" -- small i, acute accent -->
<!ENTITY icirc CDATA "&#238;" -- small i, circumflex accent -->
<!ENTITY iuml CDATA "&#239;" -- small i, dieresis or umlaut mark -->
<!ENTITY eth CDATA "&#240;" -- small eth, Icelandic -->
<!ENTITY ntilde CDATA "&#241;" -- small n, tilde -->
<!ENTITY ograve CDATA "&#242;" -- small o, grave accent -->
Berners-Lee & Connolly Standards Track [Page 76]
RFC 1866 Hypertext Markup Language - 2.0 November 1995
<!ENTITY oacute CDATA "&#243;" -- small o, acute accent -->
<!ENTITY ocirc CDATA "&#244;" -- small o, circumflex accent -->
<!ENTITY otilde CDATA "&#245;" -- small o, tilde -->
<!ENTITY ouml CDATA "&#246;" -- small o, dieresis or umlaut mark -->
<!ENTITY divide CDATA "&#247;" -- divide sign -->
<!ENTITY oslash CDATA "&#248;" -- small o, slash -->
<!ENTITY ugrave CDATA "&#249;" -- small u, grave accent -->
<!ENTITY uacute CDATA "&#250;" -- small u, acute accent -->
<!ENTITY ucirc CDATA "&#251;" -- small u, circumflex accent -->
<!ENTITY uuml CDATA "&#252;" -- small u, dieresis or umlaut mark -->
<!ENTITY yacute CDATA "&#253;" -- small y, acute accent -->
<!ENTITY thorn CDATA "&#254;" -- small thorn, Icelandic -->
<!ENTITY yuml CDATA "&#255;" -- small y, dieresis or umlaut mark -->