Add more tests to cover edge cases that are legal in either HTML (bare
pointy brackets & ampersands) or XHTML (hexadecimal character references), but not both. Also added a test for DOCTYPE declaration parsing.

Add more tests to cover edge cases that are legal in either HTML (bare
pointy brackets & ampersands) or XHTML (hexadecimal character references), but not both. Also added a test for DOCTYPE declaration parsing.
aaec4852 · Fred Drake · cddf8940 · aaec4852
Commit aaec4852 authored Sep 04, 2001 by Fred Drake
Show whitespace changes
Inline Side-by-side

Showing with 45 additions and 5 deletions

lib/python/TAL/tests/test_htmlparser.py lib/python/TAL/tests/test_htmlparser.py +45 -5

No files found.
--- a/lib/python/TAL/tests/test_htmlparser.py
+++ b/lib/python/TAL/tests/test_htmlparser.py
@@ -62,6 +62,9 @@ class EventCollector(HTMLParser.HTMLParser):
    def handle_pi(self, data):
        self.append(("pi", data))

+    def unknown_decl(self, decl):
+        self.append(("unknown decl", decl))
+

 class EventCollectorExtra(EventCollector):

@@ -117,6 +120,7 @@ class HTMLParserTestCase(TestCaseBase):
 comment1b-->
 <Img sRc='Bar' isMAP>sample
 text
+&#x201C;
 <!--comment2a-- --comment2b-->
 </Html>
 """, [
@@ -131,13 +135,36 @@ text
    ("data", "\n"),
    ("starttag", "img", [("src", "Bar"), ("ismap", None)]),
    ("data", "sample\ntext\n"),
+    ("charref", "x201C"),
+    ("data", "\n"),
    ("comment", "comment2a-- --comment2b"),
    ("data", "\n"),
    ("endtag", "html"),
    ("data", "\n"),
    ])

+    def check_doctype_decl(self):
+        inside = """\
+DOCTYPE html [
+  <!ELEMENT html - O EMPTY>
+  <!ATTLIST html
+      version CDATA #IMPLIED
+      profile CDATA 'DublinCore'>
+  <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
+  <!ENTITY myEntity 'internal parsed entity'>
+  <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
+  <!ENTITY % paramEntity 'name|name|name'>
+  %paramEntity;
+  <!-- comment -->
+]"""
+        self._run_check("<!%s>" % inside, [
+            ("decl", inside),
+            ])
+
    def check_bad_nesting(self):
+        # Strangely, this *is* supposed to test that overlapping
+        # elements are allowed.  HTMLParser is more geared toward
+        # lexing the input than parsing the structure.
        self._run_check("<a><b></a></b>", [
            ("starttag", "a", []),
            ("starttag", "b", []),
@@ -145,6 +172,16 @@ text
            ("endtag", "b"),
            ])

+    def check_bare_ampersands(self):
+        self._run_check("this text & contains & ampersands &", [
+            ("data", "this text & contains & ampersands &"),
+            ])
+
+    def check_bare_pointy_brackets(self):
+        self._run_check("this < text > contains < bare>pointy< brackets", [
+            ("data", "this < text > contains < bare>pointy< brackets"),
+            ])
+
    def check_attr_syntax(self):
        output = [
          ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
@@ -174,6 +211,14 @@ text
            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
            ])

+    def check_illegal_declarations(self):
+        s = 'abc<!spacer type="block" height="25">def'
+        self._run_check(s, [
+            ("data", "abc"),
+            ("unknown decl", 'spacer type="block" height="25"'),
+            ("data", "def"),
+            ])
+
    def check_starttag_end_boundary(self):
        self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
        self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])])
@@ -196,17 +241,12 @@ text
        self._run_check(["<a b='>'", ">"], output)

    def check_starttag_junk_chars(self):
-        self._parse_error("<")
-        self._parse_error("<>")
        self._parse_error("</>")
        self._parse_error("</$>")
        self._parse_error("</")
        self._parse_error("</a")
-        self._parse_error("</a")
        self._parse_error("<a<a>")
        self._parse_error("</a<a>")
-        self._parse_error("<$")
-        self._parse_error("<$>")
        self._parse_error("<!")
        self._parse_error("<a $>")
        self._parse_error("<a")