Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Z
Zope
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
Zope
Commits
7b4ae31a
Commit
7b4ae31a
authored
Jul 14, 2004
by
Fred Drake
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
remove local version of the HTMLParser module; this is now part of Python
parent
3c86de13
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
0 additions
and
715 deletions
+0
-715
lib/python/TAL/HTMLParser.py
lib/python/TAL/HTMLParser.py
+0
-402
lib/python/TAL/tests/test_htmlparser.py
lib/python/TAL/tests/test_htmlparser.py
+0
-313
No files found.
lib/python/TAL/HTMLParser.py
deleted
100644 → 0
View file @
3c86de13
"""A parser for HTML and XHTML."""
# This file is based on sgmllib.py, but the API is slightly different.
# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
import
markupbase
import
re
# Regular expressions used for parsing

# In normal (PCDATA) mode both '&' and '<' start something interesting.
interesting_normal = re.compile('[&<]')
# In CDATA mode only '</' (or a '<' at the very end of the buffer) matters.
interesting_cdata = re.compile(r'<(/|\Z)')
# Start of something that could still become an entity or char reference.
incomplete = re.compile('&[a-zA-Z#]')

# A named entity reference plus the single character that terminates it.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# A decimal or hexadecimal character reference plus its terminating character.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
endtagopen = re.compile('</')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# Group 1: attribute name; group 2: the optional "= value" part;
# group 3: the value itself (still including any surrounding quotes).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')

# Matches a start tag up to, but not including, the closing '>' or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# Raw string so that '\s' reaches the regex engine instead of being treated
# as an (invalid) string escape.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: the human-readable error message (must be non-empty).
        lineno: line number of the error, or None if unknown.
        offset: zero-based column of the error, or None if unknown.
    """

    def __init__(self, msg, position=(None, None)):
        """Store the message and the (lineno, offset) position pair."""
        assert msg
        # Initialise the base class so that ``args`` carries the message;
        # without this the exception cannot be re-created from ``args``
        # (e.g. when unpickling) and generic handlers see an empty error.
        Exception.__init__(self, msg)
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message, followed by line and column when known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            # The stored offset is zero-based; report a one-based column.
            result = result + ", column %d" % (self.offset + 1)
        return result
def _contains_at(s, sub, pos):
return s[pos:pos+len(sub)] == sub
class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements whose content is scanned in CDATA mode (only an end tag
    # is treated as markup inside them).
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.stack = []
        self.lasttag = '???'
        self.interesting = interesting_normal
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include newlines).
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the position from getpos()."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the most recently parsed start tag (None until one
    # has been parsed completely).
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    # Tag name recorded when CDATA mode was entered, or None.
    cdata_endtag = None

    def set_cdata_mode(self, endtag=None):
        """Enter CDATA mode, remembering *endtag* as the expected end tag."""
        self.cdata_endtag = endtag
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Leave CDATA mode and scan for '&' and '<' again."""
        self.cdata_endtag = None
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Process as much of the buffered raw data as possible.

        Whatever cannot be handled yet (an incomplete construct at the
        end of the buffer) is kept in self.rawdata for the next call.
        When *end* is true, an incomplete construct raises through
        self.error() and remaining text is flushed via handle_data().
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif endtagopen.match(rawdata, i):  # </
                    k = self.parse_endtag(i)
                elif _contains_at(rawdata, "<!--", i):  # <!--
                    k = self.parse_comment(i)
                elif _contains_at(rawdata, "<!", i):  # <!
                    k = self.parse_declaration(i)
                elif _contains_at(rawdata, "<?", i):  # <?
                    k = self.parse_pi(i)
                elif (i + 1) < n:
                    # A bare '<' that does not start any construct.
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # Construct is not complete yet; wait for more data
                    # unless this is the final call.
                    if end:
                        self.error("EOF in middle of construct")
                    break
                i = self.updatepos(i, k)
            elif rawdata[i:i+2] == "&#":
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # The pattern consumes one terminating character;
                    # give it back unless it was the ';'.
                    if rawdata[k-1] != ';':
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    break
            elif rawdata[i] == '&':
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # Same terminator handling as for char references.
                    if rawdata[k-1] != ';':
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    rest = rawdata[i:]
                    if end and match.group() == rest:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse comment, return end or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the comment starting at index *i* of the raw data.

        Returns the index just past the comment close, or -1 if the
        comment is not terminated yet.  handle_comment() is called with
        the comment text only when *report* is true.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
        match = commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start()
            self.handle_comment(rawdata[i+4:j])
        j = match.end()
        return j

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the processing instruction starting at index *i*.

        Returns the index just past the closing '>', or -1 if it has
        not been seen yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i + 2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2:j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at index *i* and dispatch to the handlers.

        Returns the index just past the tag, or a negative value if the
        tag is still incomplete.  Tag and attribute names are
        lower-cased; quoted attribute values are unquoted and unescaped,
        and attributes without a value get None.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i + 1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif (attrvalue[:1] == '\'' == attrvalue[-1:]
                  or attrvalue[:1] == '"' == attrvalue[-1:]):
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # error() reports the position returned by getpos(); the
            # position that used to be computed here was never used.
            self.error("junk characters in start tag: %s"
                       % repr(rawdata[k:endpos][:20]))
        if end[-2:] == '/>':
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the end index of the start tag at *i*, or -1 if incomplete.

        Malformed input is reported through self.error().
        """
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            next_char = rawdata[j:j+1]
            if next_char == ">":
                return j + 1
            if next_char == "/":
                s = rawdata[j:j+2]
                if s == "/>":
                    return j + 2
                if s == "/":
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if next_char == "":
                # end of input
                return -1
            if next_char in ("abcdefghijklmnopqrstuvwxyz=/"
                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag at index *i*; return its end index or -1.

        In CDATA mode an end tag that does not match the recorded
        cdata_endtag is passed on as plain data.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i + 1)  # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            self.error("bad end tag: %s" % repr(rawdata[i:j]))
        tag = match.group(1).lower()
        if (self.cdata_endtag is not None
                and tag != self.cdata_endtag):
            # Should be a mismatched end tag, but we'll treat it
            # as text anyway, since most HTML authors aren't
            # interested in the finer points of syntax.
            self.handle_data(match.group(0))
        else:
            self.handle_endtag(tag)
            self.clear_cdata_mode()
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle an empty-element tag as a start tag followed by an end tag."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called with the lower-cased tag name and a list of (name, value)."""
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called with the lower-cased tag name of an end tag."""
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called with the text between '&#' and the terminator."""
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called with the entity name (without '&' and ';')."""
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Called with a chunk of character data."""
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Called with the text between '<!--' and the comment close."""
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Overridable hook for declarations; does nothing by default."""
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called with the text between '<?' and the closing '>'."""
        pass

    def unknown_decl(self, data):
        """Report an unknown declaration as a parse error."""
        self.error("unknown declaration: " + repr(data))

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Replace the five predefined entity references in *s*.

        Handles &lt;, &gt;, &apos;, &quot; and &amp;.  Strings without
        '&' are returned unchanged.
        """
        if '&' not in s:
            return s
        s = s.replace("&lt;", "<")
        s = s.replace("&gt;", ">")
        s = s.replace("&apos;", "'")
        s = s.replace("&quot;", '"')
        s = s.replace("&amp;", "&")  # Must be last
        return s
lib/python/TAL/tests/test_htmlparser.py
deleted
100755 → 0
View file @
3c86de13
#! /usr/bin/env python1.5
"""Tests for HTMLParser.py."""
import
sys
from
TAL.tests
import
utils
import
unittest
from
TAL
import
HTMLParser
class EventCollector(HTMLParser.HTMLParser):
    """Parser subclass that records every handler call as an event tuple."""

    def __init__(self):
        self.events = []
        # Bound method of the event list; every handler records through it.
        self.append = self.events.append
        HTMLParser.HTMLParser.__init__(self)

    def get_events(self):
        """Return the recorded events with adjacent "data" events merged."""
        # Normalize the list of events so that buffer artefacts don't
        # separate runs of contiguous characters.
        merged = []
        prevkind = None
        for event in self.events:
            kind = event[0]
            if kind == prevkind == "data":
                merged[-1] = ("data", merged[-1][1] + event[1])
            else:
                merged.append(event)
            prevkind = kind
        # Update the list in place: self.append is bound to this very
        # list object, so rebinding self.events would silently drop any
        # event recorded after this call.
        self.events[:] = merged
        return self.events

    # structure markup

    def handle_starttag(self, tag, attrs):
        self.append(("starttag", tag, attrs))

    def handle_startendtag(self, tag, attrs):
        self.append(("startendtag", tag, attrs))

    def handle_endtag(self, tag):
        self.append(("endtag", tag))

    # all other markup

    def handle_comment(self, data):
        self.append(("comment", data))

    def handle_charref(self, data):
        self.append(("charref", data))

    def handle_data(self, data):
        self.append(("data", data))

    def handle_decl(self, data):
        self.append(("decl", data))

    def handle_entityref(self, data):
        self.append(("entityref", data))

    def handle_pi(self, data):
        self.append(("pi", data))

    def unknown_decl(self, decl):
        self.append(("unknown decl", decl))
class EventCollectorExtra(EventCollector):
    """Collector that also records the raw source text of each start tag."""

    def handle_starttag(self, tag, attrs):
        # Record the ordinary "starttag" event first, then the tag's source.
        EventCollector.handle_starttag(self, tag, attrs)
        source_text = self.get_starttag_text()
        self.append(("starttag_text", source_text))
class TestCaseBase(unittest.TestCase):
    """Shared helpers for feeding source to a parser and checking events."""

    # Constant pieces of source and events
    prologue = ""
    epilogue = ""
    initial_events = []
    final_events = []

    def _run_check(self, source, events, collector=EventCollector):
        """Parse *source* and compare the collected events with *events*.

        *source* is fed piece by piece (character by character when it
        is a string, item by item when it is a list), framed by the
        class-level prologue/epilogue and their expected events.
        """
        parser = collector()
        parser.feed(self.prologue)
        for s in source:
            parser.feed(s)
        for c in self.epilogue:
            parser.feed(c)
        parser.close()
        expected = self.initial_events + events + self.final_events
        # assertEqual reports both sides on failure and avoids the
        # assert_ alias, which newer unittest versions no longer provide.
        self.assertEqual(parser.get_events(), expected)

    def _run_check_extra(self, source, events):
        """Like _run_check(), but also collects start tag source text."""
        self._run_check(source, events, EventCollectorExtra)

    def _parse_error(self, source):
        """Assert that parsing *source* raises HTMLParseError."""
        def parse(source=source):
            parser = HTMLParser.HTMLParser()
            parser.feed(source)
            parser.close()
        self.assertRaises(HTMLParser.HTMLParseError, parse)
class HTMLParserTestCase(TestCaseBase):
    """Event-level checks for HTMLParser.

    Entity and character references in the inputs are written out in
    full (e.g. ``&#32;``) so that the source stays pure ASCII and the
    inputs match the expected event lists.
    """

    def check_processing_instruction_only(self):
        self._run_check("<?processing instruction>", [
            ("pi", "processing instruction"),
            ])

    def check_simple_html(self):
        self._run_check("""
<!DOCTYPE html PUBLIC 'foo'>
<HTML>&entity;&#32;
<!--comment1a
-></foo><bar><<?pi?></foo<bar
comment1b-->
<Img sRc='Bar' isMAP>sample
text
&#x201C;
<!--comment2a-- --comment2b-->
</Html>
""", [
            ("data", "\n"),
            ("decl", "DOCTYPE html PUBLIC 'foo'"),
            ("data", "\n"),
            ("starttag", "html", []),
            ("entityref", "entity"),
            ("charref", "32"),
            ("data", "\n"),
            ("comment", "comment1a\n-></foo><bar><<?pi?></foo<bar\ncomment1b"),
            ("data", "\n"),
            ("starttag", "img", [("src", "Bar"), ("ismap", None)]),
            ("data", "sample\ntext\n"),
            ("charref", "x201C"),
            ("data", "\n"),
            ("comment", "comment2a-- --comment2b"),
            ("data", "\n"),
            ("endtag", "html"),
            ("data", "\n"),
            ])

    def check_unclosed_entityref(self):
        self._run_check("&entityref foo", [
            ("entityref", "entityref"),
            ("data", " foo"),
            ])

    def check_doctype_decl(self):
        inside = """\
DOCTYPE html [
  <!ELEMENT html - O EMPTY>
  <!ATTLIST html
      version CDATA #IMPLIED
      profile CDATA 'DublinCore'>
  <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
  <!ENTITY myEntity 'internal parsed entity'>
  <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
  <!ENTITY % paramEntity 'name|name|name'>
  %paramEntity;
  <!-- comment -->
]"""
        self._run_check("<!%s>" % inside, [
            ("decl", inside),
            ])

    def check_bad_nesting(self):
        # Strangely, this *is* supposed to test that overlapping
        # elements are allowed.  HTMLParser is more geared toward
        # lexing the input than parsing the structure.
        self._run_check("<a><b></a></b>", [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
            ])

    def check_bare_ampersands(self):
        self._run_check("this text & contains & ampersands &", [
            ("data", "this text & contains & ampersands &"),
            ])

    def check_bare_pointy_brackets(self):
        self._run_check("this < text > contains < bare>pointy< brackets", [
            ("data", "this < text > contains < bare>pointy< brackets"),
            ])

    def check_attr_syntax(self):
        output = [
            ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
            ]
        self._run_check("""<a b='v' c="v" d=v e>""", output)
        self._run_check("""<a  b = 'v' c = "v" d = v e>""", output)
        self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
        self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)

    def check_attr_values(self):
        self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
                        [("starttag", "a", [("b", "xxx\n\txxx"),
                                            ("c", "yyy\t\nyyy"),
                                            ("d", "\txyz\n")])
                         ])
        self._run_check("""<a b='' c="">""", [
            ("starttag", "a", [("b", ""), ("c", "")]),
            ])

    def check_attr_entity_replacement(self):
        # The five predefined entities must be replaced in quoted values.
        self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [
            ("starttag", "a", [("b", "&><\"'")]),
            ])

    def check_attr_funky_names(self):
        self._run_check("""<a a.b='v' c:d=v e-f=v>""", [
            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
            ])

    def check_illegal_declarations(self):
        self._parse_error('<!spacer type="block" height="25">')

    def check_starttag_end_boundary(self):
        self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
        self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])])

    def check_buffer_artefacts(self):
        # The same tag split at every interesting buffer boundary.
        output = [("starttag", "a", [("b", "<")])]
        self._run_check(["<a b='<'>"], output)
        self._run_check(["<a ", "b='<'>"], output)
        self._run_check(["<a b", "='<'>"], output)
        self._run_check(["<a b=", "'<'>"], output)
        self._run_check(["<a b='<", "'>"], output)
        self._run_check(["<a b='<'", ">"], output)

        output = [("starttag", "a", [("b", ">")])]
        self._run_check(["<a b='>'>"], output)
        self._run_check(["<a ", "b='>'>"], output)
        self._run_check(["<a b", "='>'>"], output)
        self._run_check(["<a b=", "'>'>"], output)
        self._run_check(["<a b='>", "'>"], output)
        self._run_check(["<a b='>'", ">"], output)

    def check_starttag_junk_chars(self):
        self._parse_error("</>")
        self._parse_error("</$>")
        self._parse_error("</")
        self._parse_error("</a")
        self._parse_error("<a<a>")
        self._parse_error("</a<a>")
        self._parse_error("<!")
        self._parse_error("<a $>")
        self._parse_error("<a")
        self._parse_error("<a foo='bar'")
        self._parse_error("<a foo='bar")
        self._parse_error("<a foo='>'")
        self._parse_error("<a foo='>")
        self._parse_error("<a foo=>")

    def check_declaration_junk_chars(self):
        self._parse_error("<!DOCTYPE foo $ >")

    def check_startendtag(self):
        self._run_check("<p/>", [
            ("startendtag", "p", []),
            ])
        self._run_check("<p></p>", [
            ("starttag", "p", []),
            ("endtag", "p"),
            ])
        self._run_check("<p><img src='foo' /></p>", [
            ("starttag", "p", []),
            ("startendtag", "img", [("src", "foo")]),
            ("endtag", "p"),
            ])

    def check_get_starttag_text(self):
        s = """<foo:bar   \n   one="1"\ttwo=2   >"""
        self._run_check_extra(s, [
            ("starttag", "foo:bar", [("one", "1"), ("two", "2")]),
            ("starttag_text", s)])

    def check_cdata_content(self):
        # Inside <script>, comments and entity references are plain data.
        s = """<script> <!-- not a comment --> &not-an-entity-ref; </script>"""
        self._run_check(s, [
            ("starttag", "script", []),
            ("data", " <!-- not a comment --> &not-an-entity-ref; "),
            ("endtag", "script"),
            ])
        s = """<script> <not a='start tag'> </script>"""
        self._run_check(s, [
            ("starttag", "script", []),
            ("data", " <not a='start tag'> "),
            ("endtag", "script"),
            ])

    def check_enumerated_attr_type(self):
        s = "<!DOCTYPE doc [<!ATTLIST doc attr (a | b) >]>"
        self._run_check(s, [
            ('decl', 'DOCTYPE doc [<!ATTLIST doc attr (a | b) >]'),
            ])
# Support for the Zope regression test framework:
def test_suite():
    """Return a suite with every ``check_*`` method of HTMLParserTestCase."""
    # Use a TestLoader with a custom prefix instead of unittest.makeSuite(),
    # which newer unittest versions no longer provide.
    loader = unittest.TestLoader()
    loader.testMethodPrefix = "check_"
    suite = unittest.TestSuite()
    suite.addTest(loader.loadTestsFromTestCase(HTMLParserTestCase))
    return suite
if __name__ == "__main__":
    # Run the suite and turn "any errors" into a 0/1 process exit status.
    errs = utils.run_suite(test_suite())
    if errs:
        sys.exit(1)
    sys.exit(0)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment