Commit 8447707c authored by Fred Drake

locatestarttagend: Completely re-write the expression so as to be much

    more strict about matching only what's legal.  The expression ends
    up being a bit more complex, and needs additional checks to be
    done on what follows.

HTMLParser.check_for_whole_start_tag():  Helper method that uses
    locatestarttagend, performs the required additional checks, and
    determines whether we've actually found the end of the start tag,
    are at a buffer boundary, or have encountered a syntactical
    error.


HTMLParser.parse_starttag():  Use check_for_whole_start_tag() to see
    if we really have the start tag.

HTMLParseError.__init__():  Simplify assertion.


This should close ZPT(18).
parent e37dbe4f
...@@ -32,7 +32,20 @@ attrfind = re.compile( ...@@ -32,7 +32,20 @@ attrfind = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?') r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')
locatestarttagend = re.compile("('[^']*'|\"[^\"]*\"|[^'\">]+)*/?>") locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
|[^'\">\s]+ # bare value
)
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
endstarttag = re.compile(r"\s*/?>") endstarttag = re.compile(r"\s*/?>")
endendtag = re.compile('>') endendtag = re.compile('>')
endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
...@@ -45,7 +58,7 @@ class HTMLParseError(Exception): ...@@ -45,7 +58,7 @@ class HTMLParseError(Exception):
"""Exception raised for all parse errors.""" """Exception raised for all parse errors."""
def __init__(self, msg, position=(None, None)): def __init__(self, msg, position=(None, None)):
assert msg != "" assert msg
self.msg = msg self.msg = msg
self.lineno = position[0] self.lineno = position[0]
self.offset = position[1] self.offset = position[1]
...@@ -255,11 +268,10 @@ class HTMLParser: ...@@ -255,11 +268,10 @@ class HTMLParser:
# Internal -- handle starttag, return end or -1 if not terminated # Internal -- handle starttag, return end or -1 if not terminated
def parse_starttag(self, i): def parse_starttag(self, i):
self.__starttag_text = None self.__starttag_text = None
endpos = self.check_for_whole_start_tag(i)
if endpos < 0:
return endpos
rawdata = self.rawdata rawdata = self.rawdata
m = locatestarttagend.match(rawdata, i) # > outside quotes
if not m:
return -1
endpos = m.end()
self.__starttag_text = rawdata[i:endpos] self.__starttag_text = rawdata[i:endpos]
# Now parse the data between i+1 and j into a tag and attrs # Now parse the data between i+1 and j into a tag and attrs
...@@ -275,7 +287,7 @@ class HTMLParser: ...@@ -275,7 +287,7 @@ class HTMLParser:
break break
attrname, rest, attrvalue = m.group(1, 2, 3) attrname, rest, attrvalue = m.group(1, 2, 3)
if not rest: if not rest:
attrvalue = attrname attrvalue = None
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
attrvalue[:1] == '"' == attrvalue[-1:]: attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1] attrvalue = attrvalue[1:-1]
...@@ -302,6 +314,29 @@ class HTMLParser: ...@@ -302,6 +314,29 @@ class HTMLParser:
self.handle_starttag(tag, attrs) self.handle_starttag(tag, attrs)
return endpos return endpos
# Internal -- check to see if we have a complete starttag; return end
# or -1 if incomplete.
def check_for_whole_start_tag(self, i):
    """Decide whether the start tag beginning at offset *i* is complete.

    Matches ``locatestarttagend`` against ``self.rawdata`` at *i* and
    then inspects the text immediately after the match.

    Returns the index just past the closing ``>`` (or ``/>``) when the
    whole tag is present, or -1 when the buffered data ends before the
    tag does.

    Raises HTMLParseError("malformed start tag") when the text after
    the match cannot continue a start tag, and AssertionError when
    ``locatestarttagend`` does not match at *i* at all.
    """
    rawdata = self.rawdata
    m = locatestarttagend.match(rawdata, i)
    if m:
        j = m.end()
        # Named "following" rather than "next" so the builtin next()
        # is not shadowed inside this method.
        following = rawdata[j:j+1]
        if following == ">":
            return j + 1
        if following == "/":
            closer = rawdata[j:j+2]
            if closer == "/>":
                return j + 2
            if closer == "/":
                # buffer boundary between "/" and ">": the tag may
                # still turn out to be well formed once more data
                # arrives, so report "incomplete" instead of an error
                return -1
            # "/" followed by anything else falls through to the
            # malformed-tag error below
        if following == "":
            # end of input
            return -1
        if following in ("abcdefghijklmnopqrstuvwxyz="
                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
            # end of input in or before attribute value
            return -1
        self.updatepos(i, j)
        raise HTMLParseError("malformed start tag", self.getpos())
    raise AssertionError("we should not get here!")
# Internal -- parse endtag, return end or -1 if incomplete # Internal -- parse endtag, return end or -1 if incomplete
def parse_endtag(self, i): def parse_endtag(self, i):
rawdata = self.rawdata rawdata = self.rawdata
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment