Commit 73728855 authored by Fred Drake

Add general support for CDATA element content, enabled by default for
<script> and <style> elements since those are the ones I found in HTML 4.01.
parent cd292de0
...@@ -13,7 +13,8 @@ import string ...@@ -13,7 +13,8 @@ import string
# Regular expressions used for parsing # Regular expressions used for parsing
interesting = re.compile('[&<]') interesting_normal = re.compile('[&<]')
interesting_cdata = re.compile('</')
incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?') incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
...@@ -85,9 +86,11 @@ class HTMLParseError(Exception): ...@@ -85,9 +86,11 @@ class HTMLParseError(Exception):
class HTMLParser: class HTMLParser:
CDATA_CONTENT_ELEMENTS = ("script", "style")
# Interface -- initialize and reset this instance # Interface -- initialize and reset this instance
def __init__(self, verbose=0): def __init__(self):
self.verbose = verbose
self.reset() self.reset()
# Interface -- reset this instance. Loses all unprocessed data # Interface -- reset this instance. Loses all unprocessed data
...@@ -97,6 +100,7 @@ class HTMLParser: ...@@ -97,6 +100,7 @@ class HTMLParser:
self.lasttag = '???' self.lasttag = '???'
self.lineno = 1 self.lineno = 1
self.offset = 0 self.offset = 0
self.interesting = interesting_normal
# Interface -- feed some data to the parser. Call this as # Interface -- feed some data to the parser. Call this as
# often as you want, with as little or as much text as you # often as you want, with as little or as much text as you
...@@ -137,6 +141,9 @@ class HTMLParser: ...@@ -137,6 +141,9 @@ class HTMLParser:
def get_starttag_text(self): def get_starttag_text(self):
return self.__starttag_text return self.__starttag_text
def set_cdata_mode(self):
self.interesting = interesting_cdata
# Internal -- handle data as far as reasonable. May leave state # Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is # and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker. # true, force handling all data as if followed by EOF marker.
...@@ -145,9 +152,12 @@ class HTMLParser: ...@@ -145,9 +152,12 @@ class HTMLParser:
i = 0 i = 0
n = len(rawdata) n = len(rawdata)
while i < n: while i < n:
match = interesting.search(rawdata, i) # < or & match = self.interesting.search(rawdata, i) # < or &
if match: j = match.start() if match:
else: j = n j = match.start()
self.interesting = interesting_normal
else:
j = n
if i < j: self.handle_data(rawdata[i:j]) if i < j: self.handle_data(rawdata[i:j])
i = self.updatepos(i, j) i = self.updatepos(i, j)
if i == n: break if i == n: break
...@@ -312,6 +322,8 @@ class HTMLParser: ...@@ -312,6 +322,8 @@ class HTMLParser:
self.handle_startendtag(tag, attrs) self.handle_startendtag(tag, attrs)
else: else:
self.handle_starttag(tag, attrs) self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode()
return endpos return endpos
# Internal -- check to see if we have a complete starttag; return end # Internal -- check to see if we have a complete starttag; return end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment