Commit c9ef5d5c authored by Guido van Rossum's avatar Guido van Rossum

Keep track of current line number and offset. Call self.getpos() to

retrieve.
parent 926286f3
...@@ -63,6 +63,8 @@ class SGMLParser: ...@@ -63,6 +63,8 @@ class SGMLParser:
self.lasttag = '???' self.lasttag = '???'
self.nomoretags = 0 self.nomoretags = 0
self.literal = 0 self.literal = 0
self.lineno = 1
self.offset = 0
# For derived classes only -- enter literal mode (CDATA) till EOF # For derived classes only -- enter literal mode (CDATA) till EOF
def setnomoretags(self): def setnomoretags(self):
...@@ -84,6 +86,27 @@ class SGMLParser: ...@@ -84,6 +86,27 @@ class SGMLParser:
def close(self): def close(self):
self.goahead(1) self.goahead(1)
# Internal -- update line number and offset. This should be
# called for each piece of data exactly once, in order -- in other
# words the concatenation of all the input strings to this
# function should be exactly the entire input.
def updatepos(self, i, j):
if i >= j:
return j
rawdata = self.rawdata
nlines = string.count(rawdata, "\n", i, j)
if nlines:
self.lineno = self.lineno + nlines
pos = string.rindex(rawdata, "\n", i, j) # Should not fail
self.offset = j-(pos+1)
else:
self.offset = self.offset + j-i
return j
# Interface -- return current line number and offset.
def getpos(self):
return self.lineno, self.offset
# Internal -- handle data as far as reasonable. May leave state # Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is # and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker. # true, force handling all data as if followed by EOF marker.
...@@ -94,70 +117,75 @@ class SGMLParser: ...@@ -94,70 +117,75 @@ class SGMLParser:
while i < n: while i < n:
if self.nomoretags: if self.nomoretags:
self.handle_data(rawdata[i:n]) self.handle_data(rawdata[i:n])
i = n i = self.updatepos(i, n)
break break
match = interesting.search(rawdata, i) match = interesting.search(rawdata, i)
if match: j = match.start(0) if match: j = match.start(0)
else: j = n else: j = n
if i < j: self.handle_data(rawdata[i:j]) if i < j: self.handle_data(rawdata[i:j])
i = j i = self.updatepos(i, j)
if i == n: break if i == n: break
if rawdata[i] == '<': if rawdata[i] == '<':
if starttagopen.match(rawdata, i): if starttagopen.match(rawdata, i):
if self.literal: if self.literal:
self.handle_data(rawdata[i]) self.handle_data(rawdata[i])
i = i+1 i = self.updatepos(i, i+1)
continue continue
k = self.parse_starttag(i) k = self.parse_starttag(i)
if k < 0: break if k < 0: break
i = k i = self.updatepos(i, k)
continue continue
if endtagopen.match(rawdata, i): if endtagopen.match(rawdata, i):
k = self.parse_endtag(i) k = self.parse_endtag(i)
if k < 0: break if k < 0: break
i = k i = self.updatepos(i, k)
self.literal = 0 self.literal = 0
continue continue
if commentopen.match(rawdata, i): if commentopen.match(rawdata, i):
if self.literal: if self.literal:
self.handle_data(rawdata[i]) self.handle_data(rawdata[i])
i = i+1 i = self.updatepos(i, i+1)
continue continue
k = self.parse_comment(i) k = self.parse_comment(i)
if k < 0: break if k < 0: break
i = i+k i = self.updatepos(i, i+k)
continue continue
if piopen.match(rawdata, i): if piopen.match(rawdata, i):
if self.literal: if self.literal:
self.handle_data(rawdata[i]) self.handle_data(rawdata[i])
i = i+1 i = self.updatepos(i, i+1)
continue continue
k = self.parse_pi(i) k = self.parse_pi(i)
if k < 0: break if k < 0: break
i = i+k i = self.updatepos(i, i+k)
continue continue
match = special.match(rawdata, i) match = special.match(rawdata, i)
if match: if match:
if self.literal: if self.literal:
self.handle_data(rawdata[i]) self.handle_data(rawdata[i])
i = i+1 i = self.updatepos(i, i+1)
continue continue
i = match.end(0) k = match.end(0)
i = self.updatepos(i, k)
continue continue
elif rawdata[i] == '&': elif rawdata[i] == '&':
match = charref.match(rawdata, i) match = charref.match(rawdata, i)
if match: if match:
name = match.group(1) name = match.group(1)
self.handle_charref(name) self.handle_charref(name)
i = match.end(0) k = match.end(0)
if rawdata[i-1] != ';': i = i-1 if rawdata[i-1] != ';':
k = k-1
i = self.updatepos(i, k)
continue continue
match = entityref.match(rawdata, i) match = entityref.match(rawdata, i)
if match: if match:
name = match.group(1) name = match.group(1)
self.handle_entityref(name) self.handle_entityref(name)
i = match.end(0) k = match.end(0)
if rawdata[i-1] != ';': i = i-1 if rawdata[i-1] != ';':
k = k-1
i = self.updatepos(i, k)
continue continue
else: else:
raise RuntimeError, 'neither < nor & ??' raise RuntimeError, 'neither < nor & ??'
...@@ -166,17 +194,17 @@ class SGMLParser: ...@@ -166,17 +194,17 @@ class SGMLParser:
match = incomplete.match(rawdata, i) match = incomplete.match(rawdata, i)
if not match: if not match:
self.handle_data(rawdata[i]) self.handle_data(rawdata[i])
i = i+1 i = self.updatepos(i, i+1)
continue continue
j = match.end(0) j = match.end(0)
if j == n: if j == n:
break # Really incomplete break # Really incomplete
self.handle_data(rawdata[i:j]) self.handle_data(rawdata[i:j])
i = j i = self.updatepos(self, i, j)
# end while # end while
if end and i < n: if end and i < n:
self.handle_data(rawdata[i:n]) self.handle_data(rawdata[i:n])
i = n i = self.updatepos(i, n)
self.rawdata = rawdata[i:] self.rawdata = rawdata[i:]
# XXX if end: check for empty stack # XXX if end: check for empty stack
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment