Update to newest Zope 3.3 branch which contains necessary bugfixes.

Also update to newest Five 1.5 (trunk) which has been adjusted to work
with newest testbrowser.
parent f212d08c
...@@ -15,7 +15,7 @@ HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX) ...@@ -15,7 +15,7 @@ HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX)
HTML 4.01 Specification, W3C Recommendation 24 December 1999 HTML 4.01 Specification, W3C Recommendation 24 December 1999
Copyright 2002-2005 John J. Lee <jjl@pobox.com> Copyright 2002-2006 John J. Lee <jjl@pobox.com>
Copyright 2005 Gary Poster Copyright 2005 Gary Poster
Copyright 2005 Zope Corporation Copyright 2005 Zope Corporation
Copyright 1998-2000 Gisle Aas. Copyright 1998-2000 Gisle Aas.
...@@ -27,44 +27,40 @@ the distribution). ...@@ -27,44 +27,40 @@ the distribution).
""" """
# XXX # XXX
# Remove unescape_attr method
# Remove parser testing hack
# safeUrl()-ize action
# Really should to merge CC, CF, pp and mechanize as soon as mechanize
# goes to beta...
# Add url attribute to ParseError
# Switch to unicode throughout (would be 0.3.x)
# See Wichert Akkerman's 2004-01-22 message to c.l.py.
# Add charset parameter to Content-type headers? How to find value??
# Add some more functional tests # Add some more functional tests
# Especially single and multiple file upload on the internet. # Especially single and multiple file upload on the internet.
# Does file upload work when name is missing? Sourceforge tracker form # Does file upload work when name is missing? Sourceforge tracker form
# doesn't like it. Check standards, and test with Apache. Test # doesn't like it. Check standards, and test with Apache. Test
# binary upload with Apache. # binary upload with Apache.
# There have been reports that some servers are very picky about MIME
# boundaries, so file uploads may fail with those servers. Should
# copy what IE does religiously.
# Unicode: see Wichert Akkerman's 2004-01-22 message to c.l.py.
# Controls can have name=None (e.g. forms constructed partly with # Controls can have name=None (e.g. forms constructed partly with
# JavaScript), but find_control can't be told to find a control # JavaScript), but find_control can't be told to find a control
# with that name, because None there means 'unspecified'. Can still # with that name, because None there means 'unspecified'. Can still
# get at by nr, but would be nice to be able to specify something # get at by nr, but would be nice to be able to specify something
# equivalent to name=None, too. # equivalent to name=None, too.
# Deal with character sets properly. Not sure what the issues are here.
# Do URL encodings need any attention?
# I don't *think* any encoding of control names, filenames or data is
# necessary -- HTML spec. doesn't require it, and Mozilla Firebird 0.6
# doesn't seem to do it.
# Add charset parameter to Content-type headers? How to find value??
# mailto submission & enctype text/plain # mailto submission & enctype text/plain
# I'm not going to fix this unless somebody tells me what real servers # I'm not going to fix this unless somebody tells me what real servers
# that want this encoding actually expect: If enctype is # that want this encoding actually expect: If enctype is
# application/x-www-form-urlencoded and there's a FILE control present. # application/x-www-form-urlencoded and there's a FILE control present.
# Strictly, it should be 'name=data' (see HTML 4.01 spec., section # Strictly, it should be 'name=data' (see HTML 4.01 spec., section
# 17.13.2), but I send "name=" ATM. What about multiple file upload?? # 17.13.2), but I send "name=" ATM. What about multiple file upload??
# Get rid of MimeWriter.
# Should really use sgmllib, not htmllib.
# Would be nice, but I'm not going to do it myself: # Would be nice, but I'm not going to do it myself:
# ------------------------------------------------- # -------------------------------------------------
# Maybe a 0.3.x? # Maybe a 0.4.x?
# Replace by_label etc. with moniker / selector concept. Allows, eg., # Replace by_label etc. with moniker / selector concept. Allows, eg.,
# a choice between selection by value / id / label / element # a choice between selection by value / id / label / element
# contents. Or choice between matching labels exactly or by # contents. Or choice between matching labels exactly or by
# substring. Etc. # substring. Etc.
# Remove deprecated methods. # Remove deprecated methods.
# action should probably be an absolute URI, like DOMForm.
# ...what else? # ...what else?
# Work on DOMForm. # Work on DOMForm.
# XForms? Don't know if there's a need here. # XForms? Don't know if there's a need here.
...@@ -81,8 +77,38 @@ except NameError: ...@@ -81,8 +77,38 @@ except NameError:
if expr: return True if expr: return True
else: return False else: return False
try:
    import logging
except ImportError:
    # No logging module available: debugging output is silently dropped.
    def debug(msg, *args, **kwds):
        """Accept and discard a debug message (logging is unavailable)."""
        pass
else:
    _logger = logging.getLogger("ClientForm")
    # While true, debug() returns immediately without inspecting the stack
    # or touching the logger.  Cleared by _show_debug_messages().
    OPTIMIZATION_HACK = True

    def debug(msg, *args, **kwds):
        """Log msg % args at DEBUG level, prefixed with a calling function name.

        msg is a %-style format string; args and kwds are forwarded to
        Logger.debug().  Does nothing (and returns None) while
        OPTIMIZATION_HACK is true.
        """
        if OPTIMIZATION_HACK:
            return

        # Look two frames above this one, without raising and catching a
        # throwaway exception just to get hold of a frame object.
        # NOTE(review): two levels up is the caller's caller, not the
        # function that called debug() -- confirm that this is intended.
        caller_name = sys._getframe().f_back.f_back.f_code.co_name
        # '%%s' survives the first interpolation as '%s', which the logger
        # later fills in with caller_name (the first extended argument).
        extended_msg = '%%s %s' % msg
        extended_args = (caller_name,) + args
        _logger.debug(extended_msg, *extended_args, **kwds)

    def _show_debug_messages():
        """Enable debug() and echo ClientForm debug messages to sys.stdout."""
        global OPTIMIZATION_HACK
        OPTIMIZATION_HACK = False
        _logger.setLevel(logging.DEBUG)
        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(logging.DEBUG)
        _logger.addHandler(handler)
import sys, urllib, urllib2, types, mimetools, copy, urlparse, \ import sys, urllib, urllib2, types, mimetools, copy, urlparse, \
htmlentitydefs, re htmlentitydefs, re, random
from urlparse import urljoin from urlparse import urljoin
from cStringIO import StringIO from cStringIO import StringIO
...@@ -95,10 +121,12 @@ else: ...@@ -95,10 +121,12 @@ else:
def deprecation(message): def deprecation(message):
warnings.warn(message, DeprecationWarning, stacklevel=2) warnings.warn(message, DeprecationWarning, stacklevel=2)
VERSION = "0.2.1a" VERSION = "0.2.2"
CHUNK = 1024 # size of chunks fed to parser, in bytes CHUNK = 1024 # size of chunks fed to parser, in bytes
DEFAULT_ENCODING = "latin-1"
_compress_re = re.compile(r"\s+") _compress_re = re.compile(r"\s+")
def compress_text(text): return _compress_re.sub(" ", text.strip()) def compress_text(text): return _compress_re.sub(" ", text.strip())
...@@ -171,14 +199,61 @@ string. ...@@ -171,14 +199,61 @@ string.
l.append(k + '=' + urllib.quote_plus(str(elt))) l.append(k + '=' + urllib.quote_plus(str(elt)))
return '&'.join(l) return '&'.join(l)
def unescape(data, entities): def unescape(data, entities, encoding=DEFAULT_ENCODING):
if data is None or '&' not in data: if data is None or "&" not in data:
return data return data
def replace_entities(match, entities=entities):
def replace_entities(match, entities=entities, encoding=encoding):
ent = match.group() ent = match.group()
repl = entities.get(ent, ent) if ent[1] == "#":
return unescape_charref(ent[2:-1], encoding)
repl = entities.get(ent)
if repl is not None:
if type(repl) != type(""):
try:
repl = repl.encode(encoding)
except UnicodeError:
repl = ent
else:
repl = ent
return repl
return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
def unescape_charref(data, encoding):
name, base = data, 10
if name.startswith("x"):
name, base= name[1:], 16
uc = unichr(int(name, base))
if encoding is None:
return uc
else:
try:
repl = uc.encode(encoding)
except UnicodeError:
repl = "&#%s;" % data
return repl return repl
return re.sub(r'&\S+?;', replace_entities, data)
def get_entitydefs():
    """Return a dict mapping entity references to unicode strings.

    Keys are complete references including the delimiters (e.g. "&amp;");
    values are the unicode strings those references stand for.
    """
    import htmlentitydefs
    from codecs import latin_1_decode

    table = {}
    codepoints = getattr(htmlentitydefs, "name2codepoint", None)
    if codepoints is not None:
        # Preferred source: entity name -> integer code point.
        for entity, number in codepoints.items():
            table["&%s;" % entity] = unichr(number)
        return table

    # Fallback when htmlentitydefs has no name2codepoint: each entitydefs
    # value is decoded as latin-1, and a value that is itself a numeric
    # character reference ("&#...;") is resolved to the character it names.
    for entity, raw in htmlentitydefs.entitydefs.items():
        text = latin_1_decode(raw)[0]
        if text.startswith("&#") and text.endswith(";"):
            text = unescape_charref(text[2:-1], None)
        table["&%s;" % entity] = text
    return table
def issequence(x): def issequence(x):
try: try:
...@@ -195,74 +270,15 @@ def isstringlike(x): ...@@ -195,74 +270,15 @@ def isstringlike(x):
else: return True else: return True
# XXX don't really want to drag this along (MimeWriter, choose_boundary)
# --------------------------------------------------------------------
# grabbed from Python standard library mimetools module and tweaked to
# avoid socket.gaierror and to avoid dots ('.') in MIME boundaries
try:
import thread
_thread = thread; del thread
except ImportError:
import dummy_thread
_thread = dummy_thread; del dummy_thread
_counter_lock = _thread.allocate_lock()
del _thread
_counter = 0
def _get_next_counter():
global _counter
_counter_lock.acquire()
_counter = _counter + 1
result = _counter
_counter_lock.release()
return result
_prefix = None
def choose_boundary(): def choose_boundary():
"""Return a string usable as a multipart boundary. """Return a string usable as a multipart boundary."""
# follow IE and firefox
The string chosen is unique within a single program run, and nonce = "".join([str(random.randint(0, sys.maxint-1)) for i in 0,1,2])
incorporates the user id (if available), process id (if available), return "-"*27 + nonce
and current time. So it's very unlikely the returned string appears
in message text, but there's no guarantee.
The boundary contains dots so you have to quote it in the header."""
global _prefix
import time
import os
import socket
if _prefix is None:
try:
socket.gaierror
except AttributeError:
exc = socket.error
else:
exc = socket.gaierror
try:
hostid = socket.gethostbyname(socket.gethostname())
except exc:
hostid = 'localhost'
try:
uid = repr(os.getuid())
except AttributeError:
uid = '1'
try:
pid = repr(os.getpid())
except AttributeError:
pid = '1'
_prefix = hostid + uid + pid
return "%s%d%d" % (_prefix, long(time.time()*100), _get_next_counter())
# end of code from mimetools module
# --------------------------------------------------------------------
# This cut-n-pasted MimeWriter from standard library is here so can add # This cut-n-pasted MimeWriter from standard library is here so can add
# to HTTP headers rather than message body when appropriate. It also uses # to HTTP headers rather than message body when appropriate. It also uses
# \r\n in place of \n. This is nasty. # \r\n in place of \n. This is a bit nasty.
class MimeWriter: class MimeWriter:
"""Generic MIME writer. """Generic MIME writer.
...@@ -420,10 +436,11 @@ class ParseError(Exception): pass ...@@ -420,10 +436,11 @@ class ParseError(Exception): pass
class _AbstractFormParser: class _AbstractFormParser:
"""forms attribute contains HTMLForm instances on completion.""" """forms attribute contains HTMLForm instances on completion."""
# thanks to Moshe Zadka for an example of sgmllib/htmllib usage # thanks to Moshe Zadka for an example of sgmllib/htmllib usage
def __init__(self, entitydefs=None): def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
if entitydefs is None: if entitydefs is None:
entitydefs = get_entitydefs() entitydefs = get_entitydefs()
self._entitydefs = entitydefs self._entitydefs = entitydefs
self._encoding = encoding
self.base = None self.base = None
self.forms = [] self.forms = []
...@@ -436,17 +453,20 @@ class _AbstractFormParser: ...@@ -436,17 +453,20 @@ class _AbstractFormParser:
self._textarea = None self._textarea = None
def do_base(self, attrs): def do_base(self, attrs):
debug("%s", attrs)
for key, value in attrs: for key, value in attrs:
if key == "href": if key == "href":
self.base = value self.base = value
def end_body(self): def end_body(self):
debug("")
if self._current_label is not None: if self._current_label is not None:
self.end_label() self.end_label()
if self._current_form is not None: if self._current_form is not None:
self.end_form() self.end_form()
def start_form(self, attrs): def start_form(self, attrs):
debug("%s", attrs)
if self._current_form is not None: if self._current_form is not None:
raise ParseError("nested FORMs") raise ParseError("nested FORMs")
name = None name = None
...@@ -468,6 +488,7 @@ class _AbstractFormParser: ...@@ -468,6 +488,7 @@ class _AbstractFormParser:
self._current_form = (name, action, method, enctype), d, controls self._current_form = (name, action, method, enctype), d, controls
def end_form(self): def end_form(self):
debug("")
if self._current_label is not None: if self._current_label is not None:
self.end_label() self.end_label()
if self._current_form is None: if self._current_form is None:
...@@ -476,6 +497,7 @@ class _AbstractFormParser: ...@@ -476,6 +497,7 @@ class _AbstractFormParser:
self._current_form = None self._current_form = None
def start_select(self, attrs): def start_select(self, attrs):
debug("%s", attrs)
if self._current_form is None: if self._current_form is None:
raise ParseError("start of SELECT before start of FORM") raise ParseError("start of SELECT before start of FORM")
if self._select is not None: if self._select is not None:
...@@ -492,6 +514,7 @@ class _AbstractFormParser: ...@@ -492,6 +514,7 @@ class _AbstractFormParser:
self._append_select_control({"__select": d}) self._append_select_control({"__select": d})
def end_select(self): def end_select(self):
debug("")
if self._current_form is None: if self._current_form is None:
raise ParseError("end of SELECT before start of FORM") raise ParseError("end of SELECT before start of FORM")
if self._select is None: if self._select is None:
...@@ -503,6 +526,7 @@ class _AbstractFormParser: ...@@ -503,6 +526,7 @@ class _AbstractFormParser:
self._select = None self._select = None
def start_optgroup(self, attrs): def start_optgroup(self, attrs):
debug("%s", attrs)
if self._select is None: if self._select is None:
raise ParseError("OPTGROUP outside of SELECT") raise ParseError("OPTGROUP outside of SELECT")
d = {} d = {}
...@@ -512,11 +536,13 @@ class _AbstractFormParser: ...@@ -512,11 +536,13 @@ class _AbstractFormParser:
self._optgroup = d self._optgroup = d
def end_optgroup(self): def end_optgroup(self):
debug("")
if self._optgroup is None: if self._optgroup is None:
raise ParseError("end of OPTGROUP before start") raise ParseError("end of OPTGROUP before start")
self._optgroup = None self._optgroup = None
def _start_option(self, attrs): def _start_option(self, attrs):
debug("%s", attrs)
if self._select is None: if self._select is None:
raise ParseError("OPTION outside of SELECT") raise ParseError("OPTION outside of SELECT")
if self._option is not None: if self._option is not None:
...@@ -533,6 +559,7 @@ class _AbstractFormParser: ...@@ -533,6 +559,7 @@ class _AbstractFormParser:
self._option["disabled"] = None self._option["disabled"] = None
def _end_option(self): def _end_option(self):
debug("")
if self._option is None: if self._option is None:
raise ParseError("end of OPTION before start") raise ParseError("end of OPTION before start")
...@@ -549,11 +576,13 @@ class _AbstractFormParser: ...@@ -549,11 +576,13 @@ class _AbstractFormParser:
self._option = None self._option = None
def _append_select_control(self, attrs): def _append_select_control(self, attrs):
debug("%s", attrs)
controls = self._current_form[2] controls = self._current_form[2]
name = self._select.get("name") name = self._select.get("name")
controls.append(("select", name, attrs)) controls.append(("select", name, attrs))
def start_textarea(self, attrs): def start_textarea(self, attrs):
debug("%s", attrs)
if self._current_form is None: if self._current_form is None:
raise ParseError("start of TEXTAREA before start of FORM") raise ParseError("start of TEXTAREA before start of FORM")
if self._textarea is not None: if self._textarea is not None:
...@@ -568,6 +597,7 @@ class _AbstractFormParser: ...@@ -568,6 +597,7 @@ class _AbstractFormParser:
self._textarea = d self._textarea = d
def end_textarea(self): def end_textarea(self):
debug("")
if self._current_form is None: if self._current_form is None:
raise ParseError("end of TEXTAREA before start of FORM") raise ParseError("end of TEXTAREA before start of FORM")
if self._textarea is None: if self._textarea is None:
...@@ -578,6 +608,7 @@ class _AbstractFormParser: ...@@ -578,6 +608,7 @@ class _AbstractFormParser:
self._textarea = None self._textarea = None
def start_label(self, attrs): def start_label(self, attrs):
debug("%s", attrs)
if self._current_label: if self._current_label:
self.end_label() self.end_label()
d = {} d = {}
...@@ -591,6 +622,7 @@ class _AbstractFormParser: ...@@ -591,6 +622,7 @@ class _AbstractFormParser:
self._current_label = d self._current_label = d
def end_label(self): def end_label(self):
debug("")
label = self._current_label label = self._current_label
if label is None: if label is None:
# something is ugly in the HTML, but we're ignoring it # something is ugly in the HTML, but we're ignoring it
...@@ -601,6 +633,7 @@ class _AbstractFormParser: ...@@ -601,6 +633,7 @@ class _AbstractFormParser:
del label["__taken"] del label["__taken"]
def _add_label(self, d): def _add_label(self, d):
#debug("%s", d)
if self._current_label is not None: if self._current_label is not None:
if self._current_label["__taken"]: if self._current_label["__taken"]:
self.end_label() # be fuzzy self.end_label() # be fuzzy
...@@ -609,6 +642,7 @@ class _AbstractFormParser: ...@@ -609,6 +642,7 @@ class _AbstractFormParser:
d["__label"] = self._current_label d["__label"] = self._current_label
def handle_data(self, data): def handle_data(self, data):
debug("%s", data)
if self._option is not None: if self._option is not None:
# self._option is a dictionary of the OPTION element's HTML # self._option is a dictionary of the OPTION element's HTML
# attributes, but it has two special keys, one of which is the # attributes, but it has two special keys, one of which is the
...@@ -632,6 +666,7 @@ class _AbstractFormParser: ...@@ -632,6 +666,7 @@ class _AbstractFormParser:
map[key] = map[key] + data map[key] = map[key] + data
def do_button(self, attrs): def do_button(self, attrs):
debug("%s", attrs)
if self._current_form is None: if self._current_form is None:
raise ParseError("start of BUTTON before start of FORM") raise ParseError("start of BUTTON before start of FORM")
d = {} d = {}
...@@ -651,6 +686,7 @@ class _AbstractFormParser: ...@@ -651,6 +686,7 @@ class _AbstractFormParser:
controls.append((type, name, d)) controls.append((type, name, d))
def do_input(self, attrs): def do_input(self, attrs):
debug("%s", attrs)
if self._current_form is None: if self._current_form is None:
raise ParseError("start of INPUT before start of FORM") raise ParseError("start of INPUT before start of FORM")
d = {} d = {}
...@@ -665,6 +701,7 @@ class _AbstractFormParser: ...@@ -665,6 +701,7 @@ class _AbstractFormParser:
controls.append((type, name, d)) controls.append((type, name, d))
def do_isindex(self, attrs): def do_isindex(self, attrs):
debug("%s", attrs)
if self._current_form is None: if self._current_form is None:
raise ParseError("start of ISINDEX before start of FORM") raise ParseError("start of ISINDEX before start of FORM")
d = {} d = {}
...@@ -677,18 +714,20 @@ class _AbstractFormParser: ...@@ -677,18 +714,20 @@ class _AbstractFormParser:
controls.append(("isindex", None, d)) controls.append(("isindex", None, d))
def handle_entityref(self, name): def handle_entityref(self, name):
table = self._entitydefs #debug("%s", name)
fullname = "&%s;" % name self.handle_data(unescape(
if table.has_key(fullname): '&%s;' % name, self._entitydefs, self._encoding))
self.handle_data(table[fullname])
else: def handle_charref(self, name):
self.unknown_entityref(name) #debug("%s", name)
return self.handle_data(unescape_charref(name, self._encoding))
def unescape_attr(self, name): def unescape_attr(self, name):
return unescape(name, self._entitydefs) #debug("%s", name)
return unescape(name, self._entitydefs, self._encoding)
def unescape_attrs(self, attrs): def unescape_attrs(self, attrs):
#debug("%s", attrs)
escaped_attrs = {} escaped_attrs = {}
for key, val in attrs.items(): for key, val in attrs.items():
try: try:
...@@ -710,15 +749,15 @@ try: ...@@ -710,15 +749,15 @@ try:
import HTMLParser import HTMLParser
except ImportError: except ImportError:
class XHTMLCompatibleFormParser: class XHTMLCompatibleFormParser:
def __init__(self, entitydefs=None): def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
raise ValueError("HTMLParser could not be imported") raise ValueError("HTMLParser could not be imported")
else: else:
class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser): class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser):
"""Good for XHTML, bad for tolerance of incorrect HTML.""" """Good for XHTML, bad for tolerance of incorrect HTML."""
# thanks to Michael Howitz for this! # thanks to Michael Howitz for this!
def __init__(self, entitydefs=None): def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
HTMLParser.HTMLParser.__init__(self) HTMLParser.HTMLParser.__init__(self)
_AbstractFormParser.__init__(self, entitydefs) _AbstractFormParser.__init__(self, entitydefs, encoding)
def start_option(self, attrs): def start_option(self, attrs):
_AbstractFormParser._start_option(self, attrs) _AbstractFormParser._start_option(self, attrs)
...@@ -747,18 +786,6 @@ else: ...@@ -747,18 +786,6 @@ else:
else: else:
method() method()
# taken from sgmllib, with changes
def handle_charref(self, name):
try:
n = int(name)
except ValueError:
self.unknown_charref(name)
return
if not 0 <= n <= 255:
self.unknown_charref(name)
return
self.handle_data(chr(n))
def unescape(self, name): def unescape(self, name):
# Use the entitydefs passed into constructor, not # Use the entitydefs passed into constructor, not
# HTMLParser.HTMLParser's entitydefs. # HTMLParser.HTMLParser's entitydefs.
...@@ -769,13 +796,10 @@ else: ...@@ -769,13 +796,10 @@ else:
def unescape_attrs_if_required(self, attrs): def unescape_attrs_if_required(self, attrs):
return attrs # ditto return attrs # ditto
import htmllib, formatter import sgmllib
class FormParser(_AbstractFormParser, htmllib.HTMLParser): # monkeypatch to fix http://www.python.org/sf/803422 :-(
"""Good for tolerance of incorrect HTML, bad for XHTML.""" sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
def __init__(self, entitydefs=None): class _AbstractSgmllibParser(_AbstractFormParser):
htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
_AbstractFormParser.__init__(self, entitydefs)
def do_option(self, attrs): def do_option(self, attrs):
_AbstractFormParser._start_option(self, attrs) _AbstractFormParser._start_option(self, attrs)
...@@ -784,19 +808,52 @@ class FormParser(_AbstractFormParser, htmllib.HTMLParser): ...@@ -784,19 +808,52 @@ class FormParser(_AbstractFormParser, htmllib.HTMLParser):
def unescape_attrs_if_required(self, attrs): def unescape_attrs_if_required(self, attrs):
return self.unescape_attrs(attrs) return self.unescape_attrs(attrs)
#FormParser = XHTMLCompatibleFormParser # testing hack class FormParser(_AbstractSgmllibParser, sgmllib.SGMLParser):
"""Good for tolerance of incorrect HTML, bad for XHTML."""
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
sgmllib.SGMLParser.__init__(self)
_AbstractFormParser.__init__(self, entitydefs, encoding)
def get_entitydefs(): try:
entitydefs = {} if sys.version_info[:2] < (2, 2):
for name, char in htmlentitydefs.entitydefs.items(): raise ImportError # BeautifulSoup uses generators
entitydefs["&%s;" % name] = char import BeautifulSoup
return entitydefs except ImportError:
pass
else:
class _AbstractBSFormParser(_AbstractSgmllibParser):
bs_base_class = None
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
_AbstractFormParser.__init__(self, entitydefs, encoding)
self.bs_base_class.__init__(self)
def handle_data(self, data):
_AbstractFormParser.handle_data(self, data)
self.bs_base_class.handle_data(self, data)
class RobustFormParser(_AbstractBSFormParser, BeautifulSoup.BeautifulSoup):
"""Tries to be highly tolerant of incorrect HTML."""
bs_base_class = BeautifulSoup.BeautifulSoup
class NestingRobustFormParser(_AbstractBSFormParser,
BeautifulSoup.ICantBelieveItsBeautifulSoup):
"""Tries to be highly tolerant of incorrect HTML.
Different from RobustFormParser in that it more often guesses nesting
above missing end tags (see BeautifulSoup docs).
"""
bs_base_class = BeautifulSoup.ICantBelieveItsBeautifulSoup
#FormParser = XHTMLCompatibleFormParser # testing hack
#FormParser = RobustFormParser # testing hack
def ParseResponse(response, select_default=False, def ParseResponse(response, select_default=False,
ignore_errors=False, # ignored! ignore_errors=False, # ignored!
form_parser_class=FormParser, form_parser_class=FormParser,
request_class=urllib2.Request, request_class=urllib2.Request,
entitydefs=None, backwards_compat=True): entitydefs=None,
backwards_compat=True,
encoding=DEFAULT_ENCODING,
):
"""Parse HTTP response and return a list of HTMLForm instances. """Parse HTTP response and return a list of HTMLForm instances.
The return value of urllib2.urlopen can be conveniently passed to this The return value of urllib2.urlopen can be conveniently passed to this
...@@ -811,11 +868,17 @@ def ParseResponse(response, select_default=False, ...@@ -811,11 +868,17 @@ def ParseResponse(response, select_default=False,
form_parser_class: class to instantiate and use to pass form_parser_class: class to instantiate and use to pass
request_class: class to return from .click() method (default is request_class: class to return from .click() method (default is
urllib2.Request) urllib2.Request)
entitydefs: mapping like {'&amp;': '&', ...} containing HTML entity entitydefs: mapping like {"&amp;": "&", ...} containing HTML entity
definitions (a sensible default is used) definitions (a sensible default is used)
encoding: character encoding used for encoding numeric character references
when matching link text. ClientForm does not attempt to find the encoding
in a META HTTP-EQUIV attribute in the document itself (mechanize, for
example, does do that and will pass the correct value to ClientForm using
this parameter).
backwards_compat: boolean that determines whether the returned HTMLForm backwards_compat: boolean that determines whether the returned HTMLForm
objects are backwards-compatible with old code. If backwards_compat is True: objects are backwards-compatible with old code. If backwards_compat is
true:
- ClientForm 0.1 code will continue to work as before. - ClientForm 0.1 code will continue to work as before.
...@@ -844,7 +907,7 @@ def ParseResponse(response, select_default=False, ...@@ -844,7 +907,7 @@ def ParseResponse(response, select_default=False,
There is a choice of parsers. ClientForm.XHTMLCompatibleFormParser (uses There is a choice of parsers. ClientForm.XHTMLCompatibleFormParser (uses
HTMLParser.HTMLParser) works best for XHTML, ClientForm.FormParser (uses HTMLParser.HTMLParser) works best for XHTML, ClientForm.FormParser (uses
htmllib.HTMLParser) (the default) works best for ordinary grubby HTML. sgmllib.SGMLParser) (the default) works better for ordinary grubby HTML.
Note that HTMLParser is only available in Python 2.2 and later. You can Note that HTMLParser is only available in Python 2.2 and later. You can
pass your own class in here as a hack to work around bad HTML, but at your pass your own class in here as a hack to work around bad HTML, but at your
own risk: there is no well-defined interface. own risk: there is no well-defined interface.
...@@ -854,13 +917,19 @@ def ParseResponse(response, select_default=False, ...@@ -854,13 +917,19 @@ def ParseResponse(response, select_default=False,
False, False,
form_parser_class, form_parser_class,
request_class, request_class,
entitydefs, backwards_compat) entitydefs,
backwards_compat,
encoding,
)
def ParseFile(file, base_uri, select_default=False, def ParseFile(file, base_uri, select_default=False,
ignore_errors=False, # ignored! ignore_errors=False, # ignored!
form_parser_class=FormParser, form_parser_class=FormParser,
request_class=urllib2.Request, request_class=urllib2.Request,
entitydefs=None, backwards_compat=True): entitydefs=None,
backwards_compat=True,
encoding=DEFAULT_ENCODING,
):
"""Parse HTML and return a list of HTMLForm instances. """Parse HTML and return a list of HTMLForm instances.
ClientForm.ParseError is raised on parse errors. ClientForm.ParseError is raised on parse errors.
...@@ -876,7 +945,7 @@ def ParseFile(file, base_uri, select_default=False, ...@@ -876,7 +945,7 @@ def ParseFile(file, base_uri, select_default=False,
""" """
if backwards_compat: if backwards_compat:
deprecation("operating in backwards-compatibility mode") deprecation("operating in backwards-compatibility mode")
fp = form_parser_class(entitydefs) fp = form_parser_class(entitydefs, encoding)
while 1: while 1:
data = file.read(CHUNK) data = file.read(CHUNK)
try: try:
...@@ -916,8 +985,9 @@ def ParseFile(file, base_uri, select_default=False, ...@@ -916,8 +985,9 @@ def ParseFile(file, base_uri, select_default=False,
type, name, attrs = controls[ii] type, name, attrs = controls[ii]
attrs = fp.unescape_attrs_if_required(attrs) attrs = fp.unescape_attrs_if_required(attrs)
name = fp.unescape_attr_if_required(name) name = fp.unescape_attr_if_required(name)
# index=ii*10 allows ImageControl to return multiple ordered pairs
form.new_control(type, name, attrs, select_default=select_default, form.new_control(type, name, attrs, select_default=select_default,
index=ii) index=ii*10)
forms.append(form) forms.append(form)
for form in forms: for form in forms:
form.fixup() form.fixup()
...@@ -930,7 +1000,7 @@ class Label: ...@@ -930,7 +1000,7 @@ class Label:
self._text = attrs.get("__text").strip() self._text = attrs.get("__text").strip()
self._ctext = compress_text(self._text) self._ctext = compress_text(self._text)
self.attrs = attrs self.attrs = attrs
self._backwards_compat = False # maintaned by HTMLForm self._backwards_compat = False # maintained by HTMLForm
def __getattr__(self, name): def __getattr__(self, name):
if name == "text": if name == "text":
...@@ -942,15 +1012,15 @@ class Label: ...@@ -942,15 +1012,15 @@ class Label:
def __setattr__(self, name, value): def __setattr__(self, name, value):
if name == "text": if name == "text":
# don't see any need for this # don't see any need for this, so make it read-only
raise AttributeError("text attribute is read-only") raise AttributeError("text attribute is read-only")
self.__dict__[name] = value self.__dict__[name] = value
def __str__(self): def __str__(self):
return '<Label(id=%r, text=%r)>' % (self.id, self.text) return "<Label(id=%r, text=%r)>" % (self.id, self.text)
def _getLabel(attrs): def _get_label(attrs):
text = attrs.get("__label") text = attrs.get("__label")
if text is not None: if text is not None:
return Label(text) return Label(text)
...@@ -1049,15 +1119,14 @@ class Control: ...@@ -1049,15 +1119,14 @@ class Control:
""" """
raise NotImplementedError() raise NotImplementedError()
def _write_mime_data(self, mw): def _write_mime_data(self, mw, name, value):
"""Write data for this control to a MimeWriter.""" """Write data for a subitem of this control to a MimeWriter."""
# called by HTMLForm # called by HTMLForm
for name, value in self.pairs(): mw2 = mw.nextpart()
mw2 = mw.nextpart() mw2.addheader("Content-disposition",
mw2.addheader("Content-disposition", 'form-data; name="%s"' % name, 1)
'form-data; name="%s"' % name, 1) f = mw2.startbody(prefix=0)
f = mw2.startbody(prefix=0) f.write(value)
f.write(value)
def __str__(self): def __str__(self):
raise NotImplementedError() raise NotImplementedError()
...@@ -1093,7 +1162,7 @@ class ScalarControl(Control): ...@@ -1093,7 +1162,7 @@ class ScalarControl(Control):
""" """
def __init__(self, type, name, attrs, index=None): def __init__(self, type, name, attrs, index=None):
self._index = index self._index = index
self._label = _getLabel(attrs) self._label = _get_label(attrs)
self.__dict__["type"] = type.lower() self.__dict__["type"] = type.lower()
self.__dict__["name"] = name self.__dict__["name"] = name
self._value = attrs.get("value") self._value = attrs.get("value")
...@@ -1161,7 +1230,6 @@ class TextControl(ScalarControl): ...@@ -1161,7 +1230,6 @@ class TextControl(ScalarControl):
INPUT/TEXT INPUT/TEXT
INPUT/PASSWORD INPUT/PASSWORD
INPUT/FILE
INPUT/HIDDEN INPUT/HIDDEN
TEXTAREA TEXTAREA
...@@ -1219,8 +1287,9 @@ class FileControl(ScalarControl): ...@@ -1219,8 +1287,9 @@ class FileControl(ScalarControl):
return [] return []
return [(self._index, self.name, "")] return [(self._index, self.name, "")]
def _write_mime_data(self, mw): def _write_mime_data(self, mw, _name, _value):
# called by HTMLForm # called by HTMLForm
# assert _name == self.name and _value == ''
if len(self._upload_data) == 1: if len(self._upload_data) == 1:
# single file # single file
file_object, content_type, filename = self._upload_data[0] file_object, content_type, filename = self._upload_data[0]
...@@ -1381,7 +1450,7 @@ class IgnoreControl(ScalarControl): ...@@ -1381,7 +1450,7 @@ class IgnoreControl(ScalarControl):
class Item: class Item:
def __init__(self, control, attrs, index=None): def __init__(self, control, attrs, index=None):
label = _getLabel(attrs) label = _get_label(attrs)
self.__dict__.update({ self.__dict__.update({
"name": attrs["value"], "name": attrs["value"],
"_labels": label and [label] or [], "_labels": label and [label] or [],
...@@ -1793,7 +1862,7 @@ class ListControl(Control): ...@@ -1793,7 +1862,7 @@ class ListControl(Control):
def merge_control(self, control): def merge_control(self, control):
assert bool(control.multiple) == bool(self.multiple) assert bool(control.multiple) == bool(self.multiple)
#assert isinstance(control, self.__class__) # usually, isinstance(control, self.__class__)
self.items.extend(control.items) self.items.extend(control.items)
def fixup(self): def fixup(self):
...@@ -2084,6 +2153,12 @@ class SelectControl(ListControl): ...@@ -2084,6 +2153,12 @@ class SelectControl(ListControl):
SELECT (and OPTION) SELECT (and OPTION)
OPTION 'values', in HTML parlance, are Item 'names' in ClientForm parlance.
OPTION 'values', in HTML parlance, are Item 'names' in ClientForm parlance.
SELECT control values and labels are subject to some messy defaulting SELECT control values and labels are subject to some messy defaulting
rules. For example, if the HTML representation of the control is: rules. For example, if the HTML representation of the control is:
...@@ -2094,9 +2169,9 @@ class SelectControl(ListControl): ...@@ -2094,9 +2169,9 @@ class SelectControl(ListControl):
</SELECT> </SELECT>
The items, in order, have labels "2002", "2001" and "2000", whereas their The items, in order, have labels "2002", "2001" and "2000", whereas their
values are "0", "1" and "2000" respectively. Note that the value of the names (the OPTION values) are "0", "1" and "2000" respectively. Note that
last OPTION in this example defaults to its contents, as specified by RFC the value of the last OPTION in this example defaults to its contents, as
1866, as do the labels of the second and third OPTIONs. specified by RFC 1866, as do the labels of the second and third OPTIONs.
The OPTION labels are sometimes more meaningful than the OPTION values, The OPTION labels are sometimes more meaningful than the OPTION values,
which can make for more maintainable code. which can make for more maintainable code.
...@@ -2106,14 +2181,13 @@ class SelectControl(ListControl): ...@@ -2106,14 +2181,13 @@ class SelectControl(ListControl):
The attrs attribute is a dictionary of the original HTML attributes of the The attrs attribute is a dictionary of the original HTML attributes of the
SELECT element. Other ListControls do not have this attribute, because in SELECT element. Other ListControls do not have this attribute, because in
other cases the control as a whole does not correspond to any single HTML other cases the control as a whole does not correspond to any single HTML
element. The get_item_attrs method may be used as usual to get at the element. control.get(...).attrs may be used as usual to get at the HTML
HTML attributes of the HTML elements corresponding to individual list items attributes of the HTML elements corresponding to individual list items (for
(for SELECT controls, these are OPTION elements). SELECT controls, these are OPTION elements).
Another special case is that the attributes dictionaries returned by Another special case is that the Item.attrs dictionaries have a special key
get_item_attrs have a special key "contents" which does not correspond to "contents" which does not correspond to any real HTML attribute, but rather
any real HTML attribute, but rather contains the contents of the OPTION contains the contents of the OPTION element:
element:
<OPTION>this bit</OPTION> <OPTION>this bit</OPTION>
...@@ -2136,7 +2210,7 @@ class SelectControl(ListControl): ...@@ -2136,7 +2210,7 @@ class SelectControl(ListControl):
# fish out the SELECT HTML attributes from the OPTION HTML attributes # fish out the SELECT HTML attributes from the OPTION HTML attributes
# dictionary # dictionary
self.attrs = attrs["__select"].copy() self.attrs = attrs["__select"].copy()
self.__dict__["_label"] = _getLabel(self.attrs) self.__dict__["_label"] = _get_label(self.attrs)
self.__dict__["id"] = self.attrs.get("id") self.__dict__["id"] = self.attrs.get("id")
self.__dict__["multiple"] = self.attrs.has_key("multiple") self.__dict__["multiple"] = self.attrs.has_key("multiple")
# the majority of the contents, label, and value dance already happened # the majority of the contents, label, and value dance already happened
...@@ -2169,14 +2243,19 @@ class SelectControl(ListControl): ...@@ -2169,14 +2243,19 @@ class SelectControl(ListControl):
def fixup(self): def fixup(self):
ListControl.fixup(self) ListControl.fixup(self)
# Firefox doesn't exclude disabled items from those considered here # Firefox doesn't exclude disabled items from those considered here
# (i.e. from 'found', for both brances of the if below). Note that IE # (i.e. from 'found', for both branches of the if below). Note that
# doesn't support the disabled attribute on OPTIONs at all. # IE6 doesn't support the disabled attribute on OPTIONs at all.
found = [o for o in self.items if o.selected] found = [o for o in self.items if o.selected]
if not found: if not found:
if not self.multiple or self._select_default: if not self.multiple or self._select_default:
for o in self.items: for o in self.items:
if not o.disabled: if not o.disabled:
o.selected = True was_disabled = self.disabled
self.disabled = False
try:
o.selected = True
finally:
o.disabled = was_disabled
break break
elif not self.multiple: elif not self.multiple:
# Ensure only one item selected. Choose the last one, # Ensure only one item selected. Choose the last one,
...@@ -2245,11 +2324,11 @@ class ImageControl(SubmitControl): ...@@ -2245,11 +2324,11 @@ class ImageControl(SubmitControl):
if name is None: return [] if name is None: return []
pairs = [ pairs = [
(self._index, "%s.x" % name, str(clicked[0])), (self._index, "%s.x" % name, str(clicked[0])),
(self._index, "%s.y" % name, str(clicked[1])), (self._index+1, "%s.y" % name, str(clicked[1])),
] ]
value = self._value value = self._value
if value: if value:
pairs.append((self._index, name, value)) pairs.append((self._index+2, name, value))
return pairs return pairs
get_labels = ScalarControl.get_labels get_labels = ScalarControl.get_labels
...@@ -2301,8 +2380,10 @@ class HTMLForm: ...@@ -2301,8 +2380,10 @@ class HTMLForm:
need to be more specific than just supplying the control's name, use the need to be more specific than just supplying the control's name, use the
set_value and get_value methods. set_value and get_value methods.
ListControl values are lists of item names. The list item's name is the ListControl values are lists of item names (specifically, the names of the
value of the corresponding HTML element's "value" attribute. items that are selected and not disabled, and hence are "successful" -- ie.
cause data to be returned to the server). The list item's name is the
value of the corresponding HTML element's "value" attribute.
Example: Example:
...@@ -2321,11 +2402,12 @@ class HTMLForm: ...@@ -2321,11 +2402,12 @@ class HTMLForm:
defines a SELECT control with name "more_cheeses" which has two items, defines a SELECT control with name "more_cheeses" which has two items,
named "1" and "2" (because the OPTION element's value HTML attribute named "1" and "2" (because the OPTION element's value HTML attribute
defaults to the element contents). defaults to the element contents -- see SelectControl.__doc__ for more on
these defaulting rules).
To select, deselect or otherwise manipulate individual list items, use the To select, deselect or otherwise manipulate individual list items, use the
HTMLForm.find_control() and ListControl.get() methods. To set the whole HTMLForm.find_control() and ListControl.get() methods. To set the whole
value, do as for any other control:use indexing or the set_/get_value value, do as for any other control: use indexing or the set_/get_value
methods. methods.
Example: Example:
...@@ -2611,7 +2693,9 @@ class HTMLForm: ...@@ -2611,7 +2693,9 @@ class HTMLForm:
#--------------------------------------------------- #---------------------------------------------------
def __str__(self): def __str__(self):
header = "%s %s %s" % (self.method, self.action, self.enctype) header = "%s%s %s %s" % (
(self.name and self.name+" " or ""),
self.method, self.action, self.enctype)
rep = [header] rep = [header]
for control in self.controls: for control in self.controls:
rep.append(" %s" % str(control)) rep.append(" %s" % str(control))
...@@ -3054,17 +3138,23 @@ class HTMLForm: ...@@ -3054,17 +3138,23 @@ class HTMLForm:
def _pairs(self): def _pairs(self):
"""Return sequence of (key, value) pairs suitable for urlencoding.""" """Return sequence of (key, value) pairs suitable for urlencoding."""
opairs = [] return [(k, v) for (i, k, v, c_i) in self._pairs_and_controls()]
for control in self.controls:
opairs.extend(control._totally_ordered_pairs())
def _pairs_and_controls(self):
"""Return sequence of (index, key, value, control_index)
of totally ordered pairs suitable for urlencoding.
control_index is the index of the control in self.controls
"""
pairs = []
for control_index in range(len(self.controls)):
control = self.controls[control_index]
for ii, key, val in control._totally_ordered_pairs():
pairs.append((ii, key, val, control_index))
# stable sort by ONLY first item in tuple # stable sort by ONLY first item in tuple
sorter = [] pairs.sort()
for jj in range(len(opairs)):
ii, key, val = opairs[jj]
sorter.append((ii, jj, key, val))
sorter.sort()
pairs = [(key, val) for (ii, jj, key, val) in sorter]
return pairs return pairs
...@@ -3094,8 +3184,8 @@ class HTMLForm: ...@@ -3094,8 +3184,8 @@ class HTMLForm:
mw = MimeWriter(data, http_hdrs) mw = MimeWriter(data, http_hdrs)
f = mw.startmultipartbody("form-data", add_to_http_hdrs=True, f = mw.startmultipartbody("form-data", add_to_http_hdrs=True,
prefix=0) prefix=0)
for control in self.controls: for ii, k, v, control_index in self._pairs_and_controls():
control._write_mime_data(mw) self.controls[control_index]._write_mime_data(mw, k, v)
mw.lastpart() mw.lastpart()
return uri, data.getvalue(), http_hdrs return uri, data.getvalue(), http_hdrs
else: else:
...@@ -3116,7 +3206,7 @@ class HTMLForm: ...@@ -3116,7 +3206,7 @@ class HTMLForm:
req = request_class(req_data[0], req_data[1]) req = request_class(req_data[0], req_data[1])
for key, val in req_data[2]: for key, val in req_data[2]:
add_hdr = req.add_header add_hdr = req.add_header
if key.lower() == 'content-type': if key.lower() == "content-type":
try: try:
add_hdr = req.add_unredirected_header add_hdr = req.add_unredirected_header
except AttributeError: except AttributeError:
......
"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.
Examples
This program extracts all links from a document. It will print one
line for each link, containing the URL and the textual description
between the <A>...</A> tags:
import pullparser, sys
f = file(sys.argv[1])
p = pullparser.PullParser(f)
for token in p.tags("a"):
if token.type == "endtag": continue
url = dict(token.attrs).get("href", "-")
text = p.get_compressed_text(endat=("endtag", "a"))
print "%s\t%s" % (url, text)
This program extracts the <TITLE> from the document:
import pullparser, sys
f = file(sys.argv[1])
p = pullparser.PullParser(f)
if p.get_tag("title"):
title = p.get_compressed_text()
print "Title: %s" % title
Copyright 2003-2004 John J. Lee <jjl@pobox.com>
Copyright 1998-2001 Gisle Aas (original libwww-perl code)
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License.
"""
from __future__ import generators
import re, htmlentitydefs
import HTMLParser
__version__ = (0, 0, 6, None, None) # 0.0.6b
class NoMoreTokensError(Exception):
    """Raised when a token is requested but the input has been used up."""
class Token:
    """Represents an HTML tag, declaration, processing instruction etc.

    Behaves as both a tuple-like object (ie. iterable) and has attributes
    .type, .data and .attrs.

    >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
    >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
    True
    >>> (t.type, t.data) == ("starttag", "a")
    True
    >>> t.attrs == [("href", "http://www.python.org/")]
    True
    >>> t == None
    False

    Public attributes

    type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
     "data", "comment", "decl", "pi", after the corresponding methods of
     HTMLParser.HTMLParser
    data: For a tag, the tag name; otherwise, the relevant data carried by the
     tag, as a string
    attrs: list of (name, value) pairs representing HTML attributes
     (or None if token does not represent an opening tag)

    """
    def __init__(self, type, data, attrs=None):
        self.type = type
        self.data = data
        self.attrs = attrs

    def __iter__(self):
        # Iterating yields (type, data, attrs), which is what makes a Token
        # unpackable and comparable with a plain 3-tuple.
        return iter((self.type, self.data, self.attrs))

    def __eq__(self, other):
        """Compare with any iterable of exactly three items.

        Anything that cannot be unpacked into three items (None, numbers,
        sequences of the wrong length, ...) simply compares unequal instead
        of raising.

        """
        try:
            type, data, attrs = other
        except (TypeError, ValueError):
            # TypeError: not iterable; ValueError: wrong number of items
            return False
        if (self.type == type and
            self.data == data and
            self.attrs == attrs):
            return True
        else:
            return False

    def __ne__(self, other): return not self.__eq__(other)

    def __repr__(self):
        args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
        return self.__class__.__name__+"(%s)" % args
def iter_until_exception(fn, exception, *args, **kwds):
    """Generate the results of calling fn(*args, **kwds) over and over.

    fn: callable invoked once per item produced
    exception: exception class (or tuple of classes) that signals the end;
     when fn raises it, the generator finishes quietly

    Any other exception raised by fn propagates to the consumer.

    """
    while 1:
        # Only the call itself is guarded.  The generator is ended with a
        # plain return: raising StopIteration inside a generator body is
        # turned into RuntimeError by newer interpreters (PEP 479).
        try:
            result = fn(*args, **kwds)
        except exception:
            return
        yield result
def caller():
    """Return the name of the function that called this function's caller.

    That is, if f() calls g() and g() calls caller(), the result is "f".

    Raises AttributeError if there is no such frame (the stack is too
    shallow).

    """
    import sys
    # Frame 0 is caller() itself; two steps back is the caller's caller.
    # This walks the same frames the old raise/except + sys.exc_traceback
    # trick did, without a dummy exception, a bare except, or the
    # deprecated, non-thread-safe sys.exc_traceback.
    return sys._getframe().f_back.f_back.f_code.co_name
def unescape(data, entities):
    """Replace entity references in data using the entities mapping.

    data: string to process, or None
    entities: mapping whose keys are complete references (eg. "&amp;") and
     whose values are the replacement text

    References with no entry in entities are left as they are.  data is
    returned untouched if it is None or contains no "&".

    """
    if data is None or '&' not in data:
        return data
    return re.sub(r'&\S+?;',
                  lambda found: entities.get(found.group(), found.group()),
                  data)
def get_entitydefs():
    """Return a new dict built from htmlentitydefs.entitydefs.

    Each entity name is wrapped as "&name;" to form the key; the values are
    taken over unchanged.

    """
    return dict([("&%s;" % name, char)
                 for (name, char) in htmlentitydefs.entitydefs.items()])
class _AbstractParser:
    """Token-queue half of the pull parsers.

    Provides the pull API (get_token, get_tag, get_text, ...) on top of a
    queue of Token objects filled by the handle_* callbacks.  It must be
    combined with a parser class that supplies .feed() and calls those
    callbacks.

    """
    # number of characters requested from the file object per read
    chunk = 1024
    compress_re = re.compile(r"\s+")
    # name -> character mapping used by .get_text() for entity references
    entitydefs = htmlentitydefs.entitydefs

    def __init__(self, fh, textify=None, encoding="ascii", entitydefs=None):
        """
        fh: file-like object (only a .read() method is required) from which to
         read HTML to be parsed
        textify: mapping used by .get_text() and .get_compressed_text() methods
         to represent opening tags as text ({"img": "alt", "applet": "alt"}
         if not given or None)
        encoding: encoding used to encode numeric character references by
         .get_text() and .get_compressed_text() ("ascii" by default)
        entitydefs: mapping like {'&amp;': '&', ...} containing HTML entity
         definitions (a sensible default is used)

        If the element name of an opening tag matches a key in the textify
        mapping then that tag is converted to text.  The corresponding value is
        used to specify which tag attribute to obtain the text from.  textify
        maps from element names to either:

          - an HTML attribute name, in which case the HTML attribute value is
            used as its text value along with the element name in square
            brackets (eg."alt text goes here[IMG]", or, if the alt attribute
            were missing, just "[IMG]")
          - a callable object (eg. a function) which takes a Token and returns
            the string to be used as its text value

        If textify has no key for an element name, nothing is substituted for
        the opening tag.

        Public attributes:

        encoding and textify: see above

        """
        self._fh = fh
        self._tokenstack = []  # FIFO
        if textify is None:
            # Build a fresh dict for every parser: a dict literal in the
            # signature would be one object shared by all instances, so
            # changing one parser's .textify would change every parser's.
            textify = {"img": "alt", "applet": "alt"}
        self.textify = textify
        self.encoding = encoding
        if entitydefs is None:
            entitydefs = get_entitydefs()
        self._entitydefs = entitydefs

    def __iter__(self): return self

    def tags(self, *names):
        """Return an iterator over the results of .get_tag(*names)."""
        return iter_until_exception(self.get_tag, NoMoreTokensError, *names)

    def tokens(self, *tokentypes):
        """Return an iterator over the results of .get_token(*tokentypes)."""
        return iter_until_exception(self.get_token, NoMoreTokensError,
                                    *tokentypes)

    def next(self):
        try:
            return self.get_token()
        except NoMoreTokensError:
            raise StopIteration()

    def get_token(self, *tokentypes):
        """Pop the next Token object from the stack of parsed tokens.

        If arguments are given, they are taken to be token types in which the
        caller is interested: tokens of other types will be skipped (and
        discarded).

        Raises NoMoreTokensError.

        """
        while 1:
            while self._tokenstack:
                token = self._tokenstack.pop(0)
                if tokentypes:
                    if token.type in tokentypes:
                        return token
                else:
                    return token
            # queue is empty: read and parse another chunk of input
            data = self._fh.read(self.chunk)
            if not data:
                raise NoMoreTokensError()
            self.feed(data)

    def unget_token(self, token):
        """Push a Token back onto the stack."""
        self._tokenstack.insert(0, token)

    def get_tag(self, *names):
        """Return the next Token that represents an opening or closing tag.

        If arguments are given, they are taken to be element names in which the
        caller is interested: tags representing other elements will be skipped.
        Element names must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            tok = self.get_token()
            if tok.type not in ["starttag", "endtag", "startendtag"]:
                continue
            if names:
                if tok.data in names:
                    return tok
            else:
                return tok

    def get_text(self, endat=None):
        """Get some text.

        endat: stop reading text at this tag (the tag is included in the
         returned text); endat is a tuple (type, name) where type is
         "starttag", "endtag" or "startendtag", and name is the element name of
         the tag (element names must be given in lower case)

        If endat is not given, .get_text() will stop at the next opening or
        closing tag, or when there are no more tokens (no exception is raised).
        Note that .get_text() includes the text representation (if any) of the
        opening tag, but pushes the opening tag back onto the stack.  As a
        result, if you want to call .get_text() again, you need to call
        .get_tag() first (unless you want an empty string returned when you
        next call .get_text()).

        Entity references are translated using the entitydefs attribute (a
        mapping from names to characters like that provided by the standard
        module htmlentitydefs).  Named entity references that are not in this
        mapping are left unchanged.

        The textify attribute is used to translate opening tags into text: see
        the constructor docstring.

        """
        text = []
        tok = None
        while 1:
            try:
                tok = self.get_token()
            except NoMoreTokensError:
                # unget last token (not the one we just failed to get)
                if tok: self.unget_token(tok)
                break
            if tok.type == "data":
                text.append(tok.data)
            elif tok.type == "entityref":
                name = tok.data
                if name in self.entitydefs:
                    t = self.entitydefs[name]
                else:
                    t = "&%s;" % name
                text.append(t)
            elif tok.type == "charref":
                name, base = tok.data, 10
                # hexadecimal references may be written with either case of
                # the marker: "&#x41;" or "&#X41;"
                if name[:1] in ("x", "X"):
                    name, base = name[1:], 16
                t = unichr(int(name, base)).encode(self.encoding)
                text.append(t)
            elif tok.type in ["starttag", "endtag", "startendtag"]:
                tag_name = tok.data
                if tok.type in ["starttag", "startendtag"]:
                    alt = self.textify.get(tag_name)
                    if alt is not None:
                        if callable(alt):
                            text.append(alt(tok))
                        elif tok.attrs is not None:
                            for k, v in tok.attrs:
                                # an attribute given without a value has
                                # v == None, which "".join() cannot handle
                                if k == alt and v is not None:
                                    text.append(v)
                            text.append("[%s]" % tag_name.upper())
                if endat is None or endat == (tok.type, tag_name):
                    self.unget_token(tok)
                    break
        return "".join(text)

    def get_compressed_text(self, *args, **kwds):
        """
        As .get_text(), but collapses each group of contiguous whitespace to a
        single space character, and removes all initial and trailing
        whitespace.

        """
        text = self.get_text(*args, **kwds)
        text = text.strip()
        return self.compress_re.sub(" ", text)

    # Parser callbacks: each one queues a Token describing what was seen.

    def handle_startendtag(self, tag, attrs):
        self._tokenstack.append(Token("startendtag", tag, attrs))
    def handle_starttag(self, tag, attrs):
        self._tokenstack.append(Token("starttag", tag, attrs))
    def handle_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
    def handle_charref(self, name):
        self._tokenstack.append(Token("charref", name))
    def handle_entityref(self, name):
        self._tokenstack.append(Token("entityref", name))
    def handle_data(self, data):
        self._tokenstack.append(Token("data", data))
    def handle_comment(self, data):
        self._tokenstack.append(Token("comment", data))
    def handle_decl(self, decl):
        self._tokenstack.append(Token("decl", decl))
    def unknown_decl(self, data):
        # XXX should this call self.error instead?
        self._tokenstack.append(Token("decl", data))
    def handle_pi(self, data):
        self._tokenstack.append(Token("pi", data))

    def unescape_attr(self, name):
        """Unescape one attribute value using the constructor's entitydefs."""
        return unescape(name, self._entitydefs)

    def unescape_attrs(self, attrs):
        """Return a new list of (name, value) pairs with values unescaped."""
        escaped_attrs = []
        for key, val in attrs:
            escaped_attrs.append((key, self.unescape_attr(val)))
        return escaped_attrs
class PullParser(_AbstractParser, HTMLParser.HTMLParser):
    """Pull parser that uses HTMLParser.HTMLParser to parse the input."""

    def __init__(self, *args, **kwds):
        # All arguments belong to _AbstractParser; the HTMLParser base is
        # initialised without any.
        HTMLParser.HTMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)

    def unescape(self, name):
        """Unescape via .unescape_attr().

        This uses the entitydefs passed into the constructor, not
        HTMLParser.HTMLParser's entitydefs.

        """
        return self.unescape_attr(name)
import sgmllib
class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
    """Pull parser that uses sgmllib.SGMLParser to parse the input."""

    def __init__(self, *args, **kwds):
        # All arguments belong to _AbstractParser; the SGMLParser base is
        # initialised without any.
        sgmllib.SGMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)

    def unknown_starttag(self, tag, attrs):
        """Queue a "starttag" Token, unescaping the attribute values first."""
        unescaped = self.unescape_attrs(attrs)
        self._tokenstack.append(Token("starttag", tag, unescaped))

    def unknown_endtag(self, tag):
        """Queue an "endtag" Token."""
        self._tokenstack.append(Token("endtag", tag))
annotation -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/annotation annotation -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/annotation
apidoc -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/apidoc apidoc -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/apidoc
applicationcontrol -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/applicationcontrol applicationcontrol -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/applicationcontrol
appsetup -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/appsetup appsetup -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/appsetup
authentication -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/authentication authentication -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/authentication
basicskin -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/basicskin basicskin -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/basicskin
broken -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/broken broken -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/broken
cache -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/cache cache -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/cache
component -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/component component -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/component
container -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/container container -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/container
content -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/content content -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/content
content_types -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/content_types content_types -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/content_types
copypastemove -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/copypastemove copypastemove -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/copypastemove
datetimeutils -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/datetimeutils datetimeutils -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/datetimeutils
debug -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/debug debug -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/debug
decorator -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/decorator decorator -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/decorator
dependable -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/dependable dependable -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/dependable
dtmlpage -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/dtmlpage dtmlpage -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/dtmlpage
dublincore -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/dublincore dublincore -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/dublincore
error -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/error error -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/error
event -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/event event -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/event
exception -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/exception exception -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/exception
file -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/file file -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/file
filerepresentation -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/filerepresentation filerepresentation -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/filerepresentation
folder -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/folder folder -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/folder
form -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/form form -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/form
ftests -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/ftests ftests -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/ftests
generations -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/generations generations -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/generations
http -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/http http -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/http
i18n -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/i18n i18n -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/i18n
interface -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/interface interface -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/interface
intid -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/intid intid -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/intid
introspector -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/introspector introspector -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/introspector
keyreference -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/keyreference keyreference -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/keyreference
layers -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/layers layers -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/layers
locales -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/locales locales -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/locales
location -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/location location -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/location
mail -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/mail mail -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/mail
onlinehelp -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/onlinehelp onlinehelp -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/onlinehelp
pagetemplate -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/pagetemplate pagetemplate -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/pagetemplate
preference -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/preference preference -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/preference
preview -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/preview preview -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/preview
principalannotation -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/principalannotation principalannotation -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/principalannotation
publication -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/publication publication -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/publication
publisher -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/publisher publisher -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/publisher
rdb -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/rdb rdb -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/rdb
renderer -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/renderer renderer -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/renderer
rotterdam -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/rotterdam rotterdam -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/rotterdam
schema -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/schema schema -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/schema
security -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/security security -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/security
servicenames -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/servicenames servicenames -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/servicenames
session -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/session session -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/session
site -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/site site -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/site
size -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/size size -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/size
skins -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/skins skins -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/skins
sqlscript -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/sqlscript sqlscript -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/sqlscript
testing -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/testing testing -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/testing
tests -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/tests tests -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/tests
timezones -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/timezones timezones -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/timezones
traversing -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/traversing traversing -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/traversing
tree -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/tree tree -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/tree
undo -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/undo undo -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/undo
wfmc -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/wfmc wfmc -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/wfmc
wsgi -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/wsgi wsgi -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/wsgi
zapi -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/zapi zapi -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/zapi
zopeappgenerations -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/zopeappgenerations zopeappgenerations -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/zopeappgenerations
zptpage -r 68988 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/zptpage zptpage -r 69022 svn://svn.zope.org/repos/main/Zope3/branches/3.3/src/zope/app/zptpage
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment