# A parser for SGML, using the derived class as static DTD.
# XXX This only supports those SGML features used by HTML.
# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
# sgmlop support added by fredrik@pythonware.com (April 6, 1998)
import re
import string
# sgmlop is optional: if it cannot be imported, bind the name to None
# instead of letting the ImportError propagate.
try:
    import sgmlop
except ImportError:
    sgmlop = None
# Standard entity definitions: maps each entity name to the single
# character it stands for.
ENTITYDEFS = dict(
    lt='<',
    gt='>',
    amp='&',
    quot='"',
    apos='\'',
)
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.) The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks). Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
# --------------------------------------------------------------------
# original re-based SGML parser
# Pattern table for the re-based parser.  Every pattern below is anchored
# on the markup delimiter it recognises ('&' or '<'), so it is presumably
# meant to be matched at an offset found with `interesting`.

# The two characters that can begin markup: '&' or '<'.
interesting = re.compile('[&<]')

# A construct that has started but may not be finished yet:
#   '&', '&name' or '&#digits'       (entity / character reference)
#   '<', '<name...'                  (start tag)
#   '</', '</name...'                (end tag)
#   '<!...'                          (declaration)
# None of the alternatives includes a closing '>' or ';'.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        '<([a-zA-Z][^<>]*|'
                        '/([a-zA-Z][^<>]*)?|'
                        '![^<>]*)?')

# '&name' plus one terminating non-alphanumeric character; group 1 is the
# entity name.
entityref = re.compile('&([a-zA-Z][a-zA-Z0-9]*)[^a-zA-Z0-9]')

# '&#digits' plus one terminating non-digit character; group 1 is the
# decimal character code.  The leading '&#' is required: without it the
# pattern can never match at a '&', and would mistake bare digits for a
# character reference.
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by '>' or a letter.
starttagopen = re.compile('<[>a-zA-Z]')

# '<name/' -- the opening part of a short tag.
shorttagopen = re.compile('<[a-zA-Z][a-zA-Z0-9]*/')

# A complete short tag '<name/data/'; group 1 is the tag name, group 2 the
# data between the two slashes.
shorttag = re.compile('<([a-zA-Z][a-zA-Z0-9]*)/([^/]*)/')

# '</' followed by '<', '>' or a letter.  The leading '</' is required so
# that only end tags are recognised.
endtagopen = re.compile('</[<>a-zA-Z]')

# The next '<' or '>'.
endbracket = re.compile('[<>]')

# A complete '<!...>' declaration containing no nested '<' or '>'; this is
# the finished counterpart of the '![^<>]*' alternative in `incomplete`.
special = re.compile('<![^<>]*>')
commentopen = re.compile('