"""
SAX2 driver for parsing HTML with the sgmlop parser.
$Id: drv_sgmlop_html.py,v 1.3 2002/05/10 14:50:06 akuchling Exp $
"""
version = "0.1"
from drv_sgmlop import *
from xml.dom.html import HTML_CHARACTER_ENTITIES, HTML_FORBIDDEN_END, HTML_OPT_END, HTML_DTD
from string import strip, upper
class SaxHtmlParser(SaxParser):
    """SAX driver that repairs HTML tag structure while parsing.

    Uses the HTML_OPT_END, HTML_FORBIDDEN_END and HTML_DTD tables to
    generate end-tag events that the document left out.  The stack of
    currently open elements is kept in self.stack (not created in this
    class; presumably set up by SaxParser -- confirm in drv_sgmlop).
    """

    def __init__(self, bufsize = 65536, encoding = 'iso-8859-1', verbose = 0):
        """Initialise the underlying SaxParser.

        bufsize and encoding are passed unchanged to SaxParser.__init__.
        verbose, when true, makes the parser print a warning on
        questionable tag nesting.
        """
        SaxParser.__init__(self, bufsize, encoding)
        self.verbose = verbose

    def finish_starttag(self, tag, attrs):
        """Report a start tag, generating events for omitted end tags.

        Returns 0 if the element was closed immediately (its end tag is
        forbidden), 1 if it was pushed on the stack of open elements.
        """
        # Guess omitted close tags: pop every open element whose end tag
        # is optional and whose DTD entry does not list the new tag.
        # NOTE(review): HTML_DTD is looked up with the tag as written,
        # whereas HTML_OPT_END is checked against the upper-cased name --
        # confirm which case the HTML_DTD keys and values use.
        while self.stack and \
              self.stack[-1].upper() in HTML_OPT_END and \
              tag not in HTML_DTD.get(self.stack[-1], []):
            self.unknown_endtag(self.stack[-1])
            del self.stack[-1]

        if self.verbose and self.stack and \
           tag not in HTML_DTD.get(self.stack[-1], []):
            print('Warning : trying to add %s as a child of %s'
                  % (tag, self.stack[-1]))

        self.unknown_starttag(tag, attrs)

        if tag.upper() in HTML_FORBIDDEN_END:
            # Close immediately tags for which we won't get an end.
            self.unknown_endtag(tag)
            return 0

        self.stack.append(tag)
        return 1

    def finish_endtag(self, tag):
        """Report an end tag, first closing any elements nested inside it.

        End tags of elements with a forbidden end tag are ignored, and so
        are end tags that match no open element (a warning is printed for
        the latter in verbose mode).
        """
        # Compare case-insensitively, exactly as finish_starttag does;
        # otherwise a lower-case tag such as </br> is not recognised as
        # already closed and triggers the warning below.
        if tag.upper() in HTML_FORBIDDEN_END:
            # Do nothing: we've already closed it.
            return

        if tag not in self.stack:
            if self.verbose:
                print("Warning: I don't see where tag %s was opened" % tag)
            return

        # Close everything opened after the matching start tag ...
        while self.stack and self.stack[-1] != tag:
            self.unknown_endtag(self.stack[-1])
            del self.stack[-1]
        # ... and then the element itself.
        self.unknown_endtag(tag)
        del self.stack[-1]

    def handle_data(self, data):
        """Forward character data to the content handler.

        Data seen while no element is open is dropped.  Whitespace-only
        data inside an element whose HTML_DTD entry does not list
        '#PCDATA' is reported as ignorable whitespace; everything else is
        converted with to_xml_string and reported as characters.
        """
        if not self.stack:
            return

        # NOTE(review): same as-written HTML_DTD lookup as in
        # finish_starttag -- confirm the key case.
        allowed = HTML_DTD.get(self.stack[-1], [])
        if '#PCDATA' not in allowed and not data.strip():
            # This is probably ignorable whitespace.
            self._cont_handler.ignorableWhitespace(data)
        else:
            self._cont_handler.characters(to_xml_string(data, self._encoding))

    def close(self):
        """Finish parsing: close all open elements and end the document."""
        SGMLParser.close(self)
        # Emit the pending end tags innermost element first.
        self.stack.reverse()
        for tag in self.stack:
            self.unknown_endtag(tag)
        self.stack = []
        self._cont_handler.endDocument()
def create_parser():
    """Build a SaxHtmlParser using its default constructor arguments."""
    parser = SaxHtmlParser()
    return parser