########################################################################
#
# File Name: HtmlLib.py
#
#
"""
Components for reading HTML files using htmllib.py.
WWW: http://4suite.com/4DOM e-mail: support@4suite.com
Copyright (c) 2000 Fourthought Inc, USA. All Rights Reserved.
See http://4suite.com/COPYRIGHT for license and copyright information
"""
import os, urllib
from xml.dom.ext import reader
from xml.dom import Node
import Sgmlop
class Reader(reader.Reader):
    """Reader that builds DOM nodes from HTML input via Sgmlop.HtmlParser."""

    def __init__(self):
        # A single parser is kept per reader; fromStream() re-initialises it
        # before every parse.
        self.parser = Sgmlop.HtmlParser()

    def fromStream(self, stream, ownerDoc=None, charset=''):
        """Parse HTML read from `stream`.

        If `ownerDoc` is given, the parsed fragment (the parser's rootNode)
        is returned unchanged.  If `ownerDoc` is None, the parsed content is
        merged into the fragment's own document, the fragment is released,
        and that document is returned.
        """
        html_parser = self.parser
        html_parser.initParser()
        html_parser.initState(ownerDoc, charset)
        html_parser.parse(stream)
        fragment = html_parser.rootNode

        if ownerDoc is not None:
            # An owner document was passed: return the fragment, as is.
            return fragment

        # No owner document: use the document the fragment belongs to.  That
        # document already carries an automatically generated skeleton (its
        # root element's last child is treated as BODY below), so the merge
        # strategy depends on whether the fragment holds an HTML element.
        document = fragment.ownerDocument
        for node in fragment.childNodes:
            is_html_element = (node.nodeType == Node.ELEMENT_NODE
                               and node.tagName == 'HTML')
            if not is_html_element:
                continue

            root = document.documentElement
            # Clean up the junk automatically generated by the
            # HTMLDomImplementation ...
            while root.firstChild:
                stale = root.removeChild(root.firstChild)
                self.releaseNode(stale)
            # ... then move the parsed HTML element's children under the
            # document's root element.
            while node.firstChild:
                root.appendChild(node.firstChild)
            break
        else:
            # No HTML element was found in the fragment, so everything is
            # appended under the document's BODY element (the last child of
            # the root element).
            document.documentElement.lastChild.appendChild(fragment)

        self.releaseNode(fragment)
        return document

    def fromUri(self, uri, ownerDoc=None, charset=''):
        """Resolve `uri` with reader.BASIC_RESOLVER and parse the stream.

        The stream is closed whether or not parsing succeeds.  Returns
        whatever fromStream() returns.
        """
        source = reader.BASIC_RESOLVER.resolve(uri)
        try:
            result = self.fromStream(source, ownerDoc, charset)
        finally:
            source.close()
        return result

    def fromString(self, str, ownerDoc=None, charset=''):
        """Wrap `str` in a reader.StrStream and parse it.

        The stream is closed whether or not parsing succeeds.  Returns
        whatever fromStream() returns.
        """
        source = reader.StrStream(str)
        try:
            result = self.fromStream(source, ownerDoc, charset)
        finally:
            source.close()
        return result
########################## Deprecated ##############################
def FromHtmlStream(fp, ownerDoc=None, charset=''):
    """Deprecated: parse HTML from the stream `fp`.

    Equivalent to Reader().fromStream(fp, ownerDoc, charset).
    """
    html_reader = Reader()
    return html_reader.fromStream(fp, ownerDoc, charset)
def FromHtmlFile(fileName, ownerDoc=None, charset=''):
    """Deprecated: parse the HTML found at `fileName`.

    `fileName` is handed to Reader.fromUri() unchanged; equivalent to
    Reader().fromUri(fileName, ownerDoc, charset).
    """
    html_reader = Reader()
    return html_reader.fromUri(fileName, ownerDoc, charset)
def FromHtmlUrl(url, ownerDoc=None, charset=''):
    """Deprecated: parse the HTML found at `url`.

    Equivalent to Reader().fromUri(url, ownerDoc, charset).
    """
    html_reader = Reader()
    return html_reader.fromUri(url, ownerDoc, charset)
def FromHtml(text, ownerDoc=None, charset=''):
    """Deprecated: parse HTML held in the string `text`.

    Equivalent to Reader().fromString(text, ownerDoc, charset).
    """
    html_reader = Reader()
    return html_reader.fromString(text, ownerDoc, charset)