<?xml encoding="cp-1252" ?>

<!-- Controls how numeric character references with values from 0x80 to 0x9f
     are interpreted. When set to "true", references in that range map to the
     corresponding cp-1252 characters. Otherwise, numeric references refer to
     ISO 10646 code points. -->

<favor-cp1252-numeric-references key = "true" />

<!-- Upper limit on the size of a segment (131072 = 128 * 1024).
     NOTE(review): the unit (bytes vs. characters) is not stated in this
     file; confirm against the processor that reads this setting. -->
<maximum-segment-size>
	<item key = "131072" />
</maximum-segment-size>

<!-- Before a segment reaches the segment size limit, a buffer of this size
     is reserved, and the processor searches for the first space in the text
     regions, or the first non-embedded markup, and ends the segment there.
     NOTE(review): presumably expressed in the same unit as
     maximum-segment-size; confirm. -->

<segment-boundary-threshold key = "1024" />


<!-- Limits the depth of the markup tree. Once the markup tree grows past the
     depth given below, the processor throws an unbalanced_markup exception.

     The default value was chosen on the assumption that a well-formed HTML
     document is very unlikely to nest markup tags beyond this configurable
     depth. For bad input, the exception is thrown so that the caller may
     catch it. Possible recovery strategies include analyzing the document
     as plain text, discarding the document, or repairing the nested markup. -->
    
<maximum-markup-depth>
        <item key = "200" />
</maximum-markup-depth>

<!-- Markup elements that may be embedded inside a token without breaking the
     token (e.g. <b>H</b>elp ==> Help). This list may not overlap with the
     remove-markup list, the remove-region list, or the segment-markup list.
     (Note that elements not listed here will break text units.) -->
    
<embedded-markup>
        <item key = "b" />
        <item key = "big" />
        <item key = "font" />
        <item key = "i" />
        <item key = "s" />
        <item key = "small" />
        <item key = "strike" />
        <item key = "tt" />           
        <item key = "u" />      
</embedded-markup>  


<!-- Empty markup elements. These elements do not have any content.
     NOTE(review): "hr" is also listed in segment-markup. The overlap rules
     stated in this file do not mention empty-markup, so this is presumably
     intentional; confirm. -->

<empty-markup>
        <item key = "area" />
        <item key = "base" />
        <item key = "basefont" />
        <item key = "bgsound" />
        <item key = "br" />
        <item key = "col" />
        <item key = "embed" />
        <item key = "frame" />
        <item key = "hr" />
        <item key = "img" />
        <item key = "input" />
        <item key = "isindex" />
        <item key = "keygen" />
        <item key = "link" />
        <item key = "meta" />
        <item key = "param" />
        <item key = "spacer" />
        <item key = "wbr" />
</empty-markup>


<!-- The content of these elements is treated as literal text; any nested
     markup elements are ignored. When the start tag is seen, the HTML
     processor scans the content until it reaches the corresponding end tag
     or EOF. These tags may not be embedded tags.
     All four entries below are also listed in remove-region. -->

<multimedia-markup>
        <item key = "applet" />
        <item key = "code" />
        <item key = "script" />
        <item key = "server" />   
</multimedia-markup>


<!-- Markup elements that break a segment.
     This list may not overlap with the embedded-markup, remove-markup, or
     remove-region list. The priority attribute value must be a natural number
     (i.e. any integer > 0). A larger number means a higher priority. -->

<segment-markup>
        <item key = "address" priority = "1" />
        <item key = "blockquote" priority = "1" /> 
        <item key = "body" priority = "1" /> 
        <item key = "center" priority = "1" />
        <item key = "dir" priority = "1" />
        <item key = "div" priority = "1" />
        <item key = "dl" priority = "1" />
        <item key = "fieldset" priority = "1" />
        <item key = "form" priority = "1" />
        <item key = "h1" priority = "1" />
        <item key = "h2" priority = "1" />
        <item key = "h3" priority = "1" />
        <item key = "h4" priority = "1" />
        <item key = "h5" priority = "1" />
        <item key = "h6" priority = "1" />
        <item key = "head" priority = "1" />
        <item key = "hr" priority = "1" />
        <item key = "html" priority = "1" />
        <item key = "menu" priority = "1" />
        <item key = "multicol" priority = "1" />
        <item key = "ol" priority = "1" />
        <item key = "p" priority = "1" />
        <item key = "pre" priority = "1" />
        <item key = "table" priority = "1" />
        <item key = "ul" priority = "1" />
</segment-markup>


<!-- The following markup elements will not appear in the markup tree, but
     their content WILL be kept in the text unit list of the segment.
     This list may not overlap with the embedded-markup list, the
     segment-markup list, or the remove-region list.
     The list is currently empty. -->
          
<remove-markup>
</remove-markup>


<!-- The following markup elements are removed from the segment together with
     their content. Elements listed here must not also appear in the
     embedded-markup list, the remove-markup list, or the segment-markup list.
     NOTE(review): the "!" entry presumably matches <!...> constructs such as
     comments and declarations; confirm against the processor. -->
      
<remove-region>
        <item key = "!" />
        <item key = "applet" />
        <item key = "code" />
        <item key = "iframe" />
        <item key = "layer" />
        <item key = "noembed" />
        <item key = "noframes" />
        <item key = "nolayer" />
        <item key = "noscript" />
        <item key = "object" />
        <item key = "script" />
        <item key = "select" />
        <item key = "server" />
        <item key = "style" />
        <item key = "textarea" />
        <item key = "title" />
</remove-region>


<!-- ************************************************************************************ -->
<!-- Markup elements whose end tags are optional.
     The key of each <list> is an element whose end tag may be omitted.
     The keys of the <item>s inside that list are the tags that end the
     element named by the enclosing list.
     NOTE(review): presumably these are start tags that implicitly close the
     open element (e.g. a new <li> ends the previous <li>); confirm against
     the processor. -->

<optional-end-tag>
        <list key = "colgroup">
                <item key = "colgroup" />
                <item key = "tbody" />
                <item key = "tfoot" />
                <item key = "thead" />
                <item key = "tr" />
        </list>
        <list key = "dd">
                <item key = "dd" />
                <item key = "dt" />
        </list>
        <list key = "dt">
                <item key = "dd" />
                <item key = "dt" />
        </list>
        <list key = "head">
                <item key = "body" />
                <item key = "frameset" />
        </list>
        <list key = "li">
                <item key = "li" />
        </list>
        <list key = "option">
                <item key = "option" />
                <item key = "optgroup" />
        </list>
        <list key = "p">
                <item key = "address" />
                <item key = "blockquote" />
                <item key = "body" />
                <item key = "center" />
                <item key = "dir" />
                <item key = "div" />
                <item key = "dl" />
                <item key = "fieldset" />  
                <item key = "form" />
                <item key = "h1" />
                <item key = "h2" />
                <item key = "h3" />
                <item key = "h4" />
                <item key = "h5" />    
                <item key = "h6" />
                <item key = "head" />
                <item key = "hr" />
                <item key = "menu" />
                <item key = "multicol" />
                <item key = "ol" />
                <item key = "p" />
                <item key = "pre" />
                <item key = "table" />
                <item key = "ul" />
        </list>
        <list key = "tbody">
                <item key = "tbody" />
                <item key = "tfoot" />
                <item key = "thead" />
        </list>
        <list key = "td">
                <item key = "td" />
                <item key = "th" />
                <item key = "tr" />
        </list>
        <list key = "tfoot">
                <item key = "tbody" />
                <item key = "tfoot" />
                <item key = "thead" />
        </list>
        <list key = "th">
                <item key = "td" />
                <item key = "th" />
                <item key = "tr" />
        </list>
        <list key = "thead">
                <item key = "tbody" />
                <item key = "tfoot" />
                <item key = "thead" />
        </list>
        <list key = "tr">
                <item key = "tbody" />
                <item key = "tfoot" />
                <item key = "thead" />
                <item key = "tr" />
        </list>          
</optional-end-tag>