<
p:library xmlns:p=
"http://www.w3.org/ns/xproc" xmlns:px=
"http://www.daisy.org/ns/pipeline/xproc" xmlns:html=
"http://www.w3.org/1999/xhtml" xmlns:ssml=
"http://www.w3.org/2001/10/synthesis" version=
"1.0">
<p:declare-step type="px:html-break-detect">
    <p:documentation>Break an input XHTML document into words and sentences by inserting word and sentence elements.</p:documentation>

    <!-- Document to process. -->
    <p:input port="source" primary="true"/>

    <!-- No explicit binding: XProc connects this port to the primary output of the
         last step of the subpipeline (px:break-and-reshape). -->
    <p:output port="result" primary="true"/>

    <!-- Forwarded as-is from the "sentence-ids" port of px:break-and-reshape; the
         shape of that document is defined by the imported step. -->
    <p:output port="sentence-ids">
        <p:pipe port="sentence-ids" step="generic"/>
    </p:output>

    <p:option name="sentence-attr" required="false" select="''">
        <p:documentation xmlns="http://www.w3.org/1999/xhtml">
            <p>Attribute to be added to sentence <code>span</code> elements.</p>
        </p:documentation>
    </p:option>

    <p:option name="sentence-attr-val" required="false" select="''">
        <p:documentation xmlns="http://www.w3.org/1999/xhtml">
            <p>Corresponding attribute value.</p>
            <p><code>span</code> elements in the input that are already marked up as sentences using the
            specified attribute and value are preserved. No additional sentence detection is performed
            within such elements.</p>
        </p:documentation>
    </p:option>

    <p:import href="http://www.daisy.org/pipeline/modules/nlp-common/library.xpl">
        <p:documentation>
            px:break-and-reshape
        </p:documentation>
    </p:import>

    <!-- Pattern text of the form  html:span[@ATTR="VALUE"]  identifying spans that are
         already marked as sentences, or the empty string when no sentence attribute
         was given. Double quotes inside the value are doubled, which is how XPath 2.0
         escapes a quote inside a double-quoted string literal. The quote characters
         are written as &quot; because the select attribute itself is delimited by
         double quotes. -->
    <p:variable name="existing-sentence-match-pattern"
                select="if ($sentence-attr!='') then concat('html:span[@',$sentence-attr,'=&quot;', replace($sentence-attr-val,'&quot;','&quot;&quot;'), '&quot;]') else ''"/>

    <!-- The element lists below are kept on one line each so that the attribute
         values handed to px:break-and-reshape contain no extra whitespace.
         * ensure-word-after names html:span with the html: prefix, matching
           ensure-word-before and every other entry (an unprefixed "span" would
           presumably not refer to XHTML spans; confirm against px:break-and-reshape).
         * can-contain-sentences names html:kbd, the HTML keyboard-input element. -->
    <px:break-and-reshape name="generic"
                          inline-tags="html:span|html:i|html:b|html:a|html:br|html:del|html:font|html:ruby|html:s| html:small|html:strike|html:strong|html:sup|html:sub|html:u|html:q|html:address| html:abbr|html:em|html:style|ssml:phoneme"
                          ensure-word-before="html:span|html:br|html:ruby|html:s|html:address|html:abbr|html:style"
                          ensure-word-after="html:span|html:br|html:ruby|html:s|html:address|html:abbr|html:style"
                          can-contain-sentences="html:body|html:section|html:nav|html:article|html:aside|html:h1|html:h2| html:h3|html:h4|html:h5|html:h6|html:header|html:footer|html:address| html:p|html:pre|html:blockquote|html:li|html:dt|html:dd|html:a|html:q| html:cite|html:em|html:strong|html:small|html:mark|html:dfn|html:abbr| html:time|html:progress|html:meter|html:code|html:var|html:samp|html:kbd| html:sub|html:sup|html:span|html:i|html:b|html:bdo|html:rt|html:ins| html:del|html:caption|html:figcaption|html:td|html:th|html:form|html:label| html:input|html:button|html:datalist|html:output|html:bb|html:menu| html:legend|html:div"
                          output-ns="http://www.w3.org/1999/xhtml"
                          output-word-tag="span"
                          word-attr="role"
                          word-attr-val="word"
                          output-sentence-tag="span"
                          output-subsentence-tag="span"
                          exclusive-sentence-tag="false"
                          exclusive-word-tag="false">
        <!-- Caller-supplied sentence marker, passed through unchanged. -->
        <p:with-option name="sentence-attr" select="$sentence-attr"/>
        <p:with-option name="sentence-attr-val" select="$sentence-attr-val"/>
        <!-- Spans already marked as sentences (empty string when none can exist). -->
        <p:with-option name="special-sentences" select="$existing-sentence-match-pattern"/>
        <!-- Any element that is, or contains, an already-marked sentence span;
             empty string when no sentence attribute was given. -->
        <p:with-option name="cannot-be-sentence-child"
                       select="if ($existing-sentence-match-pattern!='') then concat('*[descendant-or-self::',$existing-sentence-match-pattern,']') else ''"/>
    </px:break-and-reshape>
</p:declare-step>
<p:declare-step type="px:html-unwrap-words">
    <p:documentation>Strip word markup from the input document: span elements that carry nothing but role="word" are unwrapped, and role="word" is dropped from the remaining span elements.</p:documentation>

    <p:input primary="true" port="source"/>
    <p:output primary="true" port="result"/>

    <!-- Pass 1: a span whose only attribute is role="word" is replaced by its
         children, so the wrapper element disappears entirely. -->
    <p:unwrap match="html:span[@role='word' and not(@* except @role)]"/>

    <!-- Pass 2: spans that survived pass 1 still have other attributes; only their
         role="word" attribute is removed and the element itself is kept. -->
    <p:delete match="html:span/@role[.='word']"/>
</p:declare-step>
</
p:library>