[Home] [By Thread] [By Date] [Recent Entries]
On 2011-05-20 17:34, Julian Reschke wrote:
Hi, Thanks for all the feedback. In the end I went for a pure XSLT2 implementation, supporting ISO-8859-1 and UTF-8. See below. I'm doing a lot of XSLT 1.0 but not so much XSLT 2.0, so comments on how to make this more elegant are welcome. XSLT (to be applied to some random XML): <?xml version="1.0" encoding="ISO-8859-1"?>
<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
               xmlns:xs="http://www.w3.org/2001/XMLSchema"
               xmlns:myns="mailto:julian.reschke@xxxxxxxxxxxxx?subject=pctdecode"
               version="2.0"
               exclude-result-prefixes="myns">

  <!--
    Percent-decoding of attribute values in pure XSLT 2.0.

    Pipeline:
      1. myns:pct-decode splits a string into <c> (plain character) and
         <octet> (decimal value of a percent-encoded octet of 128 or more).
      2. myns:decode-iso-8859-1 or myns:decode-utf-8 turns that sequence into
         either <string>...</string> or <illegal-octet>...</illegal-octet>.

    The root template is only a self-test driver: it ignores the input
    document and runs a fixed set of examples through myns:test.
  -->

  <xsl:output method="xml" indent="yes"/>

  <!-- Self-test driver; the source document content is not consulted. -->
  <xsl:template match="/">
    <results>
      <xsl:copy-of select="myns:test('utf-8','A%20C')"/>
      <xsl:copy-of select="myns:test('iso-8859-1','A%20C')"/>
      <xsl:copy-of select="myns:test('utf-8','A%C3%A4')"/>
      <xsl:copy-of select="myns:test('iso-8859-1','A%E4')"/>
      <xsl:copy-of select="myns:test('utf-8','A%E4')"/>
    </results>
  </xsl:template>

  <!--
    myns:test($enc, $value)
      $enc   : encoding label, compared case-insensitively against
               'iso-8859-1' and 'utf-8'
      $value : percent-encoded string to decode
    Returns a <result> element echoing the input and holding the decoder
    output in <parsed>. For any other encoding label <parsed> is empty.
  -->
  <xsl:function name="myns:test">
    <xsl:param name="enc"/>
    <xsl:param name="value"/>
    <result>
      <input>
        <enc><xsl:value-of select="$enc"/></enc>
        <value><xsl:value-of select="$value"/></value>
      </input>
      <parsed>
        <xsl:variable name="raw" select="myns:pct-decode($value)"/>
        <xsl:choose>
          <xsl:when test="lower-case($enc)='iso-8859-1'">
            <xsl:copy-of select="myns:decode-iso-8859-1($raw)"/>
          </xsl:when>
          <xsl:when test="lower-case($enc)='utf-8'">
            <xsl:copy-of select="myns:decode-utf-8($raw)"/>
          </xsl:when>
          <xsl:otherwise>
            <!-- unsupported encoding -->
          </xsl:otherwise>
        </xsl:choose>
      </parsed>
    </result>
  </xsl:function>

  <!--
    Regex building blocks. Each variable body is kept on a single line on
    purpose: any whitespace inside would become part of the pattern text.
    The ampersand must be written as &amp; to keep the stylesheet well-formed.
  -->
  <xsl:variable name="attr-char">!#\$&amp;\+\-\.\^_`\|~<xsl:value-of select="$DIGIT"/><xsl:value-of select="$ALPHA"/></xsl:variable>
  <xsl:variable name="DIGIT">0-9</xsl:variable>
  <xsl:variable name="ALPHA">a-zA-Z</xsl:variable>
  <xsl:variable name="HEXDIG">a-fA-F<xsl:value-of select="$DIGIT"/></xsl:variable>
  <xsl:variable name="pct-encoded">%[<xsl:value-of select="$HEXDIG"/>][<xsl:value-of select="$HEXDIG"/>]</xsl:variable>

  <!--
    myns:pct-decode($s)
    Tokenises $s into a sequence of elements:
      <c>x</c>          for a literal attr-char, or for a percent-encoded
                        octet below 128 (emitted as the character itself)
      <octet>N</octet>  for a percent-encoded octet of 128 or more (decimal)
    Anything matching neither alternative of the regex is dropped, because
    there is no xsl:non-matching-substring branch.
    NOTE(review): codepoints-to-string raises an error for values that are
    not legal XML characters, so input such as %00 will fail here.
  -->
  <xsl:function name="myns:pct-decode">
    <xsl:param name="s"/>
    <xsl:variable name="reg">(<xsl:value-of select="$pct-encoded"/>)|[<xsl:value-of select="$attr-char"/>]</xsl:variable>
    <!-- position in this list minus one is the value of the hex digit -->
    <xsl:variable name="digits" select="('0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F')"/>
    <xsl:analyze-string select="$s" regex="{$reg}" flags="mx">
      <xsl:matching-substring>
        <xsl:choose>
          <xsl:when test="starts-with(.,'%')">
            <xsl:variable name="a" select="index-of($digits,upper-case(substring(.,2,1))) - 1"/>
            <xsl:variable name="b" select="index-of($digits,upper-case(substring(.,3,1))) - 1"/>
            <xsl:variable name="cp" select="$a * 16 + $b"/>
            <xsl:choose>
              <xsl:when test="$cp &gt;= 128">
                <octet><xsl:value-of select="$cp"/></octet>
              </xsl:when>
              <xsl:otherwise>
                <c><xsl:value-of select="codepoints-to-string($cp)"/></c>
              </xsl:otherwise>
            </xsl:choose>
          </xsl:when>
          <xsl:otherwise>
            <!-- single character -->
            <c><xsl:value-of select="."/></c>
          </xsl:otherwise>
        </xsl:choose>
      </xsl:matching-substring>
    </xsl:analyze-string>
  </xsl:function>

  <!--
    myns:decode-iso-8859-1($s)
      $s : sequence of <c>/<octet> elements as produced by myns:pct-decode
    Returns <string> with the decoded text, or <illegal-octet> listing the
    offending octet value(s) when an octet falls in the range 128 to 159.
  -->
  <xsl:function name="myns:decode-iso-8859-1">
    <xsl:param name="s"/>
    <xsl:variable name="result">
      <xsl:for-each select="$s">
        <xsl:choose>
          <xsl:when test="self::octet">
            <xsl:choose>
              <xsl:when test=". &gt; 127 and . &lt; 160">
                <illegal-octet><xsl:value-of select="."/></illegal-octet>
              </xsl:when>
              <xsl:otherwise>
                <!-- for the remaining octets the value is used directly as the code point -->
                <xsl:value-of select="codepoints-to-string(.)"/>
              </xsl:otherwise>
            </xsl:choose>
          </xsl:when>
          <xsl:otherwise>
            <xsl:value-of select="."/>
          </xsl:otherwise>
        </xsl:choose>
      </xsl:for-each>
    </xsl:variable>
    <xsl:choose>
      <xsl:when test="$result/illegal-octet">
        <illegal-octet><xsl:value-of select="$result/illegal-octet"/></illegal-octet>
      </xsl:when>
      <xsl:otherwise>
        <string><xsl:value-of select="$result"/></string>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:function>

  <!--
    myns:decode-utf-8($s)
      $s : sequence of <c>/<octet> elements as produced by myns:pct-decode
    Normalises everything to <o>N</o> octet values, then hands the list to
    the recursive internal-utf8 template.
    Returns <string> with the decoded text, or <illegal-octet> holding the
    first octet at which decoding failed.
  -->
  <xsl:function name="myns:decode-utf-8">
    <xsl:param name="s"/>
    <xsl:variable name="octets">
      <xsl:for-each select="$s">
        <o>
          <xsl:choose>
            <xsl:when test="self::octet"><xsl:value-of select="."/></xsl:when>
            <xsl:otherwise><xsl:value-of select="string-to-codepoints(.)"/></xsl:otherwise>
          </xsl:choose>
        </o>
      </xsl:for-each>
    </xsl:variable>
    <xsl:variable name="result">
      <xsl:call-template name="internal-utf8">
        <xsl:with-param name="octets" select="$octets/*"/>
      </xsl:call-template>
    </xsl:variable>
    <xsl:choose>
      <xsl:when test="$result/illegal-octet">
        <illegal-octet><xsl:value-of select="$result/illegal-octet"/></illegal-octet>
      </xsl:when>
      <xsl:otherwise>
        <string><xsl:value-of select="$result"/></string>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:function>

  <!--
    internal-utf8
      $octets : sequence of elements whose string values are decimal octets
    Decodes one UTF-8 sequence from the front of $octets, emits the
    character, and recurses on the remainder. On the first malformed
    sequence it emits <illegal-octet> with the offending lead octet and
    stops.

    Payload bits per position:
      lead of a 2-octet sequence : low 5 bits (mod 32)
      lead of a 3-octet sequence : low 4 bits (mod 16)
      lead of a 4-octet sequence : low 3 bits (mod 8)
      every continuation octet   : low 6 bits (mod 64)
    Each step multiplies the accumulated value by 64 before adding the next
    continuation payload.
  -->
  <xsl:template name="internal-utf8">
    <xsl:param name="octets"/>

    <xsl:variable name="lead" select="xs:integer($octets[1])"/>

    <!-- Sequence length announced by the lead octet; 0 means "not a lead octet".
         192 and 193 are excluded because they can only start overlong forms,
         245 and above because they would exceed U+10FFFF. -->
    <xsl:variable name="len" select="
        if (empty($lead)) then 0
        else if ($lead lt 128) then 1
        else if ($lead ge 194 and $lead le 223) then 2
        else if ($lead ge 224 and $lead le 239) then 3
        else if ($lead ge 240 and $lead le 244) then 4
        else 0"/>

    <!-- Continuation octets that belong to this sequence (may be too few). -->
    <xsl:variable name="tail" select="
        for $o in $octets[position() gt 1 and position() le $len]
        return xs:integer($o)"/>

    <!-- All announced continuation octets are present and are of the form 10xxxxxx. -->
    <xsl:variable name="well-formed" select="
        $len gt 0
        and count($tail) eq $len - 1
        and (every $t in $tail satisfies ($t ge 128 and $t le 191))"/>

    <xsl:variable name="cp" select="
        if ($len eq 1) then $lead
        else if ($len eq 2) then
          ($lead mod 32) * 64 + ($tail[1] mod 64)
        else if ($len eq 3) then
          (($lead mod 16) * 64 + ($tail[1] mod 64)) * 64 + ($tail[2] mod 64)
        else if ($len eq 4) then
          ((($lead mod 8) * 64 + ($tail[1] mod 64)) * 64 + ($tail[2] mod 64)) * 64 + ($tail[3] mod 64)
        else ()"/>

    <!-- Reject overlong 3/4-octet forms, surrogates, U+FFFE/U+FFFF and values
         above U+10FFFF; codepoints-to-string would raise an error for the
         code points that are not legal XML characters. -->
    <xsl:variable name="valid" select="
        $well-formed and (
          $len eq 1
          or $len eq 2
          or ($len eq 3 and $cp ge 2048 and $cp le 65533
              and not($cp ge 55296 and $cp le 57343))
          or ($len eq 4 and $cp ge 65536 and $cp le 1114111)
        )"/>

    <xsl:choose>
      <xsl:when test="empty($octets)"><!--done--></xsl:when>
      <xsl:when test="$valid">
        <xsl:value-of select="codepoints-to-string($cp)"/>
        <xsl:call-template name="internal-utf8">
          <xsl:with-param name="octets" select="$octets[position() gt $len]"/>
        </xsl:call-template>
      </xsl:when>
      <xsl:otherwise>
        <illegal-octet><xsl:value-of select="$octets[1]"/></illegal-octet>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:template>

</xsl:transform>
Output: <?xml version="1.0" encoding="UTF-8"?>
<results xmlns:xs="http://www.w3.org/2001/XMLSchema">
<result>
<input>
<enc>utf-8</enc>
<value>A%20C</value>
</input>
<parsed>
<string>A C</string>
</parsed>
</result>
<result>
<input>
<enc>iso-8859-1</enc>
<value>A%20C</value>
</input>
<parsed>
<string>A C</string>
</parsed>
</result>
<result>
<input>
<enc>utf-8</enc>
<value>A%C3%A4</value>
</input>
<parsed>
<string>Aä</string>
</parsed>
</result>
<result>
<input>
<enc>iso-8859-1</enc>
<value>A%E4</value>
</input>
<parsed>
<string>Aä</string>
</parsed>
</result>
<result>
<input>
<enc>utf-8</enc>
<value>A%E4</value>
</input>
<parsed>
<illegal-octet>228</illegal-octet>
</parsed>
</result>
</results>Best regards, Julian
|

Cart



