|
| Source file | Conditionals | Statements | Methods | TOTAL |
|---|---|---|---|---|
| EncodingHeuristics.java | 77.8% | 90.5% | 66.7% | 85.8% |
|
||||||||||||||
| 1 | /* Copyright 2002, 2003, 2005 Elliotte Rusty Harold | |
| 2 | ||
| 3 | This library is free software; you can redistribute it and/or modify | |
| 4 | it under the terms of version 2.1 of the GNU Lesser General Public | |
| 5 | License as published by the Free Software Foundation. | |
| 6 | ||
| 7 | This library is distributed in the hope that it will be useful, | |
| 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 10 | GNU Lesser General Public License for more details. | |
| 11 | ||
| 12 | You should have received a copy of the GNU Lesser General Public | |
| 13 | License along with this library; if not, write to the | |
| 14 | Free Software Foundation, Inc., 59 Temple Place, Suite 330, | |
| 15 | Boston, MA 02111-1307 USA | |
| 16 | ||
| 17 | You can contact Elliotte Rusty Harold by sending e-mail to | |
| 18 | elharo@metalab.unc.edu. Please include the word "XOM" in the | |
| 19 | subject line. The XOM home page is located at http://www.xom.nu/ | |
| 20 | */ | |
| 21 | ||
| 22 | package nu.xom.xinclude; | |
| 23 | ||
| 24 | import java.io.IOException; | |
| 25 | import java.io.InputStream; | |
| 26 | ||
| 27 | /** | |
| 28 | * <p> | |
| 29 | * <code>EncodingHeuristics</code> reads from a stream | |
| 30 | * (which should be buffered) and attempts to guess | |
| 31 | * what the encoding of the text in the stream is. | |
| 32 | * Byte order marks are stripped from the stream. | |
| 33 | * If it fails to determine the type of the encoding, | |
| 34 | * it returns the default UTF-8. | |
| 35 | * </p> | |
| 36 | * | |
| 37 | * | |
| 38 | * @author Elliotte Rusty Harold | |
| 39 | * @version 1.0 | |
| 40 | */ | |
| 41 | class EncodingHeuristics { | |
| 42 | ||
| 43 | // No instances allowed | |
| 44 | 0 | private EncodingHeuristics() {} |
| 45 | ||
| 46 | ||
| 47 | /** | |
| 48 | * <p> | |
| 49 | * This utility method uses a variety of heuristics to | |
| 50 | * attempt to guess the encoding from the initial | |
| 51 | * characters. | |
| 52 | * </p> | |
| 53 | * | |
| 54 | * @param in <code>InputStream</code> to read from. | |
| 55 | * @return the name of the detected encoding, or "UTF-8" if it cannot be determined | |
| 56 | * @throws IOException if the stream cannot be reset back | |
| 57 | * to where it was when the method was invoked. | |
| 58 | */ | |
| 59 | 24 | public static String readEncodingFromStream(InputStream in) |
| 60 | throws IOException { | |
| 61 | ||
| 62 | // This may fail if there are a lot of space | |
| 63 | // characters before the end of the encoding declaration | |
| 64 | 24 | in.mark(1024); |
| 65 | ||
| 66 | 24 | try { |
| 67 | // Lots of things can go wrong here. If any do, | |
| 68 | // return "UTF-8" as the default. | |
| 69 | 24 | int byte1 = in.read(); |
| 70 | 24 | int byte2 = in.read(); |
| 71 | 24 | if (byte1 == 0xFE && byte2 == 0xFF) { |
| 72 | // Don't reset because the byte order mark should not be | |
| 73 | // included per section 4.3 of the XInclude spec | |
| 74 | 1 | return "UnicodeBig"; |
| 75 | } | |
| 76 | 23 | else if (byte1 == 0xFF && byte2 == 0xFE) { |
| 77 | // Don't reset because the byte order mark should not be | |
| 78 | // included per section 4.3 of the XInclude spec | |
| 79 | 1 | return "UnicodeLittle"; |
| 80 | } | |
| 81 | ||
| 82 | /* In accordance with the Character Model, | |
| 83 | when the text format is a Unicode encoding, the XInclude | |
| 84 | processor must fail the inclusion when the text in the | |
| 85 | selected range is non-normalized. When transcoding | |
| 86 | characters to a Unicode encoding from a legacy encoding, | |
| 87 | a normalizing transcoder must be used. */ | |
| 88 | ||
| 89 | 22 | int byte3 = in.read(); |
| 90 | // check for UTF-8 byte order mark | |
| 91 | 22 | if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF) { |
| 92 | // Don't reset because the byte order mark should not be | |
| 93 | // included per section 4.3 of the XInclude spec | |
| 94 | 1 | return "UTF-8"; |
| 95 | } | |
| 96 | ||
| 97 | 21 | int byte4 = in.read(); |
| 98 | 21 | if (byte1 == 0x00 |
| 99 | && byte2 == 0x00 && byte3 == 0xFE && byte4 == 0xFF) { | |
| 100 | // Don't reset because the byte order mark should not be | |
| 101 | // included per section 4.3 of the XInclude spec | |
| 102 | // Most Java VMs don't support this next one | |
| 103 | 0 | return "UTF32BE"; |
| 104 | } | |
| 105 | 21 | else if (byte1 == 0x00 && byte2 == 0x00 |
| 106 | && byte3 == 0xFF && byte4 == 0xFE) { | |
| 107 | // Don't reset because the byte order mark should not be | |
| 108 | // included per section 4.3 of the XInclude spec | |
| 109 | // Most Java VMs don't support this next one | |
| 110 | 0 | return "UTF32LE"; |
| 111 | } | |
| 112 | ||
| 113 | // no byte order mark present; first character must be | |
| 114 | // less than sign or white space | |
| 115 | // Let's look for less-than signs first | |
| 116 | 21 | if (byte1 == 0x00 && byte2 == 0x00 |
| 117 | && byte3 == 0x00 && byte4 == '<') { | |
| 118 | 0 | in.reset(); |
| 119 | 0 | return "UTF32BE"; |
| 120 | } | |
| 121 | 21 | else if (byte1 == '<' && byte2 == 0x00 |
| 122 | && byte3 == 0x00 && byte4 == 0x00) { | |
| 123 | 0 | in.reset(); |
| 124 | 0 | return "UTF32LE"; |
| 125 | } | |
| 126 | 21 | else if (byte1 == 0x00 && byte2 == '<' |
| 127 | && byte3 == 0x00 && byte4 == '?') { | |
| 128 | 1 | in.reset(); |
| 129 | 1 | return "UnicodeBigUnmarked"; |
| 130 | } | |
| 131 | 20 | else if (byte1 == '<' && byte2 == 0x00 |
| 132 | && byte3 == '?' && byte4 == 0x00) { | |
| 133 | 1 | in.reset(); |
| 134 | 1 | return "UnicodeLittleUnmarked"; |
| 135 | } | |
| 136 | 19 | else if (byte1 == '<' && byte2 == '?' |
| 137 | && byte3 == 'x' && byte4 == 'm') { | |
| 138 | // ASCII compatible, must read encoding declaration. | |
| 139 | // 1024 bytes will be far enough to read most | |
| 140 | // XML declarations | |
| 141 | 2 | byte[] data = new byte[1024]; |
| 142 | 2 | data[0] = (byte) byte1; |
| 143 | 2 | data[1] = (byte) byte2; |
| 144 | 2 | data[2] = (byte) byte3; |
| 145 | 2 | data[3] = (byte) byte4; |
| 146 | 2 | int length = in.read(data, 4, 1020) + 4; |
| 147 | // Use Latin-1 (ISO-8859-1) because it's ASCII compatible | |
| 148 | // and all byte sequences are legal Latin-1 sequences | |
| 149 | // so I don't have to worry about encoding errors if I | |
| 150 | // slip past the end of the XML/text declaration | |
| 151 | 2 | String declaration=new String(data, 0, length, "8859_1"); |
| 152 | // If any of these throw a | |
| 153 | // StringIndexOutOfBoundsException, | |
| 154 | // we just fall into the catch block and return "UTF-8" | |
| 155 | // since this can't be well-formed XML | |
| 156 | 2 | String encoding = findEncodingDeclaration(declaration); |
| 157 | 1 | in.reset(); |
| 158 | 1 | return encoding; |
| 159 | ||
| 160 | } | |
| 161 | 17 | else if (byte1 == 0x4C && byte2 == 0x6F |
| 162 | && byte3 == 0xA7 && byte4 == 0x94) { | |
| 163 | // EBCDIC compatible, must read encoding declaration | |
| 164 | 1 | byte[] buffer = new byte[1016]; |
| 165 | 47 | for (int i = 0; i < buffer.length; i++) { |
| 166 | 47 | int c = in.read(); |
| 167 | 1 | if (c == -1) break; |
| 168 | 46 | buffer[i] = (byte) c; |
| 169 | } | |
| 170 | 1 | in.reset(); |
| 171 | // Most EBCDIC encodings are compatible with Cp037 over | |
| 172 | // the range we care about | |
| 173 | 1 | return findEncodingDeclaration(new String(buffer, "Cp037")); |
| 174 | } | |
| 175 | ||
| 176 | } | |
| 177 | catch (Exception ex) { | |
| 178 | 1 | in.reset(); |
| 179 | 1 | return "UTF-8"; |
| 180 | } | |
| 181 | ||
| 182 | // no XML or text declaration present | |
| 183 | 16 | in.reset(); |
| 184 | 16 | return "UTF-8"; |
| 185 | ||
| 186 | } | |
| 187 | ||
| 188 | ||
| 189 | 3 | private static String findEncodingDeclaration(String declaration) |
| 190 | throws IOException { | |
| 191 | ||
| 192 | 3 | int position = declaration.indexOf("encoding") + 8; |
| 193 | 3 | char c; |
| 194 | // get rid of white space before equals sign | |
| 195 | 3 | while (true) { |
| 196 | 3 | c = declaration.charAt(position++); |
| 197 | 3 | if (c !=' ' && c != '\t' && c != '\r' && c != '\n') { |
| 198 | 3 | break; |
| 199 | } | |
| 200 | } | |
| 201 | 3 | if (c != '=') { // malformed |
| 202 | 1 | throw new IOException("Couldn't determine encoding"); |
| 203 | } | |
| 204 | // get rid of white space after equals sign | |
| 205 | 2 | while (true) { |
| 206 | 2 | c = declaration.charAt(position++); |
| 207 | 2 | if (c !=' ' && c != '\t' && c != '\r' && c != '\n') { |
| 208 | 2 | break; |
| 209 | } | |
| 210 | } | |
| 211 | 2 | char delimiter = c; |
| 212 | 2 | if (delimiter != '\'' && delimiter != '"') { // malformed |
| 213 | 0 | return "UTF-8"; |
| 214 | } | |
| 215 | // now positioned to read encoding name | |
| 216 | 2 | StringBuffer encodingName = new StringBuffer(); |
| 217 | 2 | while (true) { |
| 218 | 12 | c = declaration.charAt(position++); |
| 219 | 2 | if (c == delimiter) break; |
| 220 | 10 | encodingName.append(c); |
| 221 | } | |
| 222 | 2 | return encodingName.toString(); |
| 223 | ||
| 224 | } | |
| 225 | ||
| 226 | } |
|
||||||||||