001 /*
002 * This file is part of the Jikes RVM project (http://jikesrvm.org).
003 *
004 * This file is licensed to You under the Eclipse Public License (EPL);
005 * You may not use this file except in compliance with the License. You
006 * may obtain a copy of the License at
007 *
008 * http://www.opensource.org/licenses/eclipse-1.0.php
009 *
010 * See the COPYRIGHT.txt file distributed with this work for information
011 * regarding copyright ownership.
012 */
013 package org.jikesrvm.classloader;
014
015 import java.io.UTFDataFormatException;
016 import java.nio.ByteBuffer;
017 import org.vmmagic.pragma.Pure;
018 import org.jikesrvm.VM;
019 import org.vmmagic.pragma.Inline;
020 import org.vmmagic.pragma.NoInline;
021
022 /**
023 * UTF8Convert
024 *
025 * Abstract class that contains conversion routines to/from utf8
026 * and/or pseudo-utf8. It does not support utf8 encodings of
027 * more than 3 bytes.
028 *
029 * The difference between utf8 and pseudo-utf8 is the special
030 * treatment of null. In utf8, null is encoded as a single byte
031 * directly, whereas in pseudo-utf8, it is encoded as a two-byte
032 * sequence. See the JVM spec for more information.
033 */
034 public abstract class UTF8Convert {
035
036 /**
037 * Strictly check the format of the utf8/pseudo-utf8 byte array in
038 * fromUTF8.
039 */
040 static final boolean STRICTLY_CHECK_FORMAT = false;
041 /**
042 * Set fromUTF8 to not throw an exception when given a normal utf8
043 * byte array.
044 */
045 static final boolean ALLOW_NORMAL_UTF8 = false;
046 /**
047 * Set fromUTF8 to not throw an exception when given a pseudo utf8
048 * byte array.
049 */
050 static final boolean ALLOW_PSEUDO_UTF8 = true;
051 /**
052 * Set toUTF8 to write in pseudo-utf8 (rather than normal utf8).
053 */
054 static final boolean WRITE_PSEUDO_UTF8 = true;
055
056 /**
057 * UTF8 character visitor abstraction
058 */
059 private abstract static class UTF8CharacterVisitor {
060 abstract void visit_char(char c);
061 }
062
063 /**
064 * Visitor that builds up a char[] as characters are decoded
065 */
066 private static final class ByteArrayStringEncoderVisitor extends UTF8CharacterVisitor {
067 final char[] result;
068 int index;
069 ByteArrayStringEncoderVisitor(int length) {
070 result = new char[length];
071 index = 0;
072 }
073 void visit_char(char c) {
074 result[index] = c;
075 index++;
076 }
077 public String toString() {
078 if (VM.runningVM) {
079 return java.lang.JikesRVMSupport.newStringWithoutCopy(result, 0, index);
080 } else {
081 return new String(result, 0, index);
082 }
083 }
084 }
085
086 /**
087 * Visitor that builds up a char[] as characters are decoded
088 */
089 private static final class ByteBufferStringEncoderVisitor extends UTF8CharacterVisitor {
090 final char[] result;
091 int index;
092 ByteBufferStringEncoderVisitor(int length) {
093 result = new char[length];
094 index = 0;
095 }
096 void visit_char(char c) {
097 result[index] = c;
098 index++;
099 }
100 public String toString() {
101 if (VM.runningVM) {
102 return java.lang.JikesRVMSupport.newStringWithoutCopy(result, 0, index);
103 } else {
104 return new String(result, 0, index);
105 }
106 }
107 }
108
109 /**
110 * Visitor that builds up a String.hashCode form hashCode as characters are decoded
111 */
112 private static final class StringHashCodeVisitor extends UTF8CharacterVisitor {
113 int result = 0;
114 void visit_char(char c) {
115 result = result * 31 + c;
116 }
117 int getResult() {
118 return result;
119 }
120 }
121
122 /**
123 * Convert the given sequence of (pseudo-)utf8 formatted bytes
124 * into a String.
125 *
126 * The acceptable input formats are controlled by the
127 * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
128 * flags.
129 *
130 * @param utf8 (pseudo-)utf8 byte array
131 * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
132 * @return unicode string
133 */
134 public static String fromUTF8(byte[] utf8) throws UTFDataFormatException {
135 UTF8CharacterVisitor visitor = new ByteArrayStringEncoderVisitor(utf8.length);
136 visitUTF8(utf8, visitor);
137 return visitor.toString();
138 }
139
140 /**
141 * Convert the given sequence of (pseudo-)utf8 formatted bytes
142 * into a String.
143 *
144 * The acceptable input formats are controlled by the
145 * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
146 * flags.
147 *
148 * @param utf8 (pseudo-)utf8 byte array
149 * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
150 * @return unicode string
151 */
152 public static String fromUTF8(ByteBuffer utf8) throws UTFDataFormatException {
153 UTF8CharacterVisitor visitor = new ByteBufferStringEncoderVisitor(utf8.remaining());
154 visitUTF8(utf8, visitor);
155 return visitor.toString();
156 }
157
158 /**
159 * Convert the given sequence of (pseudo-)utf8 formatted bytes
160 * into a String hashCode.
161 *
162 * The acceptable input formats are controlled by the
163 * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
164 * flags.
165 *
166 * @param utf8 (pseudo-)utf8 byte array
167 * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
168 * @return hashCode corresponding to if this were a String.hashCode
169 */
170 public static int computeStringHashCode(byte[] utf8) throws UTFDataFormatException {
171 StringHashCodeVisitor visitor = new StringHashCodeVisitor();
172 visitUTF8(utf8, visitor);
173 return visitor.getResult();
174 }
175
176 /**
177 * Generate exception messages without bloating code
178 */
179 @NoInline
180 private static void throwDataFormatException(String message, int location) throws UTFDataFormatException {
181 throw new UTFDataFormatException(message + " at location " + location);
182 }
183
184 /**
185 * Visit all bytes of the given utf8 string calling the visitor when a
186 * character is decoded.
187 *
188 * The acceptable input formats are controlled by the
189 * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
190 * flags.
191 *
192 * @param utf8 (pseudo-)utf8 byte array
193 * @param visitor called when characters are decoded
194 * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
195 */
196 @Inline
197 private static void visitUTF8(byte[] utf8, UTF8CharacterVisitor visitor) throws UTFDataFormatException {
198 for (int i = 0, n = utf8.length; i < n;) {
199 byte b = utf8[i++];
200 if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8) {
201 if (b == 0) {
202 throwDataFormatException("0 byte encountered", i-1);
203 }
204 }
205 if (b >= 0) { // < 0x80 unsigned
206 // in the range '\001' to '\177'
207 visitor.visit_char((char) b);
208 continue;
209 }
210 try {
211 byte nb = utf8[i++];
212 if (b < -32) { // < 0xe0 unsigned
213 // '\000' or in the range '\200' to '\u07FF'
214 char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f));
215 visitor.visit_char(c);
216 if (STRICTLY_CHECK_FORMAT) {
217 if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) {
218 throwDataFormatException("invalid marker bits for double byte char" , i-2);
219 }
220 if (c < '\200') {
221 if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) {
222 throwDataFormatException("encountered double byte char that should have been single byte", i-2);
223 }
224 } else if (c > '\u07FF') {
225 throwDataFormatException("encountered double byte char that should have been single byte", i-2);
226 }
227 }
228 } else {
229 byte nnb = utf8[i++];
230 // in the range '\u0800' to '\uFFFF'
231 char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f));
232 visitor.visit_char(c);
233 if (STRICTLY_CHECK_FORMAT) {
234 if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) {
235 throwDataFormatException("invalid marker bits for triple byte char", i - 3);
236 }
237 if (c < '\u0800') {
238 throwDataFormatException("encountered triple byte char that should have been fewer bytes", i - 3);
239 }
240 }
241 }
242 } catch (ArrayIndexOutOfBoundsException e) {
243 throwDataFormatException("unexpected end", i);
244 }
245 }
246 }
247
248 /**
249 * Visit all bytes of the given utf8 string calling the visitor when a
250 * character is decoded.
251 *
252 * The acceptable input formats are controlled by the
253 * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
254 * flags.
255 *
256 * @param utf8 (pseudo-)utf8 byte array
257 * @param visitor called when characters are decoded
258 * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
259 */
260 @Inline
261 private static void visitUTF8(ByteBuffer utf8, UTF8CharacterVisitor visitor) throws UTFDataFormatException {
262 while (utf8.hasRemaining()) {
263 byte b = utf8.get();
264 if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8) {
265 if (b == 0) {
266 throwDataFormatException("0 byte encountered", utf8.position() - 1);
267 }
268 }
269 if (b >= 0) { // < 0x80 unsigned
270 // in the range '\001' to '\177'
271 visitor.visit_char((char) b);
272 continue;
273 }
274 try {
275 byte nb = utf8.get();
276 if (b < -32) { // < 0xe0 unsigned
277 // '\000' or in the range '\200' to '\u07FF'
278 char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f));
279 visitor.visit_char(c);
280 if (STRICTLY_CHECK_FORMAT) {
281 if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) {
282 throwDataFormatException("invalid marker bits for double byte char", utf8.position() - 2);
283 }
284 if (c < '\200') {
285 if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) {
286 throwDataFormatException("encountered double byte char that should have been single byte", utf8.position() - 2);
287 }
288 } else if (c > '\u07FF') {
289 throwDataFormatException("encountered double byte char that should have been single byte", utf8.position() - 2);
290 }
291 }
292 } else {
293 byte nnb = utf8.get();
294 // in the range '\u0800' to '\uFFFF'
295 char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f));
296 visitor.visit_char(c);
297 if (STRICTLY_CHECK_FORMAT) {
298 if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) {
299 throwDataFormatException("invalid marker bits for triple byte char", utf8.position() - 3);
300 }
301 if (c < '\u0800') {
302 throwDataFormatException("encountered triple byte char that should have been fewer bytes", utf8.position() - 3);
303 }
304 }
305 }
306 } catch (ArrayIndexOutOfBoundsException e) {
307 throwDataFormatException("unexpected end", utf8.position());
308 }
309 }
310 }
311
312 /**
313 * Convert the given String into a sequence of (pseudo-)utf8
314 * formatted bytes.
315 *
316 * The output format is controlled by the WRITE_PSEUDO_UTF8 flag.
317 *
318 * @param s String to convert
319 * @return array containing sequence of (pseudo-)utf8 formatted bytes
320 */
321 public static byte[] toUTF8(String s) {
322 byte[] result = new byte[utfLength(s)];
323 int result_index = 0;
324 for (int i = 0, n = s.length(); i < n; ++i) {
325 char c = s.charAt(i);
326 // in all shifts below, c is an (unsigned) char,
327 // so either >>> or >> is ok
328 if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) {
329 result[result_index++] = (byte) c;
330 } else if (c > 0x07FF) {
331 result[result_index++] = (byte) (0xe0 | (byte) (c >> 12));
332 result[result_index++] = (byte) (0x80 | ((c & 0xfc0) >> 6));
333 result[result_index++] = (byte) (0x80 | (c & 0x3f));
334 } else {
335 result[result_index++] = (byte) (0xc0 | (byte) (c >> 6));
336 result[result_index++] = (byte) (0x80 | (c & 0x3f));
337 }
338 }
339 return result;
340 }
341
342 /**
343 * Convert the given String into a sequence of (pseudo-)utf8
344 * formatted bytes.
345 *
346 * The output format is controlled by the WRITE_PSEUDO_UTF8 flag.
347 *
348 * @param s String to convert
349 * @param b Byte buffer to hold result
350 */
351 @Inline
352 public static void toUTF8(String s, ByteBuffer b) {
353 int result_index = 0;
354 for (int i = 0, n = s.length(); i < n; ++i) {
355 char c = s.charAt(i);
356 // in all shifts below, c is an (unsigned) char,
357 // so either >>> or >> is ok
358 if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) {
359 b.put((byte) c);
360 } else if (c > 0x07FF) {
361 b.put((byte) (0xe0 | (byte) (c >> 12)));
362 b.put((byte) (0x80 | ((c & 0xfc0) >> 6)));
363 b.put((byte) (0x80 | (c & 0x3f)));
364 } else {
365 b.put((byte) (0xc0 | (byte) (c >> 6)));
366 b.put((byte) (0x80 | (c & 0x3f)));
367 }
368 }
369 }
370
371 /**
372 * Returns the length of a string's UTF encoded form.
373 */
374 @Pure
375 public static int utfLength(String s) {
376 int utflen = 0;
377 for (int i = 0, n = s.length(); i < n; ++i) {
378 int c = s.charAt(i);
379 if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) {
380 ++utflen;
381 } else if (c > 0x07FF) {
382 utflen += 3;
383 } else {
384 utflen += 2;
385 }
386 }
387 return utflen;
388 }
389
390 /**
391 * Check whether the given sequence of bytes is valid (pseudo-)utf8.
392 *
393 * @param bytes byte array to check
394 * @return true iff the given sequence is valid (pseudo-)utf8.
395 */
396 public static boolean check(byte[] bytes) {
397 for (int i = 0, n = bytes.length; i < n;) {
398 byte b = bytes[i++];
399 if (!ALLOW_NORMAL_UTF8) {
400 if (b == 0) return false;
401 }
402 if (b >= 0) { // < 0x80 unsigned
403 // in the range '\001' to '\177'
404 continue;
405 }
406 try {
407 byte nb = bytes[i++];
408 if (b < -32) { // < 0xe0 unsigned
409 // '\000' or in the range '\200' to '\u07FF'
410 char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f));
411 if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) {
412 return false;
413 }
414 if (c < '\200') {
415 if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) {
416 return false;
417 }
418 } else if (c > '\u07FF') {
419 return false;
420 }
421 } else {
422 byte nnb = bytes[i++];
423 // in the range '\u0800' to '\uFFFF'
424 char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f));
425 if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) {
426 return false;
427 }
428 if (c < '\u0800') {
429 return false;
430 }
431 }
432 } catch (ArrayIndexOutOfBoundsException e) {
433 return false;
434 }
435 }
436 return true;
437 }
438 }