001/*
002 *  This file is part of the Jikes RVM project (http://jikesrvm.org).
003 *
004 *  This file is licensed to You under the Eclipse Public License (EPL);
005 *  You may not use this file except in compliance with the License. You
006 *  may obtain a copy of the License at
007 *
008 *      http://www.opensource.org/licenses/eclipse-1.0.php
009 *
010 *  See the COPYRIGHT.txt file distributed with this work for information
011 *  regarding copyright ownership.
012 */
013package org.jikesrvm.classloader;
014
015import java.io.UTFDataFormatException;
016import java.nio.ByteBuffer;
017import org.vmmagic.pragma.Pure;
018import org.jikesrvm.VM;
019import org.vmmagic.pragma.Inline;
020import org.vmmagic.pragma.NoInline;
021
022/**
023 * Abstract class that contains conversion routines to/from utf8
024 * and/or pseudo-utf8.  It does not support utf8 encodings of
025 * more than 3 bytes.
026 * <p>
027 * The difference between utf8 and pseudo-utf8 is the special
028 * treatment of null.  In utf8, null is encoded as a single byte
029 * directly, whereas in pseudo-utf8, it is encoded as a two-byte
030 * sequence.  See the JVM specification for more information.
031 */
032public abstract class UTF8Convert {
033
034  /**
035   * Strictly check the format of the utf8/pseudo-utf8 byte array in
036   * fromUTF8.
037   */
038  static final boolean STRICTLY_CHECK_FORMAT = false;
039  /**
040   * Set fromUTF8 to not throw an exception when given a normal utf8
041   * byte array.
042   */
043  static final boolean ALLOW_NORMAL_UTF8 = false;
044  /**
045   * Set fromUTF8 to not throw an exception when given a pseudo utf8
046   * byte array.
047   */
048  static final boolean ALLOW_PSEUDO_UTF8 = true;
049  /**
050   * Set toUTF8 to write in pseudo-utf8 (rather than normal utf8).
051   */
052  static final boolean WRITE_PSEUDO_UTF8 = true;
053
054  /**
055   * UTF8 character visitor abstraction
056   */
057  private abstract static class UTF8CharacterVisitor {
058    abstract void visit_char(char c);
059  }
060
061  /**
062   * Visitor that builds up a char[] as characters are decoded
063   */
064  private static final class ByteArrayStringEncoderVisitor extends UTF8CharacterVisitor {
065    final char[] result;
066    int index;
067    ByteArrayStringEncoderVisitor(int length) {
068      result = new char[length];
069      index = 0;
070    }
071    @Override
072    void visit_char(char c) {
073      result[index] = c;
074      index++;
075    }
076    @Override
077    public String toString() {
078      if (VM.runningVM) {
079        return java.lang.JikesRVMSupport.newStringWithoutCopy(result, 0, index);
080      } else {
081        return new String(result, 0, index);
082      }
083    }
084  }
085
086  /**
087   * Visitor that builds up a char[] as characters are decoded
088   */
089  private static final class ByteBufferStringEncoderVisitor extends UTF8CharacterVisitor {
090    final char[] result;
091    int index;
092    ByteBufferStringEncoderVisitor(int length) {
093      result = new char[length];
094      index = 0;
095    }
096    @Override
097    void visit_char(char c) {
098      result[index] = c;
099      index++;
100    }
101    @Override
102    public String toString() {
103      if (VM.runningVM) {
104        return java.lang.JikesRVMSupport.newStringWithoutCopy(result, 0, index);
105      } else {
106        return new String(result, 0, index);
107      }
108    }
109  }
110
111  /**
112   * Visitor that builds up a String.hashCode form hashCode as characters are decoded
113   */
114  private static final class StringHashCodeVisitor extends UTF8CharacterVisitor {
115    int result = 0;
116    @Override
117    void visit_char(char c) {
118      result = result * 31 + c;
119    }
120    int getResult() {
121      return result;
122    }
123  }
124
125  /**
126   * Convert the given sequence of (pseudo-)utf8 formatted bytes
127   * into a String.<p>
128   *
129   * The acceptable input formats are controlled by the
130   * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
131   * flags.
132   *
133   * @param utf8 (pseudo-)utf8 byte array
134   * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
135   * @return unicode string
136   */
137  public static String fromUTF8(byte[] utf8) throws UTFDataFormatException {
138    UTF8CharacterVisitor visitor = new ByteArrayStringEncoderVisitor(utf8.length);
139    visitUTF8(utf8, visitor);
140    return visitor.toString();
141  }
142
143  /**
144   * Convert the given sequence of (pseudo-)utf8 formatted bytes
145   * into a String.
146   *
147   * The acceptable input formats are controlled by the
148   * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
149   * flags.<p>
150   *
151   * @param utf8 (pseudo-)utf8 byte array
152   * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
153   * @return unicode string
154   */
155  public static String fromUTF8(ByteBuffer utf8) throws UTFDataFormatException {
156    UTF8CharacterVisitor visitor = new ByteBufferStringEncoderVisitor(utf8.remaining());
157    visitUTF8(utf8, visitor);
158    return visitor.toString();
159  }
160
161  /**
162   * Convert the given sequence of (pseudo-)utf8 formatted bytes
163   * into a String hashCode.<p>
164   *
165   * The acceptable input formats are controlled by the
166   * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
167   * flags.
168   *
169   * @param utf8 (pseudo-)utf8 byte array
170   * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
171   * @return hashCode corresponding to if this were a String.hashCode
172   */
173  public static int computeStringHashCode(byte[] utf8) throws UTFDataFormatException {
174    StringHashCodeVisitor visitor = new StringHashCodeVisitor();
175    visitUTF8(utf8, visitor);
176    return visitor.getResult();
177  }
178
179  @NoInline
180  private static void throwDataFormatException(String message, int location) throws UTFDataFormatException {
181    throw new UTFDataFormatException(message + " at location " + location);
182  }
183
184  /**
185   * Visit all bytes of the given utf8 string calling the visitor when a
186   * character is decoded.<p>
187   *
188   * The acceptable input formats are controlled by the
189   * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
190   * flags.
191   *
192   * @param utf8 (pseudo-)utf8 byte array
193   * @param visitor called when characters are decoded
194   * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
195   */
196  @Inline
197  private static void visitUTF8(byte[] utf8, UTF8CharacterVisitor visitor) throws UTFDataFormatException {
198    for (int i = 0, n = utf8.length; i < n;) {
199      byte b = utf8[i++];
200      if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8) {
201        if (b == 0) {
202          throwDataFormatException("0 byte encountered", i - 1);
203        }
204      }
205      if (b >= 0) {  // < 0x80 unsigned
206        // in the range '\001' to '\177'
207        visitor.visit_char((char) b);
208        continue;
209      }
210      try {
211        byte nb = utf8[i++];
212        if (b < -32) {  // < 0xe0 unsigned
213          // '\000' or in the range '\200' to '\u07FF'
214          char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f));
215          visitor.visit_char(c);
216          if (STRICTLY_CHECK_FORMAT) {
217            if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) {
218              throwDataFormatException("invalid marker bits for double byte char" , i - 2);
219            }
220            if (c < '\200') {
221              if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) {
222                throwDataFormatException("encountered double byte char that should have been single byte", i - 2);
223              }
224            } else if (c > '\u07FF') {
225              throwDataFormatException("encountered double byte char that should have been single byte", i - 2);
226            }
227          }
228        } else {
229          byte nnb = utf8[i++];
230          // in the range '\u0800' to '\uFFFF'
231          char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f));
232          visitor.visit_char(c);
233          if (STRICTLY_CHECK_FORMAT) {
234            if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) {
235              throwDataFormatException("invalid marker bits for triple byte char", i - 3);
236            }
237            if (c < '\u0800') {
238              throwDataFormatException("encountered triple byte char that should have been fewer bytes", i - 3);
239            }
240          }
241        }
242      } catch (ArrayIndexOutOfBoundsException e) {
243        throwDataFormatException("unexpected end", i);
244      }
245    }
246  }
247
248  /**
249   * Visit all bytes of the given utf8 string calling the visitor when a
250   * character is decoded.<p>
251   *
252   * The acceptable input formats are controlled by the
253   * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
254   * flags.
255   *
256   * @param utf8 (pseudo-)utf8 byte array
257   * @param visitor called when characters are decoded
258   * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
259   */
260  @Inline
261  private static void visitUTF8(ByteBuffer utf8, UTF8CharacterVisitor visitor) throws UTFDataFormatException {
262    while (utf8.hasRemaining()) {
263      byte b = utf8.get();
264      if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8) {
265        if (b == 0) {
266          throwDataFormatException("0 byte encountered", utf8.position() - 1);
267        }
268      }
269      if (b >= 0) {  // < 0x80 unsigned
270        // in the range '\001' to '\177'
271        visitor.visit_char((char) b);
272        continue;
273      }
274      try {
275        byte nb = utf8.get();
276        if (b < -32) {  // < 0xe0 unsigned
277          // '\000' or in the range '\200' to '\u07FF'
278          char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f));
279          visitor.visit_char(c);
280          if (STRICTLY_CHECK_FORMAT) {
281            if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) {
282              throwDataFormatException("invalid marker bits for double byte char", utf8.position() - 2);
283            }
284            if (c < '\200') {
285              if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) {
286                throwDataFormatException("encountered double byte char that should have been single byte", utf8.position() - 2);
287              }
288            } else if (c > '\u07FF') {
289              throwDataFormatException("encountered double byte char that should have been single byte", utf8.position() - 2);
290            }
291          }
292        } else {
293          byte nnb = utf8.get();
294          // in the range '\u0800' to '\uFFFF'
295          char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f));
296          visitor.visit_char(c);
297          if (STRICTLY_CHECK_FORMAT) {
298            if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) {
299              throwDataFormatException("invalid marker bits for triple byte char", utf8.position() - 3);
300            }
301            if (c < '\u0800') {
302              throwDataFormatException("encountered triple byte char that should have been fewer bytes", utf8.position() - 3);
303            }
304          }
305        }
306      } catch (ArrayIndexOutOfBoundsException e) {
307        throwDataFormatException("unexpected end", utf8.position());
308      }
309    }
310  }
311
312  /**
313   * Convert the given String into a sequence of (pseudo-)utf8
314   * formatted bytes.<p>
315   *
316   * The output format is controlled by the WRITE_PSEUDO_UTF8 flag.
317   *
318   * @param s String to convert
319   * @return array containing sequence of (pseudo-)utf8 formatted bytes
320   */
321  public static byte[] toUTF8(String s) {
322    byte[] result = new byte[utfLength(s)];
323    int result_index = 0;
324    for (int i = 0, n = s.length(); i < n; ++i) {
325      char c = s.charAt(i);
326      // in all shifts below, c is an (unsigned) char,
327      // so either >>> or >> is ok
328      if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) {
329        result[result_index++] = (byte) c;
330      } else if (c > 0x07FF) {
331        result[result_index++] = (byte) (0xe0 | (byte) (c >> 12));
332        result[result_index++] = (byte) (0x80 | ((c & 0xfc0) >> 6));
333        result[result_index++] = (byte) (0x80 | (c & 0x3f));
334      } else {
335        result[result_index++] = (byte) (0xc0 | (byte) (c >> 6));
336        result[result_index++] = (byte) (0x80 | (c & 0x3f));
337      }
338    }
339    return result;
340  }
341
342  /**
343   * Convert the given String into a sequence of (pseudo-)utf8
344   * formatted bytes.<p>
345   *
346   * The output format is controlled by the WRITE_PSEUDO_UTF8 flag.
347   *
348   * @param s String to convert
349   * @param b Byte buffer to hold result
350   */
351  @Inline
352  public static void toUTF8(String s, ByteBuffer b) {
353    int result_index = 0;
354    for (int i = 0, n = s.length(); i < n; ++i) {
355      char c = s.charAt(i);
356      // in all shifts below, c is an (unsigned) char,
357      // so either >>> or >> is ok
358      if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) {
359        b.put((byte) c);
360      } else if (c > 0x07FF) {
361        b.put((byte) (0xe0 | (byte) (c >> 12)));
362        b.put((byte) (0x80 | ((c & 0xfc0) >> 6)));
363        b.put((byte) (0x80 | (c & 0x3f)));
364      } else {
365        b.put((byte) (0xc0 | (byte) (c >> 6)));
366        b.put((byte) (0x80 | (c & 0x3f)));
367      }
368    }
369  }
370
371  @Pure
372  public static int utfLength(String s) {
373    int utflen = 0;
374    for (int i = 0, n = s.length(); i < n; ++i) {
375      int c = s.charAt(i);
376      if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) {
377        ++utflen;
378      } else if (c > 0x07FF) {
379        utflen += 3;
380      } else {
381        utflen += 2;
382      }
383    }
384    return utflen;
385  }
386
387  /**
388   * Check whether the given sequence of bytes is valid (pseudo-)utf8.
389   *
390   * @param bytes byte array to check
391   * @return {@code true} iff the given sequence is valid (pseudo-)utf8.
392   */
393  public static boolean check(byte[] bytes) {
394    for (int i = 0, n = bytes.length; i < n;) {
395      byte b = bytes[i++];
396      if (!ALLOW_NORMAL_UTF8) {
397        if (b == 0) return false;
398      }
399      if (b >= 0) {  // < 0x80 unsigned
400        // in the range '\001' to '\177'
401        continue;
402      }
403      try {
404        byte nb = bytes[i++];
405        if (b < -32) {  // < 0xe0 unsigned
406          // '\000' or in the range '\200' to '\u07FF'
407          char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f));
408          if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) {
409            return false;
410          }
411          if (c < '\200') {
412            if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) {
413              return false;
414            }
415          } else if (c > '\u07FF') {
416            return false;
417          }
418        } else {
419          byte nnb = bytes[i++];
420          // in the range '\u0800' to '\uFFFF'
421          char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f));
422          if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) {
423            return false;
424          }
425          if (c < '\u0800') {
426            return false;
427          }
428        }
429      } catch (ArrayIndexOutOfBoundsException e) {
430        return false;
431      }
432    }
433    return true;
434  }
435}