001    /*
002     *  This file is part of the Jikes RVM project (http://jikesrvm.org).
003     *
004     *  This file is licensed to You under the Eclipse Public License (EPL);
005     *  You may not use this file except in compliance with the License. You
006     *  may obtain a copy of the License at
007     *
008     *      http://www.opensource.org/licenses/eclipse-1.0.php
009     *
010     *  See the COPYRIGHT.txt file distributed with this work for information
011     *  regarding copyright ownership.
012     */
013    package org.jikesrvm.classloader;
014    
015    import java.io.UTFDataFormatException;
016    import java.nio.ByteBuffer;
017    import org.vmmagic.pragma.Pure;
018    import org.jikesrvm.VM;
019    import org.vmmagic.pragma.Inline;
020    import org.vmmagic.pragma.NoInline;
021    
022    /**
023     * UTF8Convert
024     *
025     * Abstract class that contains conversion routines to/from utf8
026     * and/or pseudo-utf8.  It does not support utf8 encodings of
027     * more than 3 bytes.
028     *
029     * The difference between utf8 and pseudo-utf8 is the special
030     * treatment of null.  In utf8, null is encoded as a single byte
031     * directly, whereas in pseudo-utf8, it is encoded as a two-byte
032     * sequence.  See the JVM spec for more information.
033     */
034    public abstract class UTF8Convert {
035    
036      /**
037       * Strictly check the format of the utf8/pseudo-utf8 byte array in
038       * fromUTF8.
039       */
040      static final boolean STRICTLY_CHECK_FORMAT = false;
041      /**
042       * Set fromUTF8 to not throw an exception when given a normal utf8
043       * byte array.
044       */
045      static final boolean ALLOW_NORMAL_UTF8 = false;
046      /**
047       * Set fromUTF8 to not throw an exception when given a pseudo utf8
048       * byte array.
049       */
050      static final boolean ALLOW_PSEUDO_UTF8 = true;
051      /**
052       * Set toUTF8 to write in pseudo-utf8 (rather than normal utf8).
053       */
054      static final boolean WRITE_PSEUDO_UTF8 = true;
055    
056      /**
057       * UTF8 character visitor abstraction
058       */
059      private abstract static class UTF8CharacterVisitor {
060        abstract void visit_char(char c);
061      }
062    
063      /**
064       * Visitor that builds up a char[] as characters are decoded
065       */
066      private static final class ByteArrayStringEncoderVisitor extends UTF8CharacterVisitor {
067        final char[] result;
068        int index;
069        ByteArrayStringEncoderVisitor(int length) {
070          result = new char[length];
071          index = 0;
072        }
073        void visit_char(char c) {
074          result[index] = c;
075          index++;
076        }
077        public String toString() {
078          if (VM.runningVM) {
079            return java.lang.JikesRVMSupport.newStringWithoutCopy(result, 0, index);
080          } else {
081            return new String(result, 0, index);
082          }
083        }
084      }
085    
086      /**
087       * Visitor that builds up a char[] as characters are decoded
088       */
089      private static final class ByteBufferStringEncoderVisitor extends UTF8CharacterVisitor {
090        final char[] result;
091        int index;
092        ByteBufferStringEncoderVisitor(int length) {
093          result = new char[length];
094          index = 0;
095        }
096        void visit_char(char c) {
097          result[index] = c;
098          index++;
099        }
100        public String toString() {
101          if (VM.runningVM) {
102            return java.lang.JikesRVMSupport.newStringWithoutCopy(result, 0, index);
103          } else {
104            return new String(result, 0, index);
105          }
106        }
107      }
108    
109      /**
110       * Visitor that builds up a String.hashCode form hashCode as characters are decoded
111       */
112      private static final class StringHashCodeVisitor extends UTF8CharacterVisitor {
113        int result = 0;
114        void visit_char(char c) {
115          result = result * 31 + c;
116        }
117        int getResult() {
118          return result;
119        }
120      }
121    
122      /**
123       * Convert the given sequence of (pseudo-)utf8 formatted bytes
124       * into a String.
125       *
126       * The acceptable input formats are controlled by the
127       * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
128       * flags.
129       *
130       * @param utf8 (pseudo-)utf8 byte array
131       * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
132       * @return unicode string
133       */
134      public static String fromUTF8(byte[] utf8) throws UTFDataFormatException {
135        UTF8CharacterVisitor visitor = new ByteArrayStringEncoderVisitor(utf8.length);
136        visitUTF8(utf8, visitor);
137        return visitor.toString();
138      }
139    
140      /**
141       * Convert the given sequence of (pseudo-)utf8 formatted bytes
142       * into a String.
143       *
144       * The acceptable input formats are controlled by the
145       * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
146       * flags.
147       *
148       * @param utf8 (pseudo-)utf8 byte array
149       * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
150       * @return unicode string
151       */
152      public static String fromUTF8(ByteBuffer utf8) throws UTFDataFormatException {
153        UTF8CharacterVisitor visitor = new ByteBufferStringEncoderVisitor(utf8.remaining());
154        visitUTF8(utf8, visitor);
155        return visitor.toString();
156      }
157    
158      /**
159       * Convert the given sequence of (pseudo-)utf8 formatted bytes
160       * into a String hashCode.
161       *
162       * The acceptable input formats are controlled by the
163       * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
164       * flags.
165       *
166       * @param utf8 (pseudo-)utf8 byte array
167       * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
168       * @return hashCode corresponding to if this were a String.hashCode
169       */
170      public static int computeStringHashCode(byte[] utf8) throws UTFDataFormatException {
171        StringHashCodeVisitor visitor = new StringHashCodeVisitor();
172        visitUTF8(utf8, visitor);
173        return visitor.getResult();
174      }
175    
176      /**
177       * Generate exception messages without bloating code
178       */
179      @NoInline
180      private static void throwDataFormatException(String message, int location) throws UTFDataFormatException {
181        throw new UTFDataFormatException(message + " at location " + location);
182      }
183    
184      /**
185       * Visit all bytes of the given utf8 string calling the visitor when a
186       * character is decoded.
187       *
188       * The acceptable input formats are controlled by the
189       * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
190       * flags.
191       *
192       * @param utf8 (pseudo-)utf8 byte array
193       * @param visitor called when characters are decoded
194       * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
195       */
196      @Inline
197      private static void visitUTF8(byte[] utf8, UTF8CharacterVisitor visitor) throws UTFDataFormatException {
198        for (int i = 0, n = utf8.length; i < n;) {
199          byte b = utf8[i++];
200          if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8) {
201            if (b == 0) {
202              throwDataFormatException("0 byte encountered", i-1);
203            }
204          }
205          if (b >= 0) {  // < 0x80 unsigned
206            // in the range '\001' to '\177'
207            visitor.visit_char((char) b);
208            continue;
209          }
210          try {
211            byte nb = utf8[i++];
212            if (b < -32) {  // < 0xe0 unsigned
213              // '\000' or in the range '\200' to '\u07FF'
214              char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f));
215              visitor.visit_char(c);
216              if (STRICTLY_CHECK_FORMAT) {
217                if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) {
218                  throwDataFormatException("invalid marker bits for double byte char" , i-2);
219                }
220                if (c < '\200') {
221                  if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) {
222                    throwDataFormatException("encountered double byte char that should have been single byte", i-2);
223                  }
224                } else if (c > '\u07FF') {
225                  throwDataFormatException("encountered double byte char that should have been single byte", i-2);
226                }
227              }
228            } else {
229              byte nnb = utf8[i++];
230              // in the range '\u0800' to '\uFFFF'
231              char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f));
232              visitor.visit_char(c);
233              if (STRICTLY_CHECK_FORMAT) {
234                if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) {
235                  throwDataFormatException("invalid marker bits for triple byte char", i - 3);
236                }
237                if (c < '\u0800') {
238                  throwDataFormatException("encountered triple byte char that should have been fewer bytes", i - 3);
239                }
240              }
241            }
242          } catch (ArrayIndexOutOfBoundsException e) {
243            throwDataFormatException("unexpected end", i);
244          }
245        }
246      }
247    
248      /**
249       * Visit all bytes of the given utf8 string calling the visitor when a
250       * character is decoded.
251       *
252       * The acceptable input formats are controlled by the
253       * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
254       * flags.
255       *
256       * @param utf8 (pseudo-)utf8 byte array
257       * @param visitor called when characters are decoded
258       * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
259       */
260      @Inline
261      private static void visitUTF8(ByteBuffer utf8, UTF8CharacterVisitor visitor) throws UTFDataFormatException {
262        while (utf8.hasRemaining()) {
263          byte b = utf8.get();
264          if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8) {
265            if (b == 0) {
266              throwDataFormatException("0 byte encountered", utf8.position() - 1);
267            }
268          }
269          if (b >= 0) {  // < 0x80 unsigned
270            // in the range '\001' to '\177'
271            visitor.visit_char((char) b);
272            continue;
273          }
274          try {
275            byte nb = utf8.get();
276            if (b < -32) {  // < 0xe0 unsigned
277              // '\000' or in the range '\200' to '\u07FF'
278              char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f));
279              visitor.visit_char(c);
280              if (STRICTLY_CHECK_FORMAT) {
281                if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) {
282                  throwDataFormatException("invalid marker bits for double byte char", utf8.position() - 2);
283                }
284                if (c < '\200') {
285                  if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) {
286                    throwDataFormatException("encountered double byte char that should have been single byte", utf8.position() - 2);
287                  }
288                } else if (c > '\u07FF') {
289                  throwDataFormatException("encountered double byte char that should have been single byte", utf8.position() - 2);
290                }
291              }
292            } else {
293              byte nnb = utf8.get();
294              // in the range '\u0800' to '\uFFFF'
295              char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f));
296              visitor.visit_char(c);
297              if (STRICTLY_CHECK_FORMAT) {
298                if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) {
299                  throwDataFormatException("invalid marker bits for triple byte char", utf8.position() - 3);
300                }
301                if (c < '\u0800') {
302                  throwDataFormatException("encountered triple byte char that should have been fewer bytes", utf8.position() - 3);
303                }
304              }
305            }
306          } catch (ArrayIndexOutOfBoundsException e) {
307            throwDataFormatException("unexpected end", utf8.position());
308          }
309        }
310      }
311    
312      /**
313       * Convert the given String into a sequence of (pseudo-)utf8
314       * formatted bytes.
315       *
316       * The output format is controlled by the WRITE_PSEUDO_UTF8 flag.
317       *
318       * @param s String to convert
319       * @return array containing sequence of (pseudo-)utf8 formatted bytes
320       */
321      public static byte[] toUTF8(String s) {
322        byte[] result = new byte[utfLength(s)];
323        int result_index = 0;
324        for (int i = 0, n = s.length(); i < n; ++i) {
325          char c = s.charAt(i);
326          // in all shifts below, c is an (unsigned) char,
327          // so either >>> or >> is ok
328          if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) {
329            result[result_index++] = (byte) c;
330          } else if (c > 0x07FF) {
331            result[result_index++] = (byte) (0xe0 | (byte) (c >> 12));
332            result[result_index++] = (byte) (0x80 | ((c & 0xfc0) >> 6));
333            result[result_index++] = (byte) (0x80 | (c & 0x3f));
334          } else {
335            result[result_index++] = (byte) (0xc0 | (byte) (c >> 6));
336            result[result_index++] = (byte) (0x80 | (c & 0x3f));
337          }
338        }
339        return result;
340      }
341    
342      /**
343       * Convert the given String into a sequence of (pseudo-)utf8
344       * formatted bytes.
345       *
346       * The output format is controlled by the WRITE_PSEUDO_UTF8 flag.
347       *
348       * @param s String to convert
349       * @param b Byte buffer to hold result
350       */
351      @Inline
352      public static void toUTF8(String s, ByteBuffer b) {
353        int result_index = 0;
354        for (int i = 0, n = s.length(); i < n; ++i) {
355          char c = s.charAt(i);
356          // in all shifts below, c is an (unsigned) char,
357          // so either >>> or >> is ok
358          if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) {
359            b.put((byte) c);
360          } else if (c > 0x07FF) {
361            b.put((byte) (0xe0 | (byte) (c >> 12)));
362            b.put((byte) (0x80 | ((c & 0xfc0) >> 6)));
363            b.put((byte) (0x80 | (c & 0x3f)));
364          } else {
365            b.put((byte) (0xc0 | (byte) (c >> 6)));
366            b.put((byte) (0x80 | (c & 0x3f)));
367          }
368        }
369      }
370    
371      /**
372       * Returns the length of a string's UTF encoded form.
373       */
374      @Pure
375      public static int utfLength(String s) {
376        int utflen = 0;
377        for (int i = 0, n = s.length(); i < n; ++i) {
378          int c = s.charAt(i);
379          if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) {
380            ++utflen;
381          } else if (c > 0x07FF) {
382            utflen += 3;
383          } else {
384            utflen += 2;
385          }
386        }
387        return utflen;
388      }
389    
390      /**
391       * Check whether the given sequence of bytes is valid (pseudo-)utf8.
392       *
393       * @param bytes byte array to check
394       * @return true iff the given sequence is valid (pseudo-)utf8.
395       */
396      public static boolean check(byte[] bytes) {
397        for (int i = 0, n = bytes.length; i < n;) {
398          byte b = bytes[i++];
399          if (!ALLOW_NORMAL_UTF8) {
400            if (b == 0) return false;
401          }
402          if (b >= 0) {  // < 0x80 unsigned
403            // in the range '\001' to '\177'
404            continue;
405          }
406          try {
407            byte nb = bytes[i++];
408            if (b < -32) {  // < 0xe0 unsigned
409              // '\000' or in the range '\200' to '\u07FF'
410              char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f));
411              if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) {
412                return false;
413              }
414              if (c < '\200') {
415                if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) {
416                  return false;
417                }
418              } else if (c > '\u07FF') {
419                return false;
420              }
421            } else {
422              byte nnb = bytes[i++];
423              // in the range '\u0800' to '\uFFFF'
424              char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f));
425              if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) {
426                return false;
427              }
428              if (c < '\u0800') {
429                return false;
430              }
431            }
432          } catch (ArrayIndexOutOfBoundsException e) {
433            return false;
434          }
435        }
436        return true;
437      }
438    }