1 /* src/vm/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4 C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5 E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6 J. Wenninger, Institut f. Computersprachen - TU Wien
8 This file is part of CACAO.
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License as
12 published by the Free Software Foundation; either version 2, or (at
13 your option) any later version.
15 This program is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25 Contact: cacao@cacaojvm.org
27 Authors: Reinhard Grafl
33 $Id: utf8.c 6228 2006-12-26 19:56:58Z twisti $
45 #include "mm/memory.h"
47 #if defined(ENABLE_THREADS)
48 # include "threads/native/lock.h"
50 # include "threads/none/lock.h"
53 #include "vm/builtin.h"
54 #include "vm/exceptions.h"
55 #include "vm/hashtable.h"
56 #include "vm/options.h"
57 #include "vm/statistics.h"
58 #include "vm/stringlocal.h"
62 /* global variables ***********************************************************/
64 /* hashsize must be power of 2 */
66 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
68 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
71 /* utf-symbols for pointer comparison of frequently used strings **************/
73 utf *utf_java_lang_Object;
75 utf *utf_java_lang_Class;
76 utf *utf_java_lang_ClassLoader;
77 utf *utf_java_lang_Cloneable;
78 utf *utf_java_lang_SecurityManager;
79 utf *utf_java_lang_String;
80 utf *utf_java_lang_System;
81 utf *utf_java_lang_ThreadGroup;
82 utf *utf_java_io_Serializable;
84 utf *utf_java_lang_Throwable;
85 utf *utf_java_lang_VMThrowable;
86 utf *utf_java_lang_Error;
87 utf *utf_java_lang_AbstractMethodError;
88 utf *utf_java_lang_LinkageError;
89 utf *utf_java_lang_NoClassDefFoundError;
90 utf *utf_java_lang_NoSuchMethodError;
91 utf *utf_java_lang_OutOfMemoryError;
93 utf *utf_java_lang_Exception;
94 utf *utf_java_lang_ClassCastException;
95 utf *utf_java_lang_ClassNotFoundException;
96 utf *utf_java_lang_IllegalArgumentException;
97 utf *utf_java_lang_IllegalMonitorStateException;
99 utf *utf_java_lang_NullPointerException;
101 utf* utf_java_lang_Void;
102 utf* utf_java_lang_Boolean;
103 utf* utf_java_lang_Byte;
104 utf* utf_java_lang_Character;
105 utf* utf_java_lang_Short;
106 utf* utf_java_lang_Integer;
107 utf* utf_java_lang_Long;
108 utf* utf_java_lang_Float;
109 utf* utf_java_lang_Double;
111 utf *utf_java_lang_StackTraceElement;
112 utf *utf_java_lang_reflect_Constructor;
113 utf *utf_java_lang_reflect_Field;
114 utf *utf_java_lang_reflect_Method;
115 utf *utf_java_util_Vector;
117 utf *utf_InnerClasses; /* InnerClasses */
118 utf *utf_ConstantValue; /* ConstantValue */
119 utf *utf_Code; /* Code */
120 utf *utf_Exceptions; /* Exceptions */
121 utf *utf_LineNumberTable; /* LineNumberTable */
122 utf *utf_SourceFile; /* SourceFile */
124 #if defined(ENABLE_JAVASE)
125 utf *utf_EnclosingMethod;
127 utf *utf_RuntimeVisibleAnnotations;
128 utf *utf_StackMapTable;
131 utf *utf_init; /* <init> */
132 utf *utf_clinit; /* <clinit> */
133 utf *utf_clone; /* clone */
134 utf *utf_finalize; /* finalize */
135 utf *utf_run; /* run */
140 utf *utf_removeThread;
145 utf *utf_fillInStackTrace;
146 utf *utf_getSystemClassLoader;
148 utf *utf_printStackTrace;
159 utf *utf_void__void; /* ()V */
160 utf *utf_boolean__void; /* (Z)V */
161 utf *utf_byte__void; /* (B)V */
162 utf *utf_char__void; /* (C)V */
163 utf *utf_short__void; /* (S)V */
164 utf *utf_int__void; /* (I)V */
165 utf *utf_long__void; /* (J)V */
166 utf *utf_float__void; /* (F)V */
167 utf *utf_double__void; /* (D)V */
169 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
170 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
171 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
172 utf *utf_java_lang_Object__java_lang_Object;
173 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
174 utf *utf_java_lang_String__java_lang_Class;
175 utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */
176 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
178 utf *utf_not_named_yet; /* special name for unnamed classes */
180 utf *array_packagename;
183 /* utf_init ********************************************************************
185 Initializes the utf8 subsystem.
187 *******************************************************************************/
191 /* create utf8 hashtable */
193 hashtable_utf = NEW(hashtable);
195 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
197 #if defined(ENABLE_STATISTICS)
199 count_utf_len += sizeof(utf*) * hashtable_utf->size;
202 /* create utf-symbols for pointer comparison of frequently used strings */
204 utf_java_lang_Object = utf_new_char("java/lang/Object");
206 utf_java_lang_Class = utf_new_char("java/lang/Class");
207 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
208 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
209 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
210 utf_java_lang_String = utf_new_char("java/lang/String");
211 utf_java_lang_System = utf_new_char("java/lang/System");
212 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
213 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
215 utf_java_lang_Throwable = utf_new_char(string_java_lang_Throwable);
216 utf_java_lang_VMThrowable = utf_new_char(string_java_lang_VMThrowable);
217 utf_java_lang_Error = utf_new_char(string_java_lang_Error);
219 utf_java_lang_AbstractMethodError =
220 utf_new_char(string_java_lang_AbstractMethodError);
222 utf_java_lang_LinkageError =
223 utf_new_char(string_java_lang_LinkageError);
225 utf_java_lang_NoClassDefFoundError =
226 utf_new_char(string_java_lang_NoClassDefFoundError);
228 utf_java_lang_NoSuchMethodError =
229 utf_new_char(string_java_lang_NoSuchMethodError);
231 utf_java_lang_OutOfMemoryError =
232 utf_new_char(string_java_lang_OutOfMemoryError);
234 utf_java_lang_Exception = utf_new_char(string_java_lang_Exception);
236 utf_java_lang_ClassCastException =
237 utf_new_char(string_java_lang_ClassCastException);
239 utf_java_lang_ClassNotFoundException =
240 utf_new_char(string_java_lang_ClassNotFoundException);
242 utf_java_lang_IllegalArgumentException =
243 utf_new_char(string_java_lang_IllegalArgumentException);
245 utf_java_lang_IllegalMonitorStateException =
246 utf_new_char(string_java_lang_IllegalMonitorStateException);
248 utf_java_lang_NullPointerException =
249 utf_new_char(string_java_lang_NullPointerException);
251 utf_java_lang_Void = utf_new_char("java/lang/Void");
252 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
253 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
254 utf_java_lang_Character = utf_new_char("java/lang/Character");
255 utf_java_lang_Short = utf_new_char("java/lang/Short");
256 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
257 utf_java_lang_Long = utf_new_char("java/lang/Long");
258 utf_java_lang_Float = utf_new_char("java/lang/Float");
259 utf_java_lang_Double = utf_new_char("java/lang/Double");
261 utf_java_lang_StackTraceElement =
262 utf_new_char("java/lang/StackTraceElement");
264 utf_java_lang_reflect_Constructor =
265 utf_new_char("java/lang/reflect/Constructor");
267 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
268 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
269 utf_java_util_Vector = utf_new_char("java/util/Vector");
271 utf_InnerClasses = utf_new_char("InnerClasses");
272 utf_ConstantValue = utf_new_char("ConstantValue");
273 utf_Code = utf_new_char("Code");
274 utf_Exceptions = utf_new_char("Exceptions");
275 utf_LineNumberTable = utf_new_char("LineNumberTable");
276 utf_SourceFile = utf_new_char("SourceFile");
278 #if defined(ENABLE_JAVASE)
279 utf_EnclosingMethod = utf_new_char("EnclosingMethod");
280 utf_Signature = utf_new_char("Signature");
281 utf_RuntimeVisibleAnnotations = utf_new_char("RuntimeVisibleAnnotations");
282 utf_StackMapTable = utf_new_char("StackMapTable");
285 utf_init = utf_new_char("<init>");
286 utf_clinit = utf_new_char("<clinit>");
287 utf_clone = utf_new_char("clone");
288 utf_finalize = utf_new_char("finalize");
289 utf_run = utf_new_char("run");
291 utf_add = utf_new_char("add");
292 utf_remove = utf_new_char("remove");
293 utf_addThread = utf_new_char("addThread");
294 utf_removeThread = utf_new_char("removeThread");
295 utf_put = utf_new_char("put");
296 utf_get = utf_new_char("get");
297 utf_value = utf_new_char("value");
299 utf_printStackTrace = utf_new_char("printStackTrace");
300 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
301 utf_loadClass = utf_new_char("loadClass");
302 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
304 utf_Z = utf_new_char("Z");
305 utf_B = utf_new_char("B");
306 utf_C = utf_new_char("C");
307 utf_S = utf_new_char("S");
308 utf_I = utf_new_char("I");
309 utf_J = utf_new_char("J");
310 utf_F = utf_new_char("F");
311 utf_D = utf_new_char("D");
313 utf_void__void = utf_new_char("()V");
314 utf_boolean__void = utf_new_char("(Z)V");
315 utf_byte__void = utf_new_char("(B)V");
316 utf_char__void = utf_new_char("(C)V");
317 utf_short__void = utf_new_char("(S)V");
318 utf_int__void = utf_new_char("(I)V");
319 utf_long__void = utf_new_char("(J)V");
320 utf_float__void = utf_new_char("(F)V");
321 utf_double__void = utf_new_char("(D)V");
322 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
323 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
325 utf_void__java_lang_ClassLoader =
326 utf_new_char("()Ljava/lang/ClassLoader;");
328 utf_java_lang_Object__java_lang_Object =
329 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
331 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
333 utf_java_lang_String__java_lang_Class =
334 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
336 utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V");
337 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
339 utf_null = utf_new_char("null");
340 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
341 array_packagename = utf_new_char("\t<the array package>");
343 /* everything's ok */
349 /* utf_hashkey *****************************************************************
351 The hashkey is computed from the utf-text by using up to 8
352 characters. For utf-symbols longer than 15 characters 3 characters
353 are taken from the beginning and the end, 2 characters are taken from the middle.
356 *******************************************************************************/
358 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
359 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
361 u4 utf_hashkey(const char *text, u4 length)
363 const char *start_pos = text; /* pointer to utf text */
367 case 0: /* empty string */
370 case 1: return fbs(0);
371 case 2: return fbs(0) ^ nbs(3);
372 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
373 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
374 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
375 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
376 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
377 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
384 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
393 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
402 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
414 return a ^ nbs(9) ^ nbs(10);
426 return a ^ nbs(9) ^ nbs(10);
437 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
448 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
450 default: /* 3 characters from beginning */
456 /* 2 characters from middle */
457 text = start_pos + (length / 2);
462 /* 3 characters from end */
463 text = start_pos + length - 4;
468 return a ^ nbs(10) ^ nbs(11);
472 /* utf_full_hashkey ************************************************************
474 This function computes a hash value using all bytes in the string.
476 The algorithm is the "One-at-a-time" algorithm as published
477 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
479 *******************************************************************************/
481 u4 utf_full_hashkey(const char *text, u4 length)
483 register const unsigned char *p = (const unsigned char *) text;
491 hash += (hash << 10);
495 hash ^= (hash >> 11);
496 hash += (hash << 15);
501 /* unicode_hashkey *************************************************************
503 Compute the hashkey of a unicode string.
505 *******************************************************************************/
507 u4 unicode_hashkey(u2 *text, u2 len)
509 return utf_hashkey((char *) text, len);
513 /* utf_new *********************************************************************
515 Creates a new utf-symbol, the text of the symbol is passed as a
516 u1-array. The function searches the utf-hashtable for a utf-symbol
517 with this text. If one is found it is returned, otherwise a new
518 hashtable element is created.
520 If the number of entries in the hashtable exceeds twice the size of
521 the hashtable slots a reorganization of the hashtable is done and
522 the utf symbols are copied to a new hashtable with doubled size.
524 *******************************************************************************/
526 utf *utf_new(const char *text, u2 length)
528 u4 key; /* hashkey computed from utf-text */
529 u4 slot; /* slot in hashtable */
530 utf *u; /* hashtable element */
533 LOCK_MONITOR_ENTER(hashtable_utf->header);
535 #if defined(ENABLE_STATISTICS)
540 key = utf_hashkey(text, length);
541 slot = key & (hashtable_utf->size - 1);
542 u = hashtable_utf->ptr[slot];
544 /* search external hash chain for utf-symbol */
547 if (u->blength == length) {
548 /* compare text of hashtable elements */
550 for (i = 0; i < length; i++)
551 if (text[i] != u->text[i])
554 #if defined(ENABLE_STATISTICS)
556 count_utf_new_found++;
559 /* symbol found in hashtable */
561 LOCK_MONITOR_EXIT(hashtable_utf->header);
567 u = u->hashlink; /* next element in external chain */
570 #if defined(ENABLE_STATISTICS)
572 count_utf_len += sizeof(utf) + length + 1;
575 /* location in hashtable found, create new utf element */
577 u->blength = length; /* length in bytes of utfstring */
578 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
579 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
581 memcpy(u->text, text, length); /* copy utf-text */
582 u->text[length] = '\0';
584 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
585 hashtable_utf->entries++; /* update number of entries */
587 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
589 /* reorganization of hashtable, average length of the external
590 chains is approx. 2 */
592 hashtable *newhash; /* the new hashtable */
598 /* create new hashtable, double the size */
600 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
602 #if defined(ENABLE_STATISTICS)
604 count_utf_len += sizeof(utf*) * hashtable_utf->size;
607 /* transfer elements to new hashtable */
609 for (i = 0; i < hashtable_utf->size; i++) {
610 u = hashtable_utf->ptr[i];
614 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
616 u->hashlink = (utf *) newhash->ptr[slot];
617 newhash->ptr[slot] = u;
619 /* follow link in external hash chain */
625 /* dispose old table */
627 hashtable_free(hashtable_utf);
629 hashtable_utf = newhash;
632 LOCK_MONITOR_EXIT(hashtable_utf->header);
638 /* utf_new_u2 ******************************************************************
640 Make utf symbol from u2 array; if isclassname is true, '.' is replaced by '/'.
643 *******************************************************************************/
645 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
647 char *buffer; /* memory buffer for unicode characters */
648 char *pos; /* pointer to current position in buffer */
649 u4 left; /* unicode characters left */
650 u4 buflength; /* utf length in bytes of the u2 array */
651 utf *result; /* resulting utf-string */
654 /* determine utf length in bytes and allocate memory */
656 buflength = u2_utflength(unicode_pos, unicode_length);
657 buffer = MNEW(char, buflength);
662 for (i = 0; i++ < unicode_length; unicode_pos++) {
663 /* next unicode character */
666 if ((c != 0) && (c < 0x80)) {
669 if ((int) left < 0) break;
670 /* convert classname */
671 if (isclassname && c == '.')
676 } else if (c < 0x800) {
678 unsigned char high = c >> 6;
679 unsigned char low = c & 0x3F;
681 if ((int) left < 0) break;
682 *pos++ = high | 0xC0;
688 char mid = (c >> 6) & 0x3F;
691 if ((int) left < 0) break;
692 *pos++ = high | 0xE0;
698 /* insert utf-string into symbol-table */
699 result = utf_new(buffer,buflength);
701 MFREE(buffer, char, buflength);
707 /* utf_new_char ****************************************************************
709 Creates a new utf symbol, the text for this symbol is passed as a
710 c-string ( = char* ).
712 *******************************************************************************/
714 utf *utf_new_char(const char *text)
716 return utf_new(text, strlen(text));
720 /* utf_new_char_classname ******************************************************
722 Creates a new utf symbol, the text for this symbol is passed as a
723 c-string ( = char* ); "." characters are replaced by "/".
724 Since utf_new_char is used often, this is a separate function
725 instead of an extra check inside utf_new_char.
727 *******************************************************************************/
729 utf *utf_new_char_classname(const char *text)
731 if (strchr(text, '.')) {
732 char *txt = strdup(text);
733 char *end = txt + strlen(txt);
737 for (c = txt; c < end; c++)
738 if (*c == '.') *c = '/';
740 tmpRes = utf_new(txt, strlen(txt));
746 return utf_new(text, strlen(text));
750 /* utf_nextu2 ******************************************************************
752 Read the next unicode character from the utf string and increment
753 the utf-string pointer accordingly.
755 CAUTION: This function is unsafe for input that was not checked
758 *******************************************************************************/
760 u2 utf_nextu2(char **utf_ptr)
762 /* uncompressed unicode character */
764 /* current position in utf text */
765 unsigned char *utf = (unsigned char *) (*utf_ptr);
766 /* bytes representing the unicode character */
767 unsigned char ch1, ch2, ch3;
768 /* number of bytes used to represent the unicode character */
771 switch ((ch1 = utf[0]) >> 4) {
772 default: /* 1 byte */
776 case 0xD: /* 2 bytes */
777 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
778 unsigned char high = ch1 & 0x1F;
779 unsigned char low = ch2 & 0x3F;
780 unicode_char = (high << 6) + low;
785 case 0xE: /* 2 or 3 bytes */
786 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
787 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
788 unsigned char low = ch3 & 0x3f;
789 unsigned char mid = ch2 & 0x3f;
790 unsigned char high = ch1 & 0x0f;
791 unicode_char = (((high << 6) + mid) << 6) + low;
799 /* update position in utf-text */
800 *utf_ptr = (char *) (utf + len);
806 /* utf_bytes *******************************************************************
808 Determine number of bytes (aka. octets) in the utf string.
811 u............utf string
814 The number of octets of this utf string.
815 There is _no_ terminating zero included in this count.
817 *******************************************************************************/
824 /* utf_get_number_of_u2s_for_buffer ********************************************
826 Determine number of UTF-16 u2s in the given UTF-8 buffer
828 CAUTION: This function is unsafe for input that was not checked
831 CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
832 to an array of u2s (UTF-16) and want to know how many of them you will get.
833 All other uses of this function are probably wrong.
836 buffer........points to first char in buffer
837 blength.......number of _bytes_ in the buffer
840 the number of u2s needed to hold this string in UTF-16 encoding.
841 There is _no_ terminating zero included in this count.
843 NOTE: Unlike utf_get_number_of_u2s, this function never throws an exception.
846 *******************************************************************************/
848 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
850 const char *endpos; /* points behind utf string */
851 const char *utf_ptr; /* current position in utf text */
852 u4 len = 0; /* number of unicode characters */
855 endpos = utf_ptr + blength;
857 while (utf_ptr < endpos) {
859 /* next unicode character */
860 utf_nextu2((char **)&utf_ptr);
863 assert(utf_ptr == endpos);
869 /* utf_get_number_of_u2s *******************************************************
871 Determine number of UTF-16 u2s in the utf string.
873 CAUTION: This function is unsafe for input that was not checked
876 CAUTION: Use this function *only* when you want to convert a utf string
877 to an array of u2s and want to know how many of them you will get.
878 All other uses of this function are probably wrong.
881 u............utf string
884 the number of u2s needed to hold this string in UTF-16 encoding.
885 There is _no_ terminating zero included in this count.
886 XXX 0 if a NullPointerException has been thrown (see below)
888 *******************************************************************************/
890 u4 utf_get_number_of_u2s(utf *u)
892 char *endpos; /* points behind utf string */
893 char *utf_ptr; /* current position in utf text */
894 u4 len = 0; /* number of unicode characters */
896 /* XXX this is probably not checked by most callers! Review this after */
897 /* the invalid uses of this function have been eliminated */
899 exceptions_throw_nullpointerexception();
906 while (utf_ptr < endpos) {
908 /* next unicode character */
909 utf_nextu2(&utf_ptr);
912 if (utf_ptr != endpos)
913 /* string ended abruptly */
914 throw_cacao_exception_exit(string_java_lang_InternalError,
915 "Illegal utf8 string");
921 /* utf8_safe_number_of_u2s *****************************************************
923 Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
924 (For invalid UTF-8 the U+fffd replacement character will be counted.)
926 This function is safe even for invalid UTF-8 strings.
929 text..........zero-terminated(!) UTF-8 string (may be invalid)
931 nbytes........strlen(text). (This is needed to completely emulate
935 the number of u2s needed to hold this string in UTF-16 encoding.
936 There is _no_ terminating zero included in this count.
938 *******************************************************************************/
940 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
941 register const unsigned char *t;
944 register const unsigned char *tlimit;
955 t = (const unsigned char *) text;
958 /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
964 /* highest bit set, non-ASCII character */
966 if ((byte & 0xe0) == 0xc0) {
967 /* 2-byte: should be 110..... 10...... ? */
969 if ((*t++ & 0xc0) == 0x80)
974 else if ((byte & 0xf0) == 0xe0) {
975 /* 3-byte: should be 1110.... 10...... 10...... */
979 return len + 1; /* invalid, stop here */
981 if ((*t++ & 0xc0) == 0x80) {
982 if ((*t++ & 0xc0) == 0x80)
990 else if ((byte & 0xf8) == 0xf0) {
991 /* 4-byte: should be 11110... 10...... 10...... 10...... */
995 return len + 1; /* invalid, stop here */
997 if (((byte1 = *t++) & 0xc0) == 0x80) {
998 if (((byte2 = *t++) & 0xc0) == 0x80) {
999 if (((byte3 = *t++) & 0xc0) == 0x80) {
1000 /* valid 4-byte UTF-8? */
1001 value = ((byte & 0x07) << 18)
1002 | ((byte1 & 0x3f) << 12)
1003 | ((byte2 & 0x3f) << 6)
1004 | ((byte3 & 0x3f) );
1006 if (value > 0x10FFFF)
1008 else if (value > 0xFFFF)
1009 len += 1; /* we need surrogates */
1011 ; /* 16bit suffice */
1022 else if ((byte & 0xfc) == 0xf8) {
1023 /* invalid 5-byte */
1025 return len + 1; /* invalid, stop here */
1028 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1031 else if ((byte & 0xfe) == 0xfc) {
1032 /* invalid 6-byte */
1034 return len + 1; /* invalid, stop here */
1037 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1049 /* ASCII character, common case */
1059 /* utf8_safe_convert_to_u2s ****************************************************
1061 Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1062 (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1063 Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1065 This function is safe even for invalid UTF-8 strings.
1068 text..........zero-terminated(!) UTF-8 string (may be invalid)
1070 nbytes........strlen(text). (This is needed to completely emulate
1072 buffer........a preallocated array of u2s to receive the decoded
1073 string. Use utf8_safe_number_of_u2s to get the
1074 required number of u2s for allocating this.
1076 *******************************************************************************/
1078 #define UNICODE_REPLACEMENT 0xfffd
1080 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1081 register const unsigned char *t;
1083 register const unsigned char *tlimit;
1091 assert(nbytes >= 0);
1093 t = (const unsigned char *) text;
1094 tlimit = t + nbytes;
1096 /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1102 /* highest bit set, non-ASCII character */
1104 if ((byte & 0xe0) == 0xc0) {
1105 /* 2-byte: should be 110..... 10...... */
1107 if (((byte1 = *t++) & 0xc0) == 0x80) {
1108 /* valid 2-byte UTF-8 */
1109 *buffer++ = ((byte & 0x1f) << 6)
1110 | ((byte1 & 0x3f) );
1113 *buffer++ = UNICODE_REPLACEMENT;
1117 else if ((byte & 0xf0) == 0xe0) {
1118 /* 3-byte: should be 1110.... 10...... 10...... */
1120 if (t + 2 > tlimit) {
1121 *buffer++ = UNICODE_REPLACEMENT;
1125 if (((byte1 = *t++) & 0xc0) == 0x80) {
1126 if (((byte2 = *t++) & 0xc0) == 0x80) {
1127 /* valid 3-byte UTF-8 */
1128 *buffer++ = ((byte & 0x0f) << 12)
1129 | ((byte1 & 0x3f) << 6)
1130 | ((byte2 & 0x3f) );
1133 *buffer++ = UNICODE_REPLACEMENT;
1138 *buffer++ = UNICODE_REPLACEMENT;
1142 else if ((byte & 0xf8) == 0xf0) {
1143 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1145 if (t + 3 > tlimit) {
1146 *buffer++ = UNICODE_REPLACEMENT;
1150 if (((byte1 = *t++) & 0xc0) == 0x80) {
1151 if (((byte2 = *t++) & 0xc0) == 0x80) {
1152 if (((byte3 = *t++) & 0xc0) == 0x80) {
1153 /* valid 4-byte UTF-8? */
1154 value = ((byte & 0x07) << 18)
1155 | ((byte1 & 0x3f) << 12)
1156 | ((byte2 & 0x3f) << 6)
1157 | ((byte3 & 0x3f) );
1159 if (value > 0x10FFFF) {
1160 *buffer++ = UNICODE_REPLACEMENT;
1162 else if (value > 0xFFFF) {
1163 /* we need surrogates */
1164 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1165 *buffer++ = 0xdc00 | (value & 0x03ff);
1168 *buffer++ = value; /* 16bit suffice */
1171 *buffer++ = UNICODE_REPLACEMENT;
1176 *buffer++ = UNICODE_REPLACEMENT;
1181 *buffer++ = UNICODE_REPLACEMENT;
1185 else if ((byte & 0xfc) == 0xf8) {
1186 if (t + 4 > tlimit) {
1187 *buffer++ = UNICODE_REPLACEMENT;
1192 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1194 *buffer++ = UNICODE_REPLACEMENT;
1196 else if ((byte & 0xfe) == 0xfc) {
1197 if (t + 5 > tlimit) {
1198 *buffer++ = UNICODE_REPLACEMENT;
1203 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1205 *buffer++ = UNICODE_REPLACEMENT;
1208 *buffer++ = UNICODE_REPLACEMENT;
1216 /* ASCII character, common case */
1224 /* u2_utflength ****************************************************************
1226 Returns the utf length in bytes of a u2 array.
1228 *******************************************************************************/
1230 u4 u2_utflength(u2 *text, u4 u2_length)
1232 u4 result_len = 0; /* utf length in bytes */
1233 u2 ch; /* current unicode character */
1236 for (len = 0; len < u2_length; len++) {
1237 /* next unicode character */
1240 /* determine bytes required to store unicode character as utf */
1241 if (ch && (ch < 0x80))
1243 else if (ch < 0x800)
1253 /* utf_copy ********************************************************************
1255 Copy the given utf string byte-for-byte to a buffer.
1258 buffer.......the buffer
1259 u............the utf string
1261 *******************************************************************************/
1263 void utf_copy(char *buffer, utf *u)
1265 /* our utf strings are zero-terminated (done by utf_new) */
1266 MCOPY(buffer, u->text, char, u->blength + 1);
1270 /* utf_cat *********************************************************************
1272 Append the given utf string byte-for-byte to a buffer.
1275 buffer.......the buffer
1276 u............the utf string
1278 *******************************************************************************/
1280 void utf_cat(char *buffer, utf *u)
1282 /* our utf strings are zero-terminated (done by utf_new) */
1283 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1287 /* utf_copy_classname **********************************************************
1289 Copy the given utf classname byte-for-byte to a buffer.
1290 '/' is replaced by '.'
1293 buffer.......the buffer
1294 u............the utf string
1296 *******************************************************************************/
1298 void utf_copy_classname(char *buffer, utf *u)
1307 endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1309 while (srcptr != endptr) {
1318 /* utf_cat_classname ***********************************************************
1320 Append the given utf classname byte-for-byte to a buffer.
1321 '/' is replaced by '.'
1324 buffer.......the buffer
1325 u............the utf string
1327 *******************************************************************************/
1329 void utf_cat_classname(char *buffer, utf *u)
1331 utf_copy_classname(buffer + strlen(buffer), u);
1334 /* utf_display_printable_ascii *************************************************
1336 Write utf symbol to stdout (for debugging purposes).
1337 Non-printable and non-ASCII characters are printed as '?'.
1339 *******************************************************************************/
1341 void utf_display_printable_ascii(utf *u)
1343 char *endpos; /* points behind utf string */
1344 char *utf_ptr; /* current position in utf text */
1352 endpos = UTF_END(u);
1355 while (utf_ptr < endpos) {
1356 /* read next unicode character */
1358 u2 c = utf_nextu2(&utf_ptr);
1360 if ((c >= 32) && (c <= 127))
1370 /* utf_display_printable_ascii_classname ***************************************
1372 Write utf symbol to stdout with `/' converted to `.' (for debugging purposes).
1374 Non-printable and non-ASCII characters are printed as '?'.
1376 *******************************************************************************/
1378 void utf_display_printable_ascii_classname(utf *u)
1380 char *endpos; /* points behind utf string */
1381 char *utf_ptr; /* current position in utf text */
1389 endpos = UTF_END(u);
1392 while (utf_ptr < endpos) {
1393 /* read next unicode character */
1395 u2 c = utf_nextu2(&utf_ptr);
1400 if ((c >= 32) && (c <= 127))
1410 /* utf_sprint_convert_to_latin1 ************************************************
1412 Write utf symbol into c-string (for debugging purposes).
1413 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1416 *******************************************************************************/
/* Writes the characters of u into buffer, one char per decoded character.
   buffer...destination; no size is passed in, so the caller has to provide
            enough room
   u........the utf symbol to convert */
1418 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1420 char *endpos; /* one past the last byte of the utf text */
1421 char *utf_ptr; /* read cursor, handed by address to utf_nextu2 */
/* NOTE(review): if u2 is a 16-bit type, as its name suggests, this index
   wraps for texts longer than 65535 characters -- confirm. */
1422 u2 pos = 0; /* next write index in buffer */
/* presumably the result for a NULL u; the guarding condition should be
   confirmed */
1425 strcpy(buffer, "NULL");
1429 endpos = UTF_END(u);
/* NOTE(review): verify that utf_ptr is set to the start of the text before
   this loop compares it with endpos. */
1432 while (utf_ptr < endpos)
1433 /* the decoded character is narrowed to a single char on assignment */
1434 buffer[pos++] = utf_nextu2(&utf_ptr);
1436 /* terminate string */
1441 /* utf_sprint_convert_to_latin1_classname **************************************
1443 Write utf symbol into c-string with `/' converted to `.' (for debugging purposes).
1445 Characters are converted to 8-bit Latin-1; non-Latin-1 characters yield invalid results.
1448 *******************************************************************************/
/* Classname variant of the Latin-1 conversion: every decoded '/' is turned
   into '.' before it is used.
   buffer...destination; no size is passed in, so the caller has to provide
            enough room
   u........the utf symbol to convert */
1450 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1452 char *endpos; /* one past the last byte of the utf text */
1453 char *utf_ptr; /* read cursor, handed by address to utf_nextu2 */
/* NOTE(review): if u2 is a 16-bit type, as its name suggests, this index
   wraps for texts longer than 65535 characters -- confirm. */
1454 u2 pos = 0; /* next write index in buffer */
/* presumably the result for a NULL u; the guarding condition should be
   confirmed */
1457 strcpy(buffer, "NULL");
1461 endpos = UTF_END(u);
/* NOTE(review): verify that utf_ptr is set to the start of the text before
   this loop compares it with endpos. */
1464 while (utf_ptr < endpos) {
1465 /* decode the next character; utf_nextu2 is expected to advance utf_ptr */
1466 u2 c = utf_nextu2(&utf_ptr);
/* package separator of the internal name becomes the dotted form */
1467 if (c == '/') c = '.';
1471 /* terminate string */
1476 /* utf_strcat_convert_to_latin1 ************************************************
1478 Like libc strcat, but uses an utf8 string.
1479 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1482 *******************************************************************************/
1484 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1486 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1490 /* utf_strcat_convert_to_latin1_classname **************************************
1492 Like libc strcat, but uses an utf8 string.
1493 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1496 *******************************************************************************/
1498 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1500 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1504 /* utf_fprint_printable_ascii **************************************************
1506 Write utf symbol into file.
1507 Non-printable and non-ASCII characters are printed as '?'.
1509 *******************************************************************************/
/* Writes the characters of u to a stdio stream, one fprintf call per
   decoded character.
   file...destination stream
   u......the utf symbol to write */
1511 void utf_fprint_printable_ascii(FILE *file, utf *u)
1513 char *endpos; /* one past the last byte of the utf text */
1514 char *utf_ptr; /* read cursor, handed by address to utf_nextu2 */
1519 endpos = UTF_END(u);
/* NOTE(review): verify that utf_ptr is set to the start of the text before
   this loop compares it with endpos. */
1522 while (utf_ptr < endpos) {
1523 /* decode the next character; utf_nextu2 is expected to advance utf_ptr */
1524 u2 c = utf_nextu2(&utf_ptr);
/* characters in 32..127 are written unchanged, everything else as '?'.
   NOTE(review): 127 (DEL) is inside the accepted range although it is not
   printable -- confirm whether `< 127` was intended. */
1526 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1527 else fprintf(file, "?");
1532 /* utf_fprint_printable_ascii_classname ****************************************
1534 Write utf symbol into file with `/' converted to `.'.
1535 Non-printable and non-ASCII characters are printed as '?'.
1537 *******************************************************************************/
/* Classname variant of the stream writer: every decoded '/' is turned into
   '.' before the character is written with fprintf.
   file...destination stream
   u......the utf symbol to write */
1539 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1541 char *endpos; /* one past the last byte of the utf text */
1542 char *utf_ptr; /* read cursor, handed by address to utf_nextu2 */
1547 endpos = UTF_END(u);
/* NOTE(review): verify that utf_ptr is set to the start of the text before
   this loop compares it with endpos. */
1550 while (utf_ptr < endpos) {
1551 /* decode the next character; utf_nextu2 is expected to advance utf_ptr */
1552 u2 c = utf_nextu2(&utf_ptr);
/* package separator of the internal name becomes the dotted form */
1553 if (c == '/') c = '.';
/* characters in 32..127 are written unchanged, everything else as '?'.
   NOTE(review): 127 (DEL) is inside the accepted range although it is not
   printable -- confirm whether `< 127` was intended. */
1555 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1556 else fprintf(file, "?");
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string in the form
   accepted for Java class files.

   Accepted are ASCII bytes other than 0x00 and two- or three-byte
   sequences with well-formed continuation bytes.  The code point zero is
   only accepted as the two-byte sequence 0xc0 0x80.  Overlong encodings,
   surrogates and the code points 0xfffe/0xffff are deliberately not
   rejected.  An empty range is valid.

   IN:
       utf_ptr...points to first character
       end_pos...points after last character

   RETURN VALUE:
       true.....the whole range is well-formed
       false....otherwise, also if end_pos lies before utf_ptr

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	unsigned char byte;                 /* byte currently being examined      */
	unsigned long v;                    /* code point decoded so far          */
	int           len;                  /* number of continuation bytes       */
	int           i;

	if (end_pos < utf_ptr)
		return false;

	/* Walk the range by pointer; no byte counter is kept, so the length
	   of the range is never narrowed to an int. */

	while (utf_ptr < end_pos) {
		byte = (unsigned char) *utf_ptr++;

		if (byte == 0)
			return false;                         /* 0x00 is not allowed */

		if ((byte & 0x80) == 0)
			continue;                             /* ASCII */

		if ((byte & 0xe0) == 0xc0)
			len = 1;                              /* 110x xxxx */
		else if ((byte & 0xf0) == 0xe0)
			len = 2;                              /* 1110 xxxx */
		else
			return false;   /* longer forms (Java limitation) or an
			                   invalid leading byte */

		if ((end_pos - utf_ptr) < len)
			return false;                         /* missing bytes */

		v = (unsigned long) (byte & (0x3f >> len));

		for (i = 0; i < len; i++) {
			byte = (unsigned char) *utf_ptr++;

			if ((byte & 0xc0) != 0x80)            /* 10xx xxxx */
				return false;

			v = (v << 6) | (unsigned long) (byte & 0x3f);
		}

		/* Java special: zero may only be written as 0xc0 0x80.  Other
		   overlong encodings are accepted, as Sun Java seems to allow
		   them. */

		if ((v == 0) && (len != 1))
			return false;
	}

	return true;
}
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control characters
   and the encoded zero character 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   IN:
       utf_ptr...points to first character
       end_pos...points after last character

   RETURN VALUE:
       true.....the name is acceptable
       false....the range is empty or contains a disallowed character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                       /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;                   /* disallow control characters */

		/* 0xc0 0x80 encodes zero.  The second byte is only inspected while
		   it still lies inside the range, so the result never depends on
		   memory behind end_pos. */

		if ((c == 0xc0) && (utf_ptr < end_pos) &&
			((unsigned char) *utf_ptr == 0x80))
			return false;                   /* disallow zero */
	}

	return true;
}
1656 bool is_valid_name_utf(utf *u)
1658 return is_valid_name(u->text, UTF_END(u));
1662 /* utf_show ********************************************************************
1664 Writes the utf symbols in the utfhash to stdout and displays the
1665 number of external hash chains grouped according to the chainlength
1666 (for debugging purposes).
1668 *******************************************************************************/
/* Compiled only when NDEBUG is not defined.  First prints the occupied slots
   of hashtable_utf, then a table of how many hash chains have which length,
   and finally the maximum and average chain length. */
1670 #if !defined(NDEBUG)
1674 #define CHAIN_LIMIT 20 /* chain lengths below this limit are counted separately */
1676 u4 chain_count[CHAIN_LIMIT]; /* chain_count[n]: number of chains of length n */
1677 u4 max_chainlength = 0; /* maximum length of the chains */
1678 u4 sum_chainlength = 0; /* sum of the chainlengths */
1679 u4 beyond_limit = 0; /* number of utf-symbols in chains that pass the CHAIN_LIMIT test below */
1682 printf("UTF-HASH:\n");
1684 /* show the elements of the utf-hashtable, slot by slot */
1686 for (i = 0; i < hashtable_utf->size; i++) {
1687 utf *u = hashtable_utf->ptr[i];
1690 printf("SLOT %d: ", (int) i);
1694 utf_display_printable_ascii(u);
1702 printf("UTF-HASH: %d slots for %d entries\n",
1703 (int) hashtable_utf->size, (int) hashtable_utf->entries );
/* the percentages printed below divide by hashtable_utf->entries */
1705 if (hashtable_utf->entries == 0)
1708 printf("chains:\n chainlength number of chains %% of utfstrings\n");
1710 for (i=0;i<CHAIN_LIMIT;i++)
1713 /* count numbers of hashchains according to their length */
1714 for (i=0; i<hashtable_utf->size; i++) {
1716 utf *u = (utf*) hashtable_utf->ptr[i];
1717 u4 chain_length = 0;
1719 /* determine chainlength */
1725 /* update sum of all chainlengths */
1726 sum_chainlength+=chain_length;
1728 /* determine the maximum length of the chains */
1729 if (chain_length>max_chainlength)
1730 max_chainlength = chain_length;
1732 /* chains at or above the limit share the last bucket; their symbols are summed in beyond_limit */
/* NOTE(review): the test is `>= CHAIN_LIMIT`, but the last bucket is printed
   as ">= CHAIN_LIMIT-1".  A chain of exactly CHAIN_LIMIT-1 symbols is counted
   in chain_count[CHAIN_LIMIT-1] without being added to beyond_limit, so the
   percentage of the last row can come out too small -- confirm whether
   `>= CHAIN_LIMIT-1` was intended. */
1733 if (chain_length>=CHAIN_LIMIT) {
1734 beyond_limit+=chain_length;
1735 chain_length=CHAIN_LIMIT-1;
1738 /* update number of hashchains of current length */
1739 chain_count[chain_length]++;
1742 /* display results: one row per chain length below the shared last bucket */
/* NOTE(review): chain_count[] and max_chainlength are u4 but are printed with
   %d -- if u4 is an unsigned type, %u would match the argument type. */
1743 for (i=1;i<CHAIN_LIMIT-1;i++)
1744 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1746 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1749 printf("max. chainlength:%5d\n",max_chainlength);
1751 /* avg. chainlength = sum of chainlengths / number of chains; the divisor is
   the slot count minus the slots counted with chain length 0 */
1752 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1754 #endif /* !defined(NDEBUG) */
1758 * These are local overrides for various environment variables in Emacs.
1759 * Please do not remove this and leave it at the end of the file, where
1760 * Emacs will automagically detect them.
1761 * ---------------------------------------------------------------------
1764 * indent-tabs-mode: t
1768 * vim:noexpandtab:sw=4:ts=4: