1 /* src/vmcore/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
4 C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5 E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6 J. Wenninger, Institut f. Computersprachen - TU Wien
8 This file is part of CACAO.
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License as
12 published by the Free Software Foundation; either version 2, or (at
13 your option) any later version.
15 This program is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25 $Id: utf8.c 7716 2007-04-16 14:29:53Z twisti $
37 #include "mm/memory.h"
39 #if defined(ENABLE_THREADS)
40 # include "threads/native/lock.h"
42 # include "threads/none/lock.h"
45 #include "toolbox/hashtable.h"
47 #include "vm/exceptions.h"
49 #include "vmcore/options.h"
51 #if defined(ENABLE_STATISTICS)
52 # include "vmcore/statistics.h"
55 #include "vmcore/utf8.h"
58 /* global variables ***********************************************************/
60 /* hashsize must be power of 2 */
62 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
64 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
67 /* utf-symbols for pointer comparison of frequently used strings **************/
69 utf *utf_java_lang_Object;
71 utf *utf_java_lang_Class;
72 utf *utf_java_lang_ClassLoader;
73 utf *utf_java_lang_Cloneable;
74 utf *utf_java_lang_SecurityManager;
75 utf *utf_java_lang_String;
76 utf *utf_java_lang_System;
77 utf *utf_java_lang_ThreadGroup;
78 utf *utf_java_lang_ref_SoftReference;
79 utf *utf_java_lang_ref_WeakReference;
80 utf *utf_java_lang_ref_PhantomReference;
81 utf *utf_java_io_Serializable;
83 utf *utf_java_lang_Throwable;
84 utf *utf_java_lang_Error;
86 utf *utf_java_lang_AbstractMethodError;
87 utf *utf_java_lang_ClassCircularityError;
88 utf *utf_java_lang_ClassFormatError;
89 utf *utf_java_lang_ExceptionInInitializerError;
90 utf *utf_java_lang_IncompatibleClassChangeError;
91 utf *utf_java_lang_InstantiationError;
92 utf *utf_java_lang_InternalError;
93 utf *utf_java_lang_LinkageError;
94 utf *utf_java_lang_NoClassDefFoundError;
95 utf *utf_java_lang_NoSuchFieldError;
96 utf *utf_java_lang_NoSuchMethodError;
97 utf *utf_java_lang_OutOfMemoryError;
98 utf *utf_java_lang_UnsatisfiedLinkError;
99 utf *utf_java_lang_UnsupportedClassVersionError;
100 utf *utf_java_lang_VerifyError;
101 utf *utf_java_lang_VirtualMachineError;
103 #if defined(WITH_CLASSPATH_GNU)
104 utf *utf_java_lang_VMThrowable;
107 utf *utf_java_lang_Exception;
109 utf *utf_java_lang_ArithmeticException;
110 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
111 utf *utf_java_lang_ArrayStoreException;
112 utf *utf_java_lang_ClassCastException;
113 utf *utf_java_lang_ClassNotFoundException;
114 utf *utf_java_lang_CloneNotSupportedException;
115 utf *utf_java_lang_IllegalAccessException;
116 utf *utf_java_lang_IllegalArgumentException;
117 utf *utf_java_lang_IllegalMonitorStateException;
118 utf *utf_java_lang_InstantiationException;
119 utf *utf_java_lang_InterruptedException;
120 utf *utf_java_lang_NegativeArraySizeException;
121 utf *utf_java_lang_NullPointerException;
122 utf *utf_java_lang_StringIndexOutOfBoundsException;
124 utf *utf_java_lang_reflect_InvocationTargetException;
126 #if defined(ENABLE_JAVASE)
127 utf* utf_java_lang_Void;
130 utf* utf_java_lang_Boolean;
131 utf* utf_java_lang_Byte;
132 utf* utf_java_lang_Character;
133 utf* utf_java_lang_Short;
134 utf* utf_java_lang_Integer;
135 utf* utf_java_lang_Long;
136 utf* utf_java_lang_Float;
137 utf* utf_java_lang_Double;
139 #if defined(ENABLE_JAVASE)
140 utf *utf_java_lang_StackTraceElement;
141 utf *utf_java_lang_reflect_Constructor;
142 utf *utf_java_lang_reflect_Field;
143 utf *utf_java_lang_reflect_Method;
144 utf *utf_java_util_Vector;
147 utf *utf_InnerClasses; /* InnerClasses */
148 utf *utf_ConstantValue; /* ConstantValue */
149 utf *utf_Code; /* Code */
150 utf *utf_Exceptions; /* Exceptions */
151 utf *utf_LineNumberTable; /* LineNumberTable */
152 utf *utf_SourceFile; /* SourceFile */
154 #if defined(ENABLE_JAVASE)
155 utf *utf_EnclosingMethod;
157 utf *utf_RuntimeVisibleAnnotations;
158 utf *utf_StackMapTable;
161 utf *utf_init; /* <init> */
162 utf *utf_clinit; /* <clinit> */
163 utf *utf_clone; /* clone */
164 utf *utf_finalize; /* finalize */
165 utf *utf_run; /* run */
170 utf *utf_removeThread;
175 utf *utf_fillInStackTrace;
176 utf *utf_getSystemClassLoader;
178 utf *utf_printStackTrace;
189 utf *utf_void__void; /* ()V */
190 utf *utf_boolean__void; /* (Z)V */
191 utf *utf_byte__void; /* (B)V */
192 utf *utf_char__void; /* (C)V */
193 utf *utf_short__void; /* (S)V */
194 utf *utf_int__void; /* (I)V */
195 utf *utf_long__void; /* (J)V */
196 utf *utf_float__void; /* (F)V */
197 utf *utf_double__void; /* (D)V */
199 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
200 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
201 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
202 utf *utf_java_lang_Object__java_lang_Object;
203 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
204 utf *utf_java_lang_String__java_lang_Class;
205 utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */
206 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
208 utf *utf_not_named_yet; /* special name for unnamed classes */
210 utf *array_packagename;
213 /* utf_init ********************************************************************
215 Initializes the utf8 subsystem.
217 *******************************************************************************/
/* Body of the utf8 initialisation: allocates the symbol hashtable and then
   interns every well-known string through utf_new_char(), storing each
   returned element in the global of the matching name so that later code
   can compare symbols by pointer. */
221 /* create utf8 hashtable */
223 hashtable_utf = NEW(hashtable);
225 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
/* statistics: account one utf pointer per hashtable slot */
227 #if defined(ENABLE_STATISTICS)
229 count_utf_len += sizeof(utf*) * hashtable_utf->size;
232 /* create utf-symbols for pointer comparison of frequently used strings */
/* class names are given in their internal form, '/' separates packages */
234 utf_java_lang_Object = utf_new_char("java/lang/Object");
236 utf_java_lang_Class = utf_new_char("java/lang/Class");
237 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
238 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
239 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
240 utf_java_lang_String = utf_new_char("java/lang/String");
241 utf_java_lang_System = utf_new_char("java/lang/System");
242 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
244 utf_java_lang_ref_SoftReference =
245 utf_new_char("java/lang/ref/SoftReference");
247 utf_java_lang_ref_WeakReference =
248 utf_new_char("java/lang/ref/WeakReference");
250 utf_java_lang_ref_PhantomReference =
251 utf_new_char("java/lang/ref/PhantomReference");
253 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
/* java.lang.Throwable and the java.lang.*Error class names */
255 utf_java_lang_Throwable = utf_new_char("java/lang/Throwable");
256 utf_java_lang_Error = utf_new_char("java/lang/Error");
258 utf_java_lang_ClassCircularityError =
259 utf_new_char("java/lang/ClassCircularityError");
261 utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
263 utf_java_lang_ExceptionInInitializerError =
264 utf_new_char("java/lang/ExceptionInInitializerError");
266 utf_java_lang_IncompatibleClassChangeError =
267 utf_new_char("java/lang/IncompatibleClassChangeError");
269 utf_java_lang_InstantiationError =
270 utf_new_char("java/lang/InstantiationError");
272 utf_java_lang_InternalError = utf_new_char("java/lang/InternalError");
273 utf_java_lang_LinkageError = utf_new_char("java/lang/LinkageError");
275 utf_java_lang_NoClassDefFoundError =
276 utf_new_char("java/lang/NoClassDefFoundError");
278 utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
280 utf_java_lang_UnsatisfiedLinkError =
281 utf_new_char("java/lang/UnsatisfiedLinkError");
283 utf_java_lang_UnsupportedClassVersionError =
284 utf_new_char("java/lang/UnsupportedClassVersionError");
286 utf_java_lang_VerifyError = utf_new_char("java/lang/VerifyError");
288 utf_java_lang_VirtualMachineError =
289 utf_new_char("java/lang/VirtualMachineError");
/* the following Error names sit behind the ENABLE_JAVASE switch */
291 #if defined(ENABLE_JAVASE)
292 utf_java_lang_AbstractMethodError =
293 utf_new_char("java/lang/AbstractMethodError");
295 utf_java_lang_NoSuchFieldError =
296 utf_new_char("java/lang/NoSuchFieldError");
298 utf_java_lang_NoSuchMethodError =
299 utf_new_char("java/lang/NoSuchMethodError");
302 #if defined(WITH_CLASSPATH_GNU)
303 utf_java_lang_VMThrowable = utf_new_char("java/lang/VMThrowable");
/* java.lang.Exception and the exception class names */
306 utf_java_lang_Exception = utf_new_char("java/lang/Exception");
308 utf_java_lang_ArithmeticException =
309 utf_new_char("java/lang/ArithmeticException");
311 utf_java_lang_ArrayIndexOutOfBoundsException =
312 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
314 utf_java_lang_ArrayStoreException =
315 utf_new_char("java/lang/ArrayStoreException");
317 utf_java_lang_ClassCastException =
318 utf_new_char("java/lang/ClassCastException");
320 utf_java_lang_ClassNotFoundException =
321 utf_new_char("java/lang/ClassNotFoundException");
323 utf_java_lang_CloneNotSupportedException =
324 utf_new_char("java/lang/CloneNotSupportedException");
326 utf_java_lang_IllegalAccessException =
327 utf_new_char("java/lang/IllegalAccessException");
329 utf_java_lang_IllegalArgumentException =
330 utf_new_char("java/lang/IllegalArgumentException");
332 utf_java_lang_IllegalMonitorStateException =
333 utf_new_char("java/lang/IllegalMonitorStateException");
335 utf_java_lang_InstantiationException =
336 utf_new_char("java/lang/InstantiationException");
338 utf_java_lang_InterruptedException =
339 utf_new_char("java/lang/InterruptedException");
341 utf_java_lang_NegativeArraySizeException =
342 utf_new_char("java/lang/NegativeArraySizeException");
344 utf_java_lang_NullPointerException =
345 utf_new_char("java/lang/NullPointerException");
347 utf_java_lang_StringIndexOutOfBoundsException =
348 utf_new_char("java/lang/StringIndexOutOfBoundsException");
350 utf_java_lang_reflect_InvocationTargetException =
351 utf_new_char("java/lang/reflect/InvocationTargetException");
353 #if defined(ENABLE_JAVASE)
354 utf_java_lang_Void = utf_new_char("java/lang/Void");
/* java.lang classes named after the primitive types */
357 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
358 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
359 utf_java_lang_Character = utf_new_char("java/lang/Character");
360 utf_java_lang_Short = utf_new_char("java/lang/Short");
361 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
362 utf_java_lang_Long = utf_new_char("java/lang/Long");
363 utf_java_lang_Float = utf_new_char("java/lang/Float");
364 utf_java_lang_Double = utf_new_char("java/lang/Double");
366 #if defined(ENABLE_JAVASE)
367 utf_java_lang_StackTraceElement =
368 utf_new_char("java/lang/StackTraceElement");
370 utf_java_lang_reflect_Constructor =
371 utf_new_char("java/lang/reflect/Constructor");
373 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
374 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
375 utf_java_util_Vector = utf_new_char("java/util/Vector");
/* class file attribute names */
378 utf_InnerClasses = utf_new_char("InnerClasses");
379 utf_ConstantValue = utf_new_char("ConstantValue");
380 utf_Code = utf_new_char("Code");
381 utf_Exceptions = utf_new_char("Exceptions");
382 utf_LineNumberTable = utf_new_char("LineNumberTable");
383 utf_SourceFile = utf_new_char("SourceFile");
385 #if defined(ENABLE_JAVASE)
386 utf_EnclosingMethod = utf_new_char("EnclosingMethod");
387 utf_Signature = utf_new_char("Signature");
388 utf_RuntimeVisibleAnnotations = utf_new_char("RuntimeVisibleAnnotations");
389 utf_StackMapTable = utf_new_char("StackMapTable");
/* method and field names */
392 utf_init = utf_new_char("<init>");
393 utf_clinit = utf_new_char("<clinit>");
394 utf_clone = utf_new_char("clone");
395 utf_finalize = utf_new_char("finalize");
396 utf_run = utf_new_char("run");
398 utf_add = utf_new_char("add");
399 utf_remove = utf_new_char("remove");
400 utf_addThread = utf_new_char("addThread");
401 utf_removeThread = utf_new_char("removeThread");
402 utf_put = utf_new_char("put");
403 utf_get = utf_new_char("get");
404 utf_value = utf_new_char("value");
406 utf_printStackTrace = utf_new_char("printStackTrace");
407 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
408 utf_loadClass = utf_new_char("loadClass");
409 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
/* single-letter descriptors; the same letters appear as the parameter
   types in the "(Z)V" ... "(D)V" descriptors created below */
411 utf_Z = utf_new_char("Z");
412 utf_B = utf_new_char("B");
413 utf_C = utf_new_char("C");
414 utf_S = utf_new_char("S");
415 utf_I = utf_new_char("I");
416 utf_J = utf_new_char("J");
417 utf_F = utf_new_char("F");
418 utf_D = utf_new_char("D");
/* method descriptors; the variable names read <parameters>__<return type> */
420 utf_void__void = utf_new_char("()V");
421 utf_boolean__void = utf_new_char("(Z)V");
422 utf_byte__void = utf_new_char("(B)V");
423 utf_char__void = utf_new_char("(C)V");
424 utf_short__void = utf_new_char("(S)V");
425 utf_int__void = utf_new_char("(I)V");
426 utf_long__void = utf_new_char("(J)V");
427 utf_float__void = utf_new_char("(F)V");
428 utf_double__void = utf_new_char("(D)V");
429 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
430 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
432 utf_void__java_lang_ClassLoader =
433 utf_new_char("()Ljava/lang/ClassLoader;");
435 utf_java_lang_Object__java_lang_Object =
436 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
438 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
440 utf_java_lang_String__java_lang_Class =
441 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
443 utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V");
444 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
/* NOTE(review): the leading tab in the two placeholder names below
   presumably keeps them from colliding with a real class or package
   name -- confirm */
446 utf_null = utf_new_char("null");
447 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
448 array_packagename = utf_new_char("\t<the array package>");
450 /* everything's ok */
/* utf_hashkey *****************************************************************

   The hashkey is computed from the utf-text by using up to 8
   characters.  For utf-symbols longer than 15 characters 3 characters
   are taken from the beginning and the end, 2 characters are taken
   from the middle.

*******************************************************************************/
/* Helper macros for utf_hashkey.  Both read through the local `text`
   pointer of the function that uses them; nbs() pre-increments `text`, so
   every use advances the read position by one byte, fbs() leaves it alone.
   NOTE(review): `text` is a plain `const char *`, so on targets where char
   is signed a byte >= 0x80 is sign-extended by the (u4) cast -- confirm this
   is intended. */
465 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
466 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
/* Computes the short hashkey of `length` bytes starting at `text`.  The
   switch below has one case per length (0 to 8 are fully visible here) and a
   default case for long strings that samples the beginning, the middle
   (start_pos + length / 2) and the end (start_pos + length - 4). */
468 u4 utf_hashkey(const char *text, u4 length)
470 const char *start_pos = text; /* pointer to utf text */
474 case 0: /* empty string */
/* NOTE(review): from case 2 on, one expression contains several nbs() and
   therefore several unsequenced `++text`; C does not define the order in
   which they are evaluated, the result relies on the compiler going left
   to right. */
477 case 1: return fbs(0);
478 case 2: return fbs(0) ^ nbs(3);
479 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
480 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
481 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
482 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
483 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
484 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
/* the longer cases build the first part of the key in `a` and fold the
   remaining bytes in on their return line */
491 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
500 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
509 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
521 return a ^ nbs(9) ^ nbs(10);
533 return a ^ nbs(9) ^ nbs(10);
544 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
555 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
557 default: /* 3 characters from beginning */
563 /* 2 characters from middle */
564 text = start_pos + (length / 2);
569 /* 3 characters from end */
/* positioned at length - 4 because nbs() increments before it reads */
570 text = start_pos + length - 4;
575 return a ^ nbs(10) ^ nbs(11);
579 /* utf_full_hashkey ************************************************************
581 This function computes a hash value using all bytes in the string.
583 The algorithm is the "One-at-a-time" algorithm as published
584 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
586 *******************************************************************************/
588 u4 utf_full_hashkey(const char *text, u4 length)
590 register const unsigned char *p = (const unsigned char *) text;
598 hash += (hash << 10);
602 hash ^= (hash >> 11);
603 hash += (hash << 15);
608 /* unicode_hashkey *************************************************************
610 Compute the hashkey of a unicode string.
612 *******************************************************************************/
614 u4 unicode_hashkey(u2 *text, u2 len)
616 return utf_hashkey((char *) text, len);
620 /* utf_new *********************************************************************
622 Creates a new utf-symbol, the text of the symbol is passed as a
623 u1-array. The function searches the utf-hashtable for a utf-symbol
624 with this text. On success the element returned, otherwise a new
625 hashtable element is created.
627 If the number of entries in the hashtable exceeds twice the size of
628 the hashtable slots a reorganization of the hashtable is done and
629 the utf symbols are copied to a new hashtable with doubled size.
631 *******************************************************************************/
/* Interns the `length` bytes at `text` in hashtable_utf: the bucket chain is
   searched for an element with the same length and bytes, and only if none
   is found a new element is allocated and linked in.  The work is bracketed
   by LOCK_MONITOR_ENTER/LOCK_MONITOR_EXIT on hashtable_utf->header. */
633 utf *utf_new(const char *text, u2 length)
635 u4 key; /* hashkey computed from utf-text */
636 u4 slot; /* slot in hashtable */
637 utf *u; /* hashtable element */
640 LOCK_MONITOR_ENTER(hashtable_utf->header);
642 #if defined(ENABLE_STATISTICS)
/* the table size is a power of two, so masking with size - 1 picks the slot */
647 key = utf_hashkey(text, length);
648 slot = key & (hashtable_utf->size - 1);
649 u = hashtable_utf->ptr[slot];
651 /* search external hash chain for utf-symbol */
/* cheap length test first, the bytes are compared only on equal length */
654 if (u->blength == length) {
655 /* compare text of hashtable elements */
657 for (i = 0; i < length; i++)
658 if (text[i] != u->text[i])
661 #if defined(ENABLE_STATISTICS)
663 count_utf_new_found++;
666 /* symbol found in hashtable */
668 LOCK_MONITOR_EXIT(hashtable_utf->header);
674 u = u->hashlink; /* next element in external chain */
677 /* location in hashtable found, create new utf element */
/* the new element becomes the head of the chain: it links to the current
   head here and is stored into the slot further down */
681 u->blength = length; /* length in bytes of utfstring */
682 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
/* one extra byte for the terminating zero written two lines below */
683 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
685 memcpy(u->text, text, length); /* copy utf-text */
686 u->text[length] = '\0';
688 #if defined(ENABLE_STATISTICS)
690 count_utf_len += sizeof(utf) + length + 1;
693 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
694 hashtable_utf->entries++; /* update number of entries */
/* grow once there are more than two entries per slot */
696 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
698 /* reorganization of hashtable, average length of the external
699 chains is approx. 2 */
701 hashtable *newhash; /* the new hashtable */
707 /* create new hashtable, double the size */
709 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
711 #if defined(ENABLE_STATISTICS)
713 count_utf_len += sizeof(utf*) * hashtable_utf->size;
716 /* transfer elements to new hashtable */
718 for (i = 0; i < hashtable_utf->size; i++) {
719 u = hashtable_utf->ptr[i];
/* every element is re-hashed against the new size and pushed onto the
   front of its new chain */
723 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
725 u->hashlink = (utf *) newhash->ptr[slot];
726 newhash->ptr[slot] = u;
728 /* follow link in external hash chain */
734 /* dispose old table */
736 hashtable_free(hashtable_utf);
738 hashtable_utf = newhash;
/* NOTE(review): after a resize hashtable_utf already points to newhash, so
   this exit uses the new table's header while the enter above used the old
   one -- confirm that hashtable_resize carries the header over. */
741 LOCK_MONITOR_EXIT(hashtable_utf->header);
/* utf_new_u2 ******************************************************************

   Make utf symbol from u2 array, if isclassname is true '.' is
   replaced by '/'.

*******************************************************************************/
/* Encodes `unicode_length` u2 characters starting at `unicode_pos` into a
   temporary byte buffer and interns the result with utf_new().  With
   `isclassname` set, '.' characters are treated specially while encoding
   (see the "convert classname" step). */
754 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
756 char *buffer; /* memory buffer for unicode characters */
757 char *pos; /* pointer to current position in buffer */
758 u4 left; /* unicode characters left */
759 u4 buflength; /* utf length in bytes of the u2 array */
760 utf *result; /* resulting utf-string */
763 /* determine utf length in bytes and allocate memory */
765 buflength = u2_utflength(unicode_pos, unicode_length);
766 buffer = MNEW(char, buflength);
/* i is incremented inside the loop condition, unicode_pos in the step */
771 for (i = 0; i++ < unicode_length; unicode_pos++) {
772 /* next unicode character */
/* same split as in u2_utflength: a zero character does not take the
   one-byte branch */
775 if ((c != 0) && (c < 0x80)) {
/* `left` is unsigned, the cast makes a wrapped-around value show up as
   negative */
778 if ((int) left < 0) break;
779 /* convert classname */
780 if (isclassname && c == '.')
785 } else if (c < 0x800) {
787 unsigned char high = c >> 6;
788 unsigned char low = c & 0x3F;
790 if ((int) left < 0) break;
/* 0xC0 marks the lead byte of a two-byte sequence */
791 *pos++ = high | 0xC0;
797 char mid = (c >> 6) & 0x3F;
800 if ((int) left < 0) break;
/* 0xE0 marks the lead byte of a three-byte sequence */
801 *pos++ = high | 0xE0;
807 /* insert utf-string into symbol-table */
/* NOTE(review): buflength is a u4 while the length parameter of utf_new is
   a u2 -- a buffer longer than 65535 bytes would be truncated; confirm
   callers cannot get there. */
808 result = utf_new(buffer,buflength);
/* utf_new stores its own copy of the text, the scratch buffer can go */
810 MFREE(buffer, char, buflength);
816 /* utf_new_char ****************************************************************
818 Creates a new utf symbol, the text for this symbol is passed as a
819 c-string ( = char* ).
821 *******************************************************************************/
823 utf *utf_new_char(const char *text)
825 return utf_new(text, strlen(text));
829 /* utf_new_char_classname ******************************************************
831 Creates a new utf symbol, the text for this symbol is passed as a
832 c-string ( = char* ) "." characters are going to be replaced by
833 "/". Since the above function is used often, this is a separte
834 function, instead of an if.
836 *******************************************************************************/
838 utf *utf_new_char_classname(const char *text)
840 if (strchr(text, '.')) {
841 char *txt = strdup(text);
842 char *end = txt + strlen(txt);
846 for (c = txt; c < end; c++)
847 if (*c == '.') *c = '/';
849 tmpRes = utf_new(txt, strlen(txt));
855 return utf_new(text, strlen(text));
859 /* utf_nextu2 ******************************************************************
861 Read the next unicode character from the utf string and increment
862 the utf-string pointer accordingly.
864 CAUTION: This function is unsafe for input that was not checked
867 *******************************************************************************/
/* Decodes the character at *utf_ptr and moves *utf_ptr past the bytes that
   were consumed.  The upper four bits of the first byte select the branch;
   continuation bytes are recognised by their top two bits being 10
   ((byte & 0xC0) == 0x80). */
869 u2 utf_nextu2(char **utf_ptr)
871 /* uncompressed unicode character */
873 /* current position in utf text */
874 unsigned char *utf = (unsigned char *) (*utf_ptr);
875 /* bytes representing the unicode character */
876 unsigned char ch1, ch2, ch3;
877 /* number of bytes used to represent the unicode character */
880 switch ((ch1 = utf[0]) >> 4) {
881 default: /* 1 byte */
885 case 0xD: /* 2 bytes */
886 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
/* five payload bits from the lead byte, six from the continuation byte */
887 unsigned char high = ch1 & 0x1F;
888 unsigned char low = ch2 & 0x3F;
889 unicode_char = (high << 6) + low;
894 case 0xE: /* 2 or 3 bytes */
895 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
896 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
/* four payload bits from the lead byte, six from each continuation byte */
897 unsigned char low = ch3 & 0x3f;
898 unsigned char mid = ch2 & 0x3f;
899 unsigned char high = ch1 & 0x0f;
900 unicode_char = (((high << 6) + mid) << 6) + low;
908 /* update position in utf-text */
909 *utf_ptr = (char *) (utf + len);
915 /* utf_bytes *******************************************************************
917 Determine number of bytes (aka. octets) in the utf string.
920 u............utf string
923 The number of octets of this utf string.
924 There is _no_ terminating zero included in this count.
926 *******************************************************************************/
934 /* utf_get_number_of_u2s_for_buffer ********************************************
936 Determine number of UTF-16 u2s in the given UTF-8 buffer
938 CAUTION: This function is unsafe for input that was not checked
941 CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
942 to an array of u2s (UTF-16) and want to know how many of them you will get.
943 All other uses of this function are probably wrong.
946 buffer........points to first char in buffer
947 blength.......number of _bytes_ in the buffer
950 the number of u2s needed to hold this string in UTF-16 encoding.
951 There is _no_ terminating zero included in this count.
953 NOTE: Unlike utf_get_number_of_u2s, this function never throws an
956 *******************************************************************************/
958 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
960 const char *endpos; /* points behind utf string */
961 const char *utf_ptr; /* current position in utf text */
962 u4 len = 0; /* number of unicode characters */
965 endpos = utf_ptr + blength;
967 while (utf_ptr < endpos) {
969 /* next unicode character */
970 utf_nextu2((char **)&utf_ptr);
973 assert(utf_ptr == endpos);
979 /* utf_get_number_of_u2s *******************************************************
981 Determine number of UTF-16 u2s in the utf string.
983 CAUTION: This function is unsafe for input that was not checked
986 CAUTION: Use this function *only* when you want to convert a utf string
987 to an array of u2s and want to know how many of them you will get.
988 All other uses of this function are probably wrong.
991 u............utf string
994 the number of u2s needed to hold this string in UTF-16 encoding.
995 There is _no_ terminating zero included in this count.
996 XXX 0 if a NullPointerException has been thrown (see below)
998 *******************************************************************************/
/* Counts the characters of the utf string `u` by stepping through its text
   with utf_nextu2() until the end position UTF_END(u) is reached.  Two
   exceptions can be raised: a NullPointerException, and an InternalError
   ("Illegal utf8 string") when decoding does not stop exactly at the end. */
1000 u4 utf_get_number_of_u2s(utf *u)
1002 char *endpos; /* points behind utf string */
1003 char *utf_ptr; /* current position in utf text */
1004 u4 len = 0; /* number of unicode characters */
1006 /* XXX this is probably not checked by most callers! Review this after */
1007 /* the invalid uses of this function have been eliminated */
1009 exceptions_throw_nullpointerexception();
1013 endpos = UTF_END(u);
1016 while (utf_ptr < endpos) {
1018 /* next unicode character */
1019 utf_nextu2(&utf_ptr);
/* a multi-byte sequence running over the end leaves utf_ptr behind endpos */
1022 if (utf_ptr != endpos) {
1023 /* string ended abruptly */
1024 exceptions_throw_internalerror("Illegal utf8 string");
1032 /* utf8_safe_number_of_u2s *****************************************************
1034 Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1035 (For invalid UTF-8 the U+fffd replacement character will be counted.)
1037 This function is safe even for invalid UTF-8 strings.
1040 text..........zero-terminated(!) UTF-8 string (may be invalid)
1042 nbytes........strlen(text). (This is needed to completely emulate
1046 the number of u2s needed to hold this string in UTF-16 encoding.
1047 There is _no_ terminating zero included in this count.
1049 *******************************************************************************/
/* Counts the u2s that decoding `text` produces.  The lead byte of every
   non-ASCII sequence is classified by its high bits (110, 1110, 11110,
   111110, 1111110) and the continuation bytes are checked for the 10......
   pattern.  Sequences of three or more bytes that are cut off by the end of
   the input end the count early with `len + 1`. */
1051 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1052 register const unsigned char *t;
1055 register const unsigned char *tlimit;
1063 assert(nbytes >= 0);
/* bytes are read as unsigned char so the masks below see values 0..255 */
1066 t = (const unsigned char *) text;
1067 tlimit = t + nbytes;
1069 /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1075 /* highest bit set, non-ASCII character */
1077 if ((byte & 0xe0) == 0xc0) {
1078 /* 2-byte: should be 110..... 10...... ? */
/* NOTE(review): no tlimit test is visible before this read; the contract
   asks for a zero-terminated text, so the byte read here is at worst the
   terminator -- confirm */
1080 if ((*t++ & 0xc0) == 0x80)
1081 ; /* valid 2-byte */
1085 else if ((byte & 0xf0) == 0xe0) {
1086 /* 3-byte: should be 1110.... 10...... 10...... */
1090 return len + 1; /* invalid, stop here */
1092 if ((*t++ & 0xc0) == 0x80) {
1093 if ((*t++ & 0xc0) == 0x80)
1094 ; /* valid 3-byte */
1101 else if ((byte & 0xf8) == 0xf0) {
1102 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1106 return len + 1; /* invalid, stop here */
1108 if (((byte1 = *t++) & 0xc0) == 0x80) {
1109 if (((byte2 = *t++) & 0xc0) == 0x80) {
1110 if (((byte3 = *t++) & 0xc0) == 0x80) {
1111 /* valid 4-byte UTF-8? */
1112 value = ((byte & 0x07) << 18)
1113 | ((byte1 & 0x3f) << 12)
1114 | ((byte2 & 0x3f) << 6)
1115 | ((byte3 & 0x3f) );
/* 0x10FFFF is the last Unicode code point; values above 0xFFFF do not fit
   into one u2 and are counted with an extra unit */
1117 if (value > 0x10FFFF)
1119 else if (value > 0xFFFF)
1120 len += 1; /* we need surrogates */
1122 ; /* 16bit suffice */
1133 else if ((byte & 0xfc) == 0xf8) {
1134 /* invalid 5-byte */
1136 return len + 1; /* invalid, stop here */
/* walk over the continuation bytes that follow, at most `skip` of them */
1139 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1142 else if ((byte & 0xfe) == 0xfc) {
1143 /* invalid 6-byte */
1145 return len + 1; /* invalid, stop here */
1148 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1160 /* ASCII character, common case */
1170 /* utf8_safe_convert_to_u2s ****************************************************
1172 Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1173 (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1174 Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1176 This function is safe even for invalid UTF-8 strings.
1179 text..........zero-terminated(!) UTF-8 string (may be invalid)
1181 nbytes........strlen(text). (This is needed to completely emulate
1183 buffer........a preallocated array of u2s to receive the decoded
1184 string. Use utf8_safe_number_of_u2s to get the
1185 required number of u2s for allocating this.
1187 *******************************************************************************/
/* U+FFFD, written to the output wherever the input is not valid UTF-8 */
1189 #define UNICODE_REPLACEMENT 0xfffd
/* Decodes `text` into `buffer`.  The branch structure mirrors
   utf8_safe_number_of_u2s: every place that counts a u2 there stores one
   here, either the decoded value or UNICODE_REPLACEMENT.  The function does
   not check the capacity of `buffer`. */
1191 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1192 register const unsigned char *t;
1194 register const unsigned char *tlimit;
1202 assert(nbytes >= 0);
1204 t = (const unsigned char *) text;
1205 tlimit = t + nbytes;
1207 /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1213 /* highest bit set, non-ASCII character */
1215 if ((byte & 0xe0) == 0xc0) {
1216 /* 2-byte: should be 110..... 10...... */
1218 if (((byte1 = *t++) & 0xc0) == 0x80) {
1219 /* valid 2-byte UTF-8 */
1220 *buffer++ = ((byte & 0x1f) << 6)
1221 | ((byte1 & 0x3f) );
1224 *buffer++ = UNICODE_REPLACEMENT;
1228 else if ((byte & 0xf0) == 0xe0) {
1229 /* 3-byte: should be 1110.... 10...... 10...... */
/* both continuation bytes have to lie inside the input */
1231 if (t + 2 > tlimit) {
1232 *buffer++ = UNICODE_REPLACEMENT;
1236 if (((byte1 = *t++) & 0xc0) == 0x80) {
1237 if (((byte2 = *t++) & 0xc0) == 0x80) {
1238 /* valid 3-byte UTF-8 */
1239 *buffer++ = ((byte & 0x0f) << 12)
1240 | ((byte1 & 0x3f) << 6)
1241 | ((byte2 & 0x3f) );
1244 *buffer++ = UNICODE_REPLACEMENT;
1249 *buffer++ = UNICODE_REPLACEMENT;
1253 else if ((byte & 0xf8) == 0xf0) {
1254 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1256 if (t + 3 > tlimit) {
1257 *buffer++ = UNICODE_REPLACEMENT;
1261 if (((byte1 = *t++) & 0xc0) == 0x80) {
1262 if (((byte2 = *t++) & 0xc0) == 0x80) {
1263 if (((byte3 = *t++) & 0xc0) == 0x80) {
1264 /* valid 4-byte UTF-8? */
1265 value = ((byte & 0x07) << 18)
1266 | ((byte1 & 0x3f) << 12)
1267 | ((byte2 & 0x3f) << 6)
1268 | ((byte3 & 0x3f) );
1270 if (value > 0x10FFFF) {
1271 *buffer++ = UNICODE_REPLACEMENT;
1273 else if (value > 0xFFFF) {
1274 /* we need surrogates */
/* (value >> 10) - 0x40 equals (value - 0x10000) >> 10, the ten high bits of
   the offset; the low ten bits go into the second unit */
1275 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1276 *buffer++ = 0xdc00 | (value & 0x03ff);
1279 *buffer++ = value; /* 16bit suffice */
1282 *buffer++ = UNICODE_REPLACEMENT;
1287 *buffer++ = UNICODE_REPLACEMENT;
1292 *buffer++ = UNICODE_REPLACEMENT;
/* five- and six-byte forms yield a single replacement character each */
1296 else if ((byte & 0xfc) == 0xf8) {
1297 if (t + 4 > tlimit) {
1298 *buffer++ = UNICODE_REPLACEMENT;
1303 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1305 *buffer++ = UNICODE_REPLACEMENT;
1307 else if ((byte & 0xfe) == 0xfc) {
1308 if (t + 5 > tlimit) {
1309 *buffer++ = UNICODE_REPLACEMENT;
1314 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1316 *buffer++ = UNICODE_REPLACEMENT;
1319 *buffer++ = UNICODE_REPLACEMENT;
1327 /* ASCII character, common case */
1335 /* u2_utflength ****************************************************************
1337 Returns the utf length in bytes of a u2 array.
   IN:
      text.........the u2 array to measure
      u2_length....number of u2 elements the loop below iterates over
1339 *******************************************************************************/
1341 u4 u2_utflength(u2 *text, u4 u2_length)
1343 u4 result_len = 0; /* utf length in bytes */
1344 u2 ch; /* current unicode character */
1347 for (len = 0; len < u2_length; len++) {
1348 /* next unicode character */
1351 /* determine bytes required to store unicode character as utf */
/* a zero character fails this test (ch must be non-zero), so it is not
   sized by the one-byte case but by the ch < 0x800 branch that follows */
1352 if (ch && (ch < 0x80))
1354 else if (ch < 0x800)
1364 /* utf_copy ********************************************************************
1366 Copy the given utf string byte-for-byte to a buffer.
1369 buffer.......the buffer
1370 u............the utf string
1372 *******************************************************************************/
1374 void utf_copy(char *buffer, utf *u)
1376 /* our utf strings are zero-terminated (done by utf_new) */
1377 MCOPY(buffer, u->text, char, u->blength + 1);
1381 /* utf_cat *********************************************************************
1383 Append the given utf string byte-for-byte to a buffer.
1386 buffer.......the buffer
1387 u............the utf string
1389 *******************************************************************************/
1391 void utf_cat(char *buffer, utf *u)
1393 /* our utf strings are zero-terminated (done by utf_new) */
1394 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1398 /* utf_copy_classname **********************************************************
1400 Copy the given utf classname byte-for-byte to a buffer.
1401 '/' is replaced by '.'
1404 buffer.......the buffer
1405 u............the utf string
1407 *******************************************************************************/
1409 void utf_copy_classname(char *buffer, utf *u)
/* the end pointer is placed one byte past UTF_END(u) so that the loop
   below also visits the terminating zero byte */
1418 endptr = UTF_END(u) + 1; /* utfs are zero-terminated by utf_new */
/* walk the source one byte at a time until the end pointer is reached */
1420 while (srcptr != endptr) {
1429 /* utf_cat *********************************************************************
1431 Append the given utf classname byte-for-byte to a buffer.
1432 '/' is replaced by '.'
1435 buffer.......the buffer
1436 u............the utf string
1438 *******************************************************************************/
1440 void utf_cat_classname(char *buffer, utf *u)
1442 utf_copy_classname(buffer + strlen(buffer), u);
1445 /* utf_display_printable_ascii *************************************************
1447 Write utf symbol to stdout (for debugging purposes).
1448 Non-printable and non-ASCII characters are printed as '?'.
   IN:
      u............the utf string to display
1450 *******************************************************************************/
1452 void utf_display_printable_ascii(utf *u)
1454 char *endpos; /* points behind utf string */
1455 char *utf_ptr; /* current position in utf text */
1463 endpos = UTF_END(u);
/* decode one character per iteration until the cursor reaches endpos */
1466 while (utf_ptr < endpos) {
1467 /* read next unicode character */
/* utf_nextu2 is handed the address of the cursor, presumably so that it
   can advance it past the decoded character */
1469 u2 c = utf_nextu2(&utf_ptr);
/* codes 32..127 inclusive take this branch; note that 127 (DEL) lies
   inside the range */
1471 if ((c >= 32) && (c <= 127))
1481 /* utf_display_printable_ascii_classname ***************************************
1483 Write utf symbol to stdout with `/' converted to `.' (for debugging
1485 Non-printable and non-ASCII characters are printed as '?'.
   IN:
      u............the utf string holding the classname
1487 *******************************************************************************/
1489 void utf_display_printable_ascii_classname(utf *u)
1491 char *endpos; /* points behind utf string */
1492 char *utf_ptr; /* current position in utf text */
1500 endpos = UTF_END(u);
/* decode one character per iteration until the cursor reaches endpos */
1503 while (utf_ptr < endpos) {
1504 /* read next unicode character */
/* utf_nextu2 is handed the address of the cursor, presumably so that it
   can advance it past the decoded character */
1506 u2 c = utf_nextu2(&utf_ptr);
/* codes 32..127 inclusive take this branch; note that 127 (DEL) lies
   inside the range */
1511 if ((c >= 32) && (c <= 127))
1521 /* utf_sprint_convert_to_latin1 ************************************************
1523 Write utf symbol into c-string (for debugging purposes).
1524 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
   IN:
      buffer.......destination c-string
      u............the utf string to convert
   NOTE(review): no buffer size is passed in, so the caller presumably has
   to provide a sufficiently large buffer -- confirm.
1527 *******************************************************************************/
1529 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1531 char *endpos; /* points behind utf string */
1532 char *utf_ptr; /* current position in utf text */
1533 u2 pos = 0; /* position in c-string */
/* fallback text written into buffer; NOTE(review): the condition guarding
   it is not visible here, presumably a missing u -- confirm */
1536 strcpy(buffer, "NULL");
1540 endpos = UTF_END(u);
1543 while (utf_ptr < endpos)
1544 /* copy next unicode character */
/* the decoded character is stored into a char element, i.e. it is
   narrowed to the width of char by this assignment */
1545 buffer[pos++] = utf_nextu2(&utf_ptr);
1547 /* terminate string */
1552 /* utf_sprint_convert_to_latin1_classname **************************************
1554 Write utf symbol into c-string with `/' converted to `.' (for debugging
1556 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
   IN:
      buffer.......destination c-string
      u............the utf string holding the classname
   NOTE(review): no buffer size is passed in, so the caller presumably has
   to provide a sufficiently large buffer -- confirm.
1559 *******************************************************************************/
1561 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1563 char *endpos; /* points behind utf string */
1564 char *utf_ptr; /* current position in utf text */
1565 u2 pos = 0; /* position in c-string */
/* fallback text written into buffer; NOTE(review): the condition guarding
   it is not visible here, presumably a missing u -- confirm */
1568 strcpy(buffer, "NULL");
1572 endpos = UTF_END(u);
/* decode one character per iteration until the cursor reaches endpos */
1575 while (utf_ptr < endpos) {
1576 /* copy next unicode character */
1577 u2 c = utf_nextu2(&utf_ptr);
/* package separator of the internal class name becomes a dot */
1578 if (c == '/') c = '.';
1582 /* terminate string */
1587 /* utf_strcat_convert_to_latin1 ************************************************
1589 Like libc strcat, but uses an utf8 string.
1590 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1593 *******************************************************************************/
1595 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1597 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1601 /* utf_strcat_convert_to_latin1_classname **************************************
1603 Like libc strcat, but uses an utf8 string.
1604 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1607 *******************************************************************************/
1609 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1611 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1615 /* utf_fprint_printable_ascii **************************************************
1617 Write utf symbol into file.
1618 Non-printable and non-ASCII characters are printed as '?'.
   IN:
      file.........the stream written to with fprintf
      u............the utf string to print
1620 *******************************************************************************/
1622 void utf_fprint_printable_ascii(FILE *file, utf *u)
1624 char *endpos; /* points behind utf string */
1625 char *utf_ptr; /* current position in utf text */
1630 endpos = UTF_END(u);
/* decode one character per iteration until the cursor reaches endpos */
1633 while (utf_ptr < endpos) {
1634 /* read next unicode character */
1635 u2 c = utf_nextu2(&utf_ptr);
/* codes 32..127 inclusive are written as they are (127, DEL, included);
   every other code is replaced by a question mark */
1637 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1638 else fprintf(file, "?");
1643 /* utf_fprint_printable_ascii_classname ****************************************
1645 Write utf symbol into file with `/' converted to `.'.
1646 Non-printable and non-ASCII characters are printed as '?'.
   IN:
      file.........the stream written to with fprintf
      u............the utf string holding the classname
1648 *******************************************************************************/
1650 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1652 char *endpos; /* points behind utf string */
1653 char *utf_ptr; /* current position in utf text */
1658 endpos = UTF_END(u);
/* decode one character per iteration until the cursor reaches endpos */
1661 while (utf_ptr < endpos) {
1662 /* read next unicode character */
1663 u2 c = utf_nextu2(&utf_ptr);
/* package separator of the internal class name becomes a dot */
1664 if (c == '/') c = '.';
/* codes 32..127 inclusive are written as they are (127, DEL, included);
   every other code is replaced by a question mark */
1666 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1667 else fprintf(file, "?");
1672 /* is_valid_utf ****************************************************************
1674 Return true if the given string is a valid UTF-8 string.
1676 utf_ptr...points to first character
1677 end_pos...points after last character
   The checks visible below reject: an end pointer before the start
   pointer, a zero byte, an invalid leading byte, sequences with more than
   two continuation bytes, truncated sequences and malformed continuation
   bytes.
1679 *******************************************************************************/
1681 /* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */
1683 bool is_valid_utf(char *utf_ptr, char *end_pos)
1690 if (end_pos < utf_ptr) return false;
/* number of bytes that remain to be checked */
1691 bytes = end_pos - utf_ptr;
1695 if (!c) return false; /* 0x00 is not allowed */
1696 if ((c & 0x80) == 0) continue; /* ASCII */
/* classify the leading byte; len is the number of continuation bytes
   that have to follow it */
1698 if ((c & 0xe0) == 0xc0) len = 1; /* 110x xxxx */
1699 else if ((c & 0xf0) == 0xe0) len = 2; /* 1110 xxxx */
1700 else if ((c & 0xf8) == 0xf0) len = 3; /* 1111 0xxx */
1701 else if ((c & 0xfc) == 0xf8) len = 4; /* 1111 10xx */
1702 else if ((c & 0xfe) == 0xfc) len = 5; /* 1111 110x */
1703 else return false; /* invalid leading byte */
/* only sequences of at most three bytes (len <= 2) get past this point */
1705 if (len > 2) return false; /* Java limitation */
/* keep the payload bits of the leading byte: mask 0x1f for len 1 and
   0x0f for len 2 */
1707 v = (unsigned long)c & (0x3f >> len);
1709 if ((bytes -= len) < 0) return false; /* missing bytes */
/* visit each of the len continuation bytes */
1711 for (i = len; i--; ) {
1713 if ((c & 0xc0) != 0x80) /* 10xx xxxx */
/* append the six payload bits of the continuation byte */
1715 v = (v << 6) | (c & 0x3f);
/* NOTE(review): the condition guarding this check is not visible here;
   presumably it applies to a decoded value of zero -- confirm */
1719 if (len != 1) return false; /* Java special */
1722 /* Sun Java seems to allow overlong UTF-8 encodings */
1724 /* if (v < min_codepoint[len]) */
1725 /* XXX throw exception? */
1728 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
1729 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
1731 /* even these seem to be allowed */
1732 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
1739 /* is_valid_name ***************************************************************
1741 Return true if the given string may be used as a class/field/method
1742 name. (Currently this only disallows empty strings and control
1745 NOTE: The string is assumed to have passed is_valid_utf!
1747 utf_ptr...points to first character
1748 end_pos...points after last character
1750 *******************************************************************************/
1752 bool is_valid_name(char *utf_ptr, char *end_pos)
1754 if (end_pos <= utf_ptr) return false; /* disallow empty names */
/* inspect the string one byte at a time */
1756 while (utf_ptr < end_pos) {
1757 unsigned char c = *utf_ptr++;
1759 if (c < 0x20) return false; /* disallow control characters */
/* the byte pair 0xc0 0x80 is the two-byte encoding of the value zero;
   this test reads the byte following c, which lies inside the string as
   long as the input passed is_valid_utf (a 0xc0 leading byte without its
   continuation byte is rejected there as "missing bytes") */
1760 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80) /* disallow zero */
1767 bool is_valid_name_utf(utf *u)
1769 return is_valid_name(u->text, UTF_END(u));
1773 /* utf_show ********************************************************************
1775 Writes the utf symbols in the utfhash to stdout and displays the
1776 number of external hash chains grouped according to the chainlength
1777 (for debugging purposes).
   Only compiled when NDEBUG is not defined.
1779 *******************************************************************************/
1781 #if !defined(NDEBUG)
1785 #define CHAIN_LIMIT 20 /* limit for separated enumeration */
1787 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1788 u4 max_chainlength = 0; /* maximum length of the chains */
1789 u4 sum_chainlength = 0; /* sum of the chainlengths */
1790 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1793 printf("UTF-HASH:\n");
1795 /* show element of utf-hashtable */
1797 for (i = 0; i < hashtable_utf->size; i++) {
1798 utf *u = hashtable_utf->ptr[i];
1801 printf("SLOT %d: ", (int) i);
1805 utf_display_printable_ascii(u);
1813 printf("UTF-HASH: %d slots for %d entries\n",
1814 (int) hashtable_utf->size, (int) hashtable_utf->entries );
/* the percentages printed further down divide by hashtable_utf->entries */
1816 if (hashtable_utf->entries == 0)
1819 printf("chains:\n chainlength number of chains %% of utfstrings\n");
1821 for (i=0;i<CHAIN_LIMIT;i++)
1824 /* count numbers of hashchains according to their length */
1825 for (i=0; i<hashtable_utf->size; i++) {
1827 utf *u = (utf*) hashtable_utf->ptr[i];
1828 u4 chain_length = 0;
1830 /* determine chainlength */
1836 /* update sum of all chainlengths */
1837 sum_chainlength+=chain_length;
1839 /* determine the maximum length of the chains */
1840 if (chain_length>max_chainlength)
1841 max_chainlength = chain_length;
1843 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
/* chains of CHAIN_LIMIT or more symbols are pooled in the last bucket.
   NOTE(review): the comments say ">=CHAIN_LIMIT-1" but the test below is
   ">=CHAIN_LIMIT", so a chain of exactly CHAIN_LIMIT-1 symbols is counted
   in the last bucket without being added to beyond_limit -- confirm
   which of the two is intended */
1844 if (chain_length>=CHAIN_LIMIT) {
1845 beyond_limit+=chain_length;
1846 chain_length=CHAIN_LIMIT-1;
1849 /* update number of hashchains of current length */
1850 chain_count[chain_length]++;
1853 /* display results */
/* one line per chain length 1..CHAIN_LIMIT-2; the share is the number of
   symbols in chains of that length relative to all entries */
1854 for (i=1;i<CHAIN_LIMIT-1;i++)
1855 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
/* summary line for the pooled last bucket */
1857 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1860 printf("max. chainlength:%5d\n",max_chainlength);
1862 /* avg. chainlength = sum of chainlengths / number of chains */
/* the divisor is the slot count minus the chains of length zero */
1863 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1865 #endif /* !defined(NDEBUG) */
1869 * These are local overrides for various environment variables in Emacs.
1870 * Please do not remove this and leave it at the end of the file, where
1871 * Emacs will automagically detect them.
1872 * ---------------------------------------------------------------------
1875 * indent-tabs-mode: t
1879 * vim:noexpandtab:sw=4:ts=4: