1 /* src/vmcore/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
4 C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5 E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6 J. Wenninger, Institut f. Computersprachen - TU Wien
8 This file is part of CACAO.
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License as
12 published by the Free Software Foundation; either version 2, or (at
13 your option) any later version.
15 This program is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25 $Id: utf8.c 7813 2007-04-25 19:20:13Z twisti $
37 #include "mm/memory.h"
39 #include "threads/lock-common.h"
41 #include "toolbox/hashtable.h"
43 #include "vm/exceptions.h"
45 #include "vmcore/options.h"
47 #if defined(ENABLE_STATISTICS)
48 # include "vmcore/statistics.h"
51 #include "vmcore/utf8.h"
54 /* global variables ***********************************************************/
56 /* hashsize must be power of 2 */
58 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
60 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
63 /* utf-symbols for pointer comparison of frequently used strings **************/
65 utf *utf_java_lang_Object;
67 utf *utf_java_lang_Class;
68 utf *utf_java_lang_ClassLoader;
69 utf *utf_java_lang_Cloneable;
70 utf *utf_java_lang_SecurityManager;
71 utf *utf_java_lang_String;
72 utf *utf_java_lang_System;
73 utf *utf_java_lang_ThreadGroup;
74 utf *utf_java_lang_ref_SoftReference;
75 utf *utf_java_lang_ref_WeakReference;
76 utf *utf_java_lang_ref_PhantomReference;
77 utf *utf_java_io_Serializable;
79 utf *utf_java_lang_Throwable;
80 utf *utf_java_lang_Error;
82 utf *utf_java_lang_AbstractMethodError;
83 utf *utf_java_lang_ClassCircularityError;
84 utf *utf_java_lang_ClassFormatError;
85 utf *utf_java_lang_ExceptionInInitializerError;
86 utf *utf_java_lang_IncompatibleClassChangeError;
87 utf *utf_java_lang_InstantiationError;
88 utf *utf_java_lang_InternalError;
89 utf *utf_java_lang_LinkageError;
90 utf *utf_java_lang_NoClassDefFoundError;
91 utf *utf_java_lang_NoSuchFieldError;
92 utf *utf_java_lang_NoSuchMethodError;
93 utf *utf_java_lang_OutOfMemoryError;
94 utf *utf_java_lang_UnsatisfiedLinkError;
95 utf *utf_java_lang_UnsupportedClassVersionError;
96 utf *utf_java_lang_VerifyError;
97 utf *utf_java_lang_VirtualMachineError;
99 #if defined(WITH_CLASSPATH_GNU)
100 utf *utf_java_lang_VMThrowable;
103 utf *utf_java_lang_Exception;
105 utf *utf_java_lang_ArithmeticException;
106 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
107 utf *utf_java_lang_ArrayStoreException;
108 utf *utf_java_lang_ClassCastException;
109 utf *utf_java_lang_ClassNotFoundException;
110 utf *utf_java_lang_CloneNotSupportedException;
111 utf *utf_java_lang_IllegalAccessException;
112 utf *utf_java_lang_IllegalArgumentException;
113 utf *utf_java_lang_IllegalMonitorStateException;
114 utf *utf_java_lang_InstantiationException;
115 utf *utf_java_lang_InterruptedException;
116 utf *utf_java_lang_NegativeArraySizeException;
117 utf *utf_java_lang_NullPointerException;
118 utf *utf_java_lang_StringIndexOutOfBoundsException;
120 utf *utf_java_lang_reflect_InvocationTargetException;
122 #if defined(ENABLE_JAVASE)
123 utf* utf_java_lang_Void;
126 utf* utf_java_lang_Boolean;
127 utf* utf_java_lang_Byte;
128 utf* utf_java_lang_Character;
129 utf* utf_java_lang_Short;
130 utf* utf_java_lang_Integer;
131 utf* utf_java_lang_Long;
132 utf* utf_java_lang_Float;
133 utf* utf_java_lang_Double;
135 #if defined(ENABLE_JAVASE)
136 utf *utf_java_lang_StackTraceElement;
137 utf *utf_java_lang_reflect_Constructor;
138 utf *utf_java_lang_reflect_Field;
139 utf *utf_java_lang_reflect_Method;
140 utf *utf_java_util_Vector;
143 utf *utf_InnerClasses; /* InnerClasses */
144 utf *utf_ConstantValue; /* ConstantValue */
145 utf *utf_Code; /* Code */
146 utf *utf_Exceptions; /* Exceptions */
147 utf *utf_LineNumberTable; /* LineNumberTable */
148 utf *utf_SourceFile; /* SourceFile */
150 #if defined(ENABLE_JAVASE)
151 utf *utf_EnclosingMethod;
153 utf *utf_RuntimeVisibleAnnotations;
154 utf *utf_StackMapTable;
157 utf *utf_init; /* <init> */
158 utf *utf_clinit; /* <clinit> */
159 utf *utf_clone; /* clone */
160 utf *utf_finalize; /* finalize */
161 utf *utf_run; /* run */
166 utf *utf_removeThread;
171 utf *utf_fillInStackTrace;
172 utf *utf_getSystemClassLoader;
174 utf *utf_printStackTrace;
185 utf *utf_void__void; /* ()V */
186 utf *utf_boolean__void; /* (Z)V */
187 utf *utf_byte__void; /* (B)V */
188 utf *utf_char__void; /* (C)V */
189 utf *utf_short__void; /* (S)V */
190 utf *utf_int__void; /* (I)V */
191 utf *utf_long__void; /* (J)V */
192 utf *utf_float__void; /* (F)V */
193 utf *utf_double__void; /* (D)V */
195 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
196 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
197 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
198 utf *utf_java_lang_Object__java_lang_Object;
199 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
200 utf *utf_java_lang_String__java_lang_Class;
201 utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */
202 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
204 utf *utf_not_named_yet; /* special name for unnamed classes */
206 utf *array_packagename;
209 /* utf_init ********************************************************************
211 Initializes the utf8 subsystem.
213 *******************************************************************************/
217 /* create utf8 hashtable */
219 hashtable_utf = NEW(hashtable);
221 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
223 #if defined(ENABLE_STATISTICS)
225 count_utf_len += sizeof(utf*) * hashtable_utf->size;
228 /* create utf-symbols for pointer comparison of frequently used strings */
230 utf_java_lang_Object = utf_new_char("java/lang/Object");
232 utf_java_lang_Class = utf_new_char("java/lang/Class");
233 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
234 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
235 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
236 utf_java_lang_String = utf_new_char("java/lang/String");
237 utf_java_lang_System = utf_new_char("java/lang/System");
238 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
240 utf_java_lang_ref_SoftReference =
241 utf_new_char("java/lang/ref/SoftReference");
243 utf_java_lang_ref_WeakReference =
244 utf_new_char("java/lang/ref/WeakReference");
246 utf_java_lang_ref_PhantomReference =
247 utf_new_char("java/lang/ref/PhantomReference");
249 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
251 utf_java_lang_Throwable = utf_new_char("java/lang/Throwable");
252 utf_java_lang_Error = utf_new_char("java/lang/Error");
254 utf_java_lang_ClassCircularityError =
255 utf_new_char("java/lang/ClassCircularityError");
257 utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
259 utf_java_lang_ExceptionInInitializerError =
260 utf_new_char("java/lang/ExceptionInInitializerError");
262 utf_java_lang_IncompatibleClassChangeError =
263 utf_new_char("java/lang/IncompatibleClassChangeError");
265 utf_java_lang_InstantiationError =
266 utf_new_char("java/lang/InstantiationError");
268 utf_java_lang_InternalError = utf_new_char("java/lang/InternalError");
269 utf_java_lang_LinkageError = utf_new_char("java/lang/LinkageError");
271 utf_java_lang_NoClassDefFoundError =
272 utf_new_char("java/lang/NoClassDefFoundError");
274 utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
276 utf_java_lang_UnsatisfiedLinkError =
277 utf_new_char("java/lang/UnsatisfiedLinkError");
279 utf_java_lang_UnsupportedClassVersionError =
280 utf_new_char("java/lang/UnsupportedClassVersionError");
282 utf_java_lang_VerifyError = utf_new_char("java/lang/VerifyError");
284 utf_java_lang_VirtualMachineError =
285 utf_new_char("java/lang/VirtualMachineError");
287 #if defined(ENABLE_JAVASE)
288 utf_java_lang_AbstractMethodError =
289 utf_new_char("java/lang/AbstractMethodError");
291 utf_java_lang_NoSuchFieldError =
292 utf_new_char("java/lang/NoSuchFieldError");
294 utf_java_lang_NoSuchMethodError =
295 utf_new_char("java/lang/NoSuchMethodError");
298 #if defined(WITH_CLASSPATH_GNU)
299 utf_java_lang_VMThrowable = utf_new_char("java/lang/VMThrowable");
302 utf_java_lang_Exception = utf_new_char("java/lang/Exception");
304 utf_java_lang_ArithmeticException =
305 utf_new_char("java/lang/ArithmeticException");
307 utf_java_lang_ArrayIndexOutOfBoundsException =
308 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
310 utf_java_lang_ArrayStoreException =
311 utf_new_char("java/lang/ArrayStoreException");
313 utf_java_lang_ClassCastException =
314 utf_new_char("java/lang/ClassCastException");
316 utf_java_lang_ClassNotFoundException =
317 utf_new_char("java/lang/ClassNotFoundException");
319 utf_java_lang_CloneNotSupportedException =
320 utf_new_char("java/lang/CloneNotSupportedException");
322 utf_java_lang_IllegalAccessException =
323 utf_new_char("java/lang/IllegalAccessException");
325 utf_java_lang_IllegalArgumentException =
326 utf_new_char("java/lang/IllegalArgumentException");
328 utf_java_lang_IllegalMonitorStateException =
329 utf_new_char("java/lang/IllegalMonitorStateException");
331 utf_java_lang_InstantiationException =
332 utf_new_char("java/lang/InstantiationException");
334 utf_java_lang_InterruptedException =
335 utf_new_char("java/lang/InterruptedException");
337 utf_java_lang_NegativeArraySizeException =
338 utf_new_char("java/lang/NegativeArraySizeException");
340 utf_java_lang_NullPointerException =
341 utf_new_char("java/lang/NullPointerException");
343 utf_java_lang_StringIndexOutOfBoundsException =
344 utf_new_char("java/lang/StringIndexOutOfBoundsException");
346 utf_java_lang_reflect_InvocationTargetException =
347 utf_new_char("java/lang/reflect/InvocationTargetException");
349 #if defined(ENABLE_JAVASE)
350 utf_java_lang_Void = utf_new_char("java/lang/Void");
353 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
354 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
355 utf_java_lang_Character = utf_new_char("java/lang/Character");
356 utf_java_lang_Short = utf_new_char("java/lang/Short");
357 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
358 utf_java_lang_Long = utf_new_char("java/lang/Long");
359 utf_java_lang_Float = utf_new_char("java/lang/Float");
360 utf_java_lang_Double = utf_new_char("java/lang/Double");
362 #if defined(ENABLE_JAVASE)
363 utf_java_lang_StackTraceElement =
364 utf_new_char("java/lang/StackTraceElement");
366 utf_java_lang_reflect_Constructor =
367 utf_new_char("java/lang/reflect/Constructor");
369 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
370 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
371 utf_java_util_Vector = utf_new_char("java/util/Vector");
374 utf_InnerClasses = utf_new_char("InnerClasses");
375 utf_ConstantValue = utf_new_char("ConstantValue");
376 utf_Code = utf_new_char("Code");
377 utf_Exceptions = utf_new_char("Exceptions");
378 utf_LineNumberTable = utf_new_char("LineNumberTable");
379 utf_SourceFile = utf_new_char("SourceFile");
381 #if defined(ENABLE_JAVASE)
382 utf_EnclosingMethod = utf_new_char("EnclosingMethod");
383 utf_Signature = utf_new_char("Signature");
384 utf_RuntimeVisibleAnnotations = utf_new_char("RuntimeVisibleAnnotations");
385 utf_StackMapTable = utf_new_char("StackMapTable");
388 utf_init = utf_new_char("<init>");
389 utf_clinit = utf_new_char("<clinit>");
390 utf_clone = utf_new_char("clone");
391 utf_finalize = utf_new_char("finalize");
392 utf_run = utf_new_char("run");
394 utf_add = utf_new_char("add");
395 utf_remove = utf_new_char("remove");
396 utf_addThread = utf_new_char("addThread");
397 utf_removeThread = utf_new_char("removeThread");
398 utf_put = utf_new_char("put");
399 utf_get = utf_new_char("get");
400 utf_value = utf_new_char("value");
402 utf_printStackTrace = utf_new_char("printStackTrace");
403 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
404 utf_loadClass = utf_new_char("loadClass");
405 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
407 utf_Z = utf_new_char("Z");
408 utf_B = utf_new_char("B");
409 utf_C = utf_new_char("C");
410 utf_S = utf_new_char("S");
411 utf_I = utf_new_char("I");
412 utf_J = utf_new_char("J");
413 utf_F = utf_new_char("F");
414 utf_D = utf_new_char("D");
416 utf_void__void = utf_new_char("()V");
417 utf_boolean__void = utf_new_char("(Z)V");
418 utf_byte__void = utf_new_char("(B)V");
419 utf_char__void = utf_new_char("(C)V");
420 utf_short__void = utf_new_char("(S)V");
421 utf_int__void = utf_new_char("(I)V");
422 utf_long__void = utf_new_char("(J)V");
423 utf_float__void = utf_new_char("(F)V");
424 utf_double__void = utf_new_char("(D)V");
425 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
426 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
428 utf_void__java_lang_ClassLoader =
429 utf_new_char("()Ljava/lang/ClassLoader;");
431 utf_java_lang_Object__java_lang_Object =
432 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
434 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
436 utf_java_lang_String__java_lang_Class =
437 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
439 utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V");
440 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
442 utf_null = utf_new_char("null");
443 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
444 array_packagename = utf_new_char("\t<the array package>");
446 /* everything's ok */
452 /* utf_hashkey *****************************************************************
454 The hashkey is computed from the utf-text by using up to 8
455 characters. For utf-symbols longer than 15 characters 3 characters
456 are taken from the beginning and the end, 2 characters are taken from the middle.
459 *******************************************************************************/
461 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
462 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
464 u4 utf_hashkey(const char *text, u4 length)
466 const char *start_pos = text; /* pointer to utf text */
470 case 0: /* empty string */
473 case 1: return fbs(0);
474 case 2: return fbs(0) ^ nbs(3);
475 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
476 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
477 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
478 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
479 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
480 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
487 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
496 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
505 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
517 return a ^ nbs(9) ^ nbs(10);
529 return a ^ nbs(9) ^ nbs(10);
540 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
551 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
553 default: /* 3 characters from beginning */
559 /* 2 characters from middle */
560 text = start_pos + (length / 2);
565 /* 3 characters from end */
566 text = start_pos + length - 4;
571 return a ^ nbs(10) ^ nbs(11);
575 /* utf_full_hashkey ************************************************************
577 This function computes a hash value using all bytes in the string.
579 The algorithm is the "One-at-a-time" algorithm as published
580 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
582 *******************************************************************************/
584 u4 utf_full_hashkey(const char *text, u4 length)
586 register const unsigned char *p = (const unsigned char *) text;
594 hash += (hash << 10);
598 hash ^= (hash >> 11);
599 hash += (hash << 15);
604 /* unicode_hashkey *************************************************************
606 Compute the hashkey of a unicode string.
608 *******************************************************************************/
610 u4 unicode_hashkey(u2 *text, u2 len)
612 return utf_hashkey((char *) text, len);
616 /* utf_new *********************************************************************
618 Creates a new utf-symbol, the text of the symbol is passed as a
619 u1-array. The function searches the utf-hashtable for a utf-symbol
620 with this text. On success the element is returned, otherwise a new
621 hashtable element is created.
623 If the number of entries in the hashtable exceeds twice the size of
624 the hashtable slots a reorganization of the hashtable is done and
625 the utf symbols are copied to a new hashtable with doubled size.
627 *******************************************************************************/
629 utf *utf_new(const char *text, u2 length)
631 u4 key; /* hashkey computed from utf-text */
632 u4 slot; /* slot in hashtable */
633 utf *u; /* hashtable element */
636 LOCK_MONITOR_ENTER(hashtable_utf->header);
638 #if defined(ENABLE_STATISTICS)
643 key = utf_hashkey(text, length);
644 slot = key & (hashtable_utf->size - 1);
645 u = hashtable_utf->ptr[slot];
647 /* search external hash chain for utf-symbol */
650 if (u->blength == length) {
651 /* compare text of hashtable elements */
653 for (i = 0; i < length; i++)
654 if (text[i] != u->text[i])
657 #if defined(ENABLE_STATISTICS)
659 count_utf_new_found++;
662 /* symbol found in hashtable */
664 LOCK_MONITOR_EXIT(hashtable_utf->header);
670 u = u->hashlink; /* next element in external chain */
673 /* location in hashtable found, create new utf element */
677 u->blength = length; /* length in bytes of utfstring */
678 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
679 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
681 memcpy(u->text, text, length); /* copy utf-text */
682 u->text[length] = '\0';
684 #if defined(ENABLE_STATISTICS)
686 count_utf_len += sizeof(utf) + length + 1;
689 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
690 hashtable_utf->entries++; /* update number of entries */
692 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
694 /* reorganization of hashtable, average length of the external
695 chains is approx. 2 */
697 hashtable *newhash; /* the new hashtable */
703 /* create new hashtable, double the size */
705 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
707 #if defined(ENABLE_STATISTICS)
709 count_utf_len += sizeof(utf*) * hashtable_utf->size;
712 /* transfer elements to new hashtable */
714 for (i = 0; i < hashtable_utf->size; i++) {
715 u = hashtable_utf->ptr[i];
719 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
721 u->hashlink = (utf *) newhash->ptr[slot];
722 newhash->ptr[slot] = u;
724 /* follow link in external hash chain */
730 /* dispose old table */
732 hashtable_free(hashtable_utf);
734 hashtable_utf = newhash;
737 LOCK_MONITOR_EXIT(hashtable_utf->header);
743 /* utf_new_u2 ******************************************************************
745 Make utf symbol from u2 array, if isclassname is true '.' is
748 *******************************************************************************/
750 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
752 char *buffer; /* memory buffer for unicode characters */
753 char *pos; /* pointer to current position in buffer */
754 u4 left; /* unicode characters left */
755 u4 buflength; /* utf length in bytes of the u2 array */
756 utf *result; /* resulting utf-string */
759 /* determine utf length in bytes and allocate memory */
761 buflength = u2_utflength(unicode_pos, unicode_length);
762 buffer = MNEW(char, buflength);
767 for (i = 0; i++ < unicode_length; unicode_pos++) {
768 /* next unicode character */
771 if ((c != 0) && (c < 0x80)) {
774 if ((int) left < 0) break;
775 /* convert classname */
776 if (isclassname && c == '.')
781 } else if (c < 0x800) {
783 unsigned char high = c >> 6;
784 unsigned char low = c & 0x3F;
786 if ((int) left < 0) break;
787 *pos++ = high | 0xC0;
793 char mid = (c >> 6) & 0x3F;
796 if ((int) left < 0) break;
797 *pos++ = high | 0xE0;
803 /* insert utf-string into symbol-table */
804 result = utf_new(buffer,buflength);
806 MFREE(buffer, char, buflength);
812 /* utf_new_char ****************************************************************
814 Creates a new utf symbol, the text for this symbol is passed as a
815 c-string ( = char* ).
817 *******************************************************************************/
819 utf *utf_new_char(const char *text)
821 return utf_new(text, strlen(text));
825 /* utf_new_char_classname ******************************************************
827 Creates a new utf symbol, the text for this symbol is passed as a
828 c-string ( = char* ) "." characters are going to be replaced by
829 "/". Since the above function is used often, this is a separte
830 function, instead of an if.
832 *******************************************************************************/
834 utf *utf_new_char_classname(const char *text)
836 if (strchr(text, '.')) {
837 char *txt = strdup(text);
838 char *end = txt + strlen(txt);
842 for (c = txt; c < end; c++)
843 if (*c == '.') *c = '/';
845 tmpRes = utf_new(txt, strlen(txt));
851 return utf_new(text, strlen(text));
855 /* utf_nextu2 ******************************************************************
857 Read the next unicode character from the utf string and increment
858 the utf-string pointer accordingly.
860 CAUTION: This function is unsafe for input that was not checked
863 *******************************************************************************/
865 u2 utf_nextu2(char **utf_ptr)
867 /* uncompressed unicode character */
869 /* current position in utf text */
870 unsigned char *utf = (unsigned char *) (*utf_ptr);
871 /* bytes representing the unicode character */
872 unsigned char ch1, ch2, ch3;
873 /* number of bytes used to represent the unicode character */
876 switch ((ch1 = utf[0]) >> 4) {
877 default: /* 1 byte */
881 case 0xD: /* 2 bytes */
882 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
883 unsigned char high = ch1 & 0x1F;
884 unsigned char low = ch2 & 0x3F;
885 unicode_char = (high << 6) + low;
890 case 0xE: /* 2 or 3 bytes */
891 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
892 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
893 unsigned char low = ch3 & 0x3f;
894 unsigned char mid = ch2 & 0x3f;
895 unsigned char high = ch1 & 0x0f;
896 unicode_char = (((high << 6) + mid) << 6) + low;
904 /* update position in utf-text */
905 *utf_ptr = (char *) (utf + len);
911 /* utf_bytes *******************************************************************
913 Determine number of bytes (aka. octets) in the utf string.
916 u............utf string
919 The number of octets of this utf string.
920 There is _no_ terminating zero included in this count.
922 *******************************************************************************/
930 /* utf_get_number_of_u2s_for_buffer ********************************************
932 Determine number of UTF-16 u2s in the given UTF-8 buffer
934 CAUTION: This function is unsafe for input that was not checked
937 CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
938 to an array of u2s (UTF-16) and want to know how many of them you will get.
939 All other uses of this function are probably wrong.
942 buffer........points to first char in buffer
943 blength.......number of _bytes_ in the buffer
946 the number of u2s needed to hold this string in UTF-16 encoding.
947 There is _no_ terminating zero included in this count.
949 NOTE: Unlike utf_get_number_of_u2s, this function never throws an exception.
952 *******************************************************************************/
954 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
956 const char *endpos; /* points behind utf string */
957 const char *utf_ptr; /* current position in utf text */
958 u4 len = 0; /* number of unicode characters */
961 endpos = utf_ptr + blength;
963 while (utf_ptr < endpos) {
965 /* next unicode character */
966 utf_nextu2((char **)&utf_ptr);
969 assert(utf_ptr == endpos);
975 /* utf_get_number_of_u2s *******************************************************
977 Determine number of UTF-16 u2s in the utf string.
979 CAUTION: This function is unsafe for input that was not checked
982 CAUTION: Use this function *only* when you want to convert a utf string
983 to an array of u2s and want to know how many of them you will get.
984 All other uses of this function are probably wrong.
987 u............utf string
990 the number of u2s needed to hold this string in UTF-16 encoding.
991 There is _no_ terminating zero included in this count.
992 XXX 0 if a NullPointerException has been thrown (see below)
994 *******************************************************************************/
996 u4 utf_get_number_of_u2s(utf *u)
998 char *endpos; /* points behind utf string */
999 char *utf_ptr; /* current position in utf text */
1000 u4 len = 0; /* number of unicode characters */
1002 /* XXX this is probably not checked by most callers! Review this after */
1003 /* the invalid uses of this function have been eliminated */
1005 exceptions_throw_nullpointerexception();
1009 endpos = UTF_END(u);
1012 while (utf_ptr < endpos) {
1014 /* next unicode character */
1015 utf_nextu2(&utf_ptr);
1018 if (utf_ptr != endpos) {
1019 /* string ended abruptly */
1020 exceptions_throw_internalerror("Illegal utf8 string");
1028 /* utf8_safe_number_of_u2s *****************************************************
1030 Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1031 (For invalid UTF-8 the U+fffd replacement character will be counted.)
1033 This function is safe even for invalid UTF-8 strings.
1036 text..........zero-terminated(!) UTF-8 string (may be invalid)
1038 nbytes........strlen(text). (This is needed to completely emulate
1042 the number of u2s needed to hold this string in UTF-16 encoding.
1043 There is _no_ terminating zero included in this count.
1045 *******************************************************************************/
1047 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1048 register const unsigned char *t;
1051 register const unsigned char *tlimit;
1059 assert(nbytes >= 0);
1062 t = (const unsigned char *) text;
1063 tlimit = t + nbytes;
1065 /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1071 /* highest bit set, non-ASCII character */
1073 if ((byte & 0xe0) == 0xc0) {
1074 /* 2-byte: should be 110..... 10...... ? */
1076 if ((*t++ & 0xc0) == 0x80)
1077 ; /* valid 2-byte */
1081 else if ((byte & 0xf0) == 0xe0) {
1082 /* 3-byte: should be 1110.... 10...... 10...... */
1086 return len + 1; /* invalid, stop here */
1088 if ((*t++ & 0xc0) == 0x80) {
1089 if ((*t++ & 0xc0) == 0x80)
1090 ; /* valid 3-byte */
1097 else if ((byte & 0xf8) == 0xf0) {
1098 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1102 return len + 1; /* invalid, stop here */
1104 if (((byte1 = *t++) & 0xc0) == 0x80) {
1105 if (((byte2 = *t++) & 0xc0) == 0x80) {
1106 if (((byte3 = *t++) & 0xc0) == 0x80) {
1107 /* valid 4-byte UTF-8? */
1108 value = ((byte & 0x07) << 18)
1109 | ((byte1 & 0x3f) << 12)
1110 | ((byte2 & 0x3f) << 6)
1111 | ((byte3 & 0x3f) );
1113 if (value > 0x10FFFF)
1115 else if (value > 0xFFFF)
1116 len += 1; /* we need surrogates */
1118 ; /* 16bit suffice */
1129 else if ((byte & 0xfc) == 0xf8) {
1130 /* invalid 5-byte */
1132 return len + 1; /* invalid, stop here */
1135 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1138 else if ((byte & 0xfe) == 0xfc) {
1139 /* invalid 6-byte */
1141 return len + 1; /* invalid, stop here */
1144 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1156 /* ASCII character, common case */
1166 /* utf8_safe_convert_to_u2s ****************************************************
1168 Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1169 (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1170 Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1172 This function is safe even for invalid UTF-8 strings.
1175 text..........zero-terminated(!) UTF-8 string (may be invalid)
1177 nbytes........strlen(text). (This is needed to completely emulate
1179 buffer........a preallocated array of u2s to receive the decoded
1180 string. Use utf8_safe_number_of_u2s to get the
1181 required number of u2s for allocating this.
1183 *******************************************************************************/
1185 #define UNICODE_REPLACEMENT 0xfffd
1187 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1188 register const unsigned char *t;
1190 register const unsigned char *tlimit;
1198 assert(nbytes >= 0);
1200 t = (const unsigned char *) text;
1201 tlimit = t + nbytes;
1203 /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1209 /* highest bit set, non-ASCII character */
1211 if ((byte & 0xe0) == 0xc0) {
1212 /* 2-byte: should be 110..... 10...... */
1214 if (((byte1 = *t++) & 0xc0) == 0x80) {
1215 /* valid 2-byte UTF-8 */
1216 *buffer++ = ((byte & 0x1f) << 6)
1217 | ((byte1 & 0x3f) );
1220 *buffer++ = UNICODE_REPLACEMENT;
1224 else if ((byte & 0xf0) == 0xe0) {
1225 /* 3-byte: should be 1110.... 10...... 10...... */
1227 if (t + 2 > tlimit) {
1228 *buffer++ = UNICODE_REPLACEMENT;
1232 if (((byte1 = *t++) & 0xc0) == 0x80) {
1233 if (((byte2 = *t++) & 0xc0) == 0x80) {
1234 /* valid 3-byte UTF-8 */
1235 *buffer++ = ((byte & 0x0f) << 12)
1236 | ((byte1 & 0x3f) << 6)
1237 | ((byte2 & 0x3f) );
1240 *buffer++ = UNICODE_REPLACEMENT;
1245 *buffer++ = UNICODE_REPLACEMENT;
1249 else if ((byte & 0xf8) == 0xf0) {
1250 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1252 if (t + 3 > tlimit) {
1253 *buffer++ = UNICODE_REPLACEMENT;
1257 if (((byte1 = *t++) & 0xc0) == 0x80) {
1258 if (((byte2 = *t++) & 0xc0) == 0x80) {
1259 if (((byte3 = *t++) & 0xc0) == 0x80) {
1260 /* valid 4-byte UTF-8? */
1261 value = ((byte & 0x07) << 18)
1262 | ((byte1 & 0x3f) << 12)
1263 | ((byte2 & 0x3f) << 6)
1264 | ((byte3 & 0x3f) );
1266 if (value > 0x10FFFF) {
1267 *buffer++ = UNICODE_REPLACEMENT;
1269 else if (value > 0xFFFF) {
1270 /* we need surrogates */
1271 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1272 *buffer++ = 0xdc00 | (value & 0x03ff);
1275 *buffer++ = value; /* 16bit suffice */
1278 *buffer++ = UNICODE_REPLACEMENT;
1283 *buffer++ = UNICODE_REPLACEMENT;
1288 *buffer++ = UNICODE_REPLACEMENT;
1292 else if ((byte & 0xfc) == 0xf8) {
1293 if (t + 4 > tlimit) {
1294 *buffer++ = UNICODE_REPLACEMENT;
1299 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1301 *buffer++ = UNICODE_REPLACEMENT;
1303 else if ((byte & 0xfe) == 0xfc) {
1304 if (t + 5 > tlimit) {
1305 *buffer++ = UNICODE_REPLACEMENT;
1310 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1312 *buffer++ = UNICODE_REPLACEMENT;
1315 *buffer++ = UNICODE_REPLACEMENT;
1323 /* ASCII character, common case */
1331 /* u2_utflength ****************************************************************
1333 Returns the utf length in bytes of a u2 array.

   IN:
       text.........array of u2 unicode characters to measure
       u2_length....number of u2 elements in text

   RETURN VALUE:
       the utf length in bytes (result_len is the accumulator declared
       for it)

   NOTE: a zero character fails the `ch && (ch < 0x80)' test below and
   is therefore handled by the `ch < 0x800' branch, not by the
   single-byte branch.

1335 *******************************************************************************/
1337 u4 u2_utflength(u2 *text, u4 u2_length)
1339 u4 result_len = 0; /* utf length in bytes */
1340 u2 ch; /* current unicode character */
/* visit every element of the input array once */
1343 for (len = 0; len < u2_length; len++) {
1344 /* next unicode character */
1347 /* determine bytes required to store unicode character as utf */
/* non-zero values below 0x80 form the first size class */
1348 if (ch && (ch < 0x80))
/* zero and all other values below 0x800 form the second size class */
1350 else if (ch < 0x800)
1360 /* utf_copy ********************************************************************
1362 Copy the given utf string byte-for-byte to a buffer.
1365 buffer.......the buffer
1366 u............the utf string
1368 *******************************************************************************/
1370 void utf_copy(char *buffer, utf *u)
1372 /* our utf strings are zero-terminated (done by utf_new) */
1373 MCOPY(buffer, u->text, char, u->blength + 1);
1377 /* utf_cat *********************************************************************
1379 Append the given utf string byte-for-byte to a buffer.
1382 buffer.......the buffer
1383 u............the utf string
1385 *******************************************************************************/
1387 void utf_cat(char *buffer, utf *u)
1389 /* our utf strings are zero-terminated (done by utf_new) */
1390 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1394 /* utf_copy_classname **********************************************************
1396 Copy the given utf classname byte-for-byte to a buffer.
1397 '/' is replaced by '.'

   IN:
1400 buffer.......the buffer
1401 u............the utf string

   NOTE(review): the loop bound is UTF_END(u) + 1, i.e. one char past
   the end of the text, so presumably the terminating zero byte is
   copied as well -- confirm against the full loop body.

1403 *******************************************************************************/
1405 void utf_copy_classname(char *buffer, utf *u)
1414 endptr = UTF_END(u) + 1; /* utfs are zero-terminated by utf_new */
/* runs until the source pointer reaches the position behind the terminator */
1416 while (srcptr != endptr) {
1425 /* utf_cat *********************************************************************
1427 Append the given utf classname byte-for-byte to a buffer.
1428 '/' is replaced by '.'
1431 buffer.......the buffer
1432 u............the utf string
1434 *******************************************************************************/
1436 void utf_cat_classname(char *buffer, utf *u)
1438 utf_copy_classname(buffer + strlen(buffer), u);
1441 /* utf_display_printable_ascii *************************************************
1443 Write utf symbol to stdout (for debugging purposes).
1444 Non-printable and non-ASCII characters are printed as '?'.

   IN:
       u............the utf string to display

   NOTE(review): the range test below accepts 32..127 inclusive, so
   character 127 (DEL) is treated like a printable character although
   it is a control character -- confirm whether `< 127' was intended.

1446 *******************************************************************************/
1448 void utf_display_printable_ascii(utf *u)
1450 char *endpos; /* points behind utf string */
1451 char *utf_ptr; /* current position in utf text */
1459 endpos = UTF_END(u);
1462 while (utf_ptr < endpos) {
1463 /* read next unicode character */
/* utf_nextu2 receives the address of utf_ptr, presumably so that it can advance it */
1465 u2 c = utf_nextu2(&utf_ptr);
/* characters in the range 32..127 take the first branch */
1467 if ((c >= 32) && (c <= 127))
1477 /* utf_display_printable_ascii_classname ***************************************
1479 Write utf symbol to stdout with `/' converted to `.' (for debugging
   purposes).
1481 Non-printable and non-ASCII characters are printed as '?'.

   IN:
       u............the utf classname to display

   NOTE(review): the range test below accepts 32..127 inclusive, so
   character 127 (DEL) is treated like a printable character although
   it is a control character -- confirm whether `< 127' was intended.

1483 *******************************************************************************/
1485 void utf_display_printable_ascii_classname(utf *u)
1487 char *endpos; /* points behind utf string */
1488 char *utf_ptr; /* current position in utf text */
1496 endpos = UTF_END(u);
1499 while (utf_ptr < endpos) {
1500 /* read next unicode character */
/* utf_nextu2 receives the address of utf_ptr, presumably so that it can advance it */
1502 u2 c = utf_nextu2(&utf_ptr);
/* characters in the range 32..127 take the first branch */
1507 if ((c >= 32) && (c <= 127))
1517 /* utf_sprint_convert_to_latin1 ************************************************
1519 Write utf symbol into c-string (for debugging purposes).
1520 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield

   IN:
       buffer.......destination c-string; receives the literal "NULL"
                    on the strcpy path below
       u............the utf string to convert

   NOTE(review): the sentence above is cut off in this view.  The
   visible loop stores the u2 returned by utf_nextu2 directly into a
   char element, i.e. the value is narrowed; no substitution of
   out-of-range characters is visible here.

   NOTE(review): the write position is a u2 and no buffer size is
   passed in, so the caller has to provide a large enough buffer.

1523 *******************************************************************************/
1525 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1527 char *endpos; /* points behind utf string */
1528 char *utf_ptr; /* current position in utf text */
1529 u2 pos = 0; /* position in c-string */
/* fallback text; presumably used when u is NULL -- the guarding test is not visible here */
1532 strcpy(buffer, "NULL");
1536 endpos = UTF_END(u);
1539 while (utf_ptr < endpos)
1540 /* copy next unicode character */
1541 buffer[pos++] = utf_nextu2(&utf_ptr);
1543 /* terminate string */
1548 /* utf_sprint_convert_to_latin1_classname **************************************
1550 Write utf symbol into c-string with `/' converted to `.' (for debugging
   purposes).
1552 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield

   IN:
       buffer.......destination c-string; receives the literal "NULL"
                    on the strcpy path below
       u............the utf classname to convert

   NOTE(review): the sentence above is cut off in this view, and the
   statement that stores the character into the buffer is not visible
   either.

   NOTE(review): the write position is a u2 and no buffer size is
   passed in, so the caller has to provide a large enough buffer.

1555 *******************************************************************************/
1557 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1559 char *endpos; /* points behind utf string */
1560 char *utf_ptr; /* current position in utf text */
1561 u2 pos = 0; /* position in c-string */
/* fallback text; presumably used when u is NULL -- the guarding test is not visible here */
1564 strcpy(buffer, "NULL");
1568 endpos = UTF_END(u);
1571 while (utf_ptr < endpos) {
1572 /* copy next unicode character */
1573 u2 c = utf_nextu2(&utf_ptr);
/* package separator of the internal classname becomes a dot */
1574 if (c == '/') c = '.';
1578 /* terminate string */
1583 /* utf_strcat_convert_to_latin1 ************************************************
1585 Like libc strcat, but uses an utf8 string.
1586 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1589 *******************************************************************************/
1591 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1593 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1597 /* utf_strcat_convert_to_latin1_classname **************************************
1599 Like libc strcat, but uses an utf8 string.
1600 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1603 *******************************************************************************/
1605 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1607 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1611 /* utf_fprint_printable_ascii **************************************************
1613 Write utf symbol into file.
1614 Non-printable and non-ASCII characters are printed as '?'.

   IN:
       file.........stream the characters are written to with fprintf
       u............the utf string to print

   NOTE(review): the range test below accepts 32..127 inclusive, so
   character 127 (DEL) is written unchanged although it is a control
   character -- confirm whether `< 127' was intended.

1616 *******************************************************************************/
1618 void utf_fprint_printable_ascii(FILE *file, utf *u)
1620 char *endpos; /* points behind utf string */
1621 char *utf_ptr; /* current position in utf text */
1626 endpos = UTF_END(u);
1629 while (utf_ptr < endpos) {
1630 /* read next unicode character */
1631 u2 c = utf_nextu2(&utf_ptr);
/* characters in 32..127 are written as they are, everything else as '?' */
1633 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1634 else fprintf(file, "?");
1639 /* utf_fprint_printable_ascii_classname ****************************************
1641 Write utf symbol into file with `/' converted to `.'.
1642 Non-printable and non-ASCII characters are printed as '?'.

   IN:
       file.........stream the characters are written to with fprintf
       u............the utf classname to print

   NOTE(review): the range test below accepts 32..127 inclusive, so
   character 127 (DEL) is written unchanged although it is a control
   character -- confirm whether `< 127' was intended.

1644 *******************************************************************************/
1646 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1648 char *endpos; /* points behind utf string */
1649 char *utf_ptr; /* current position in utf text */
1654 endpos = UTF_END(u);
1657 while (utf_ptr < endpos) {
1658 /* read next unicode character */
1659 u2 c = utf_nextu2(&utf_ptr);
/* package separator of the internal classname becomes a dot */
1660 if (c == '/') c = '.';
/* characters in 32..127 are written as they are, everything else as '?' */
1662 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1663 else fprintf(file, "?");
1668 /* is_valid_utf ****************************************************************
1670 Return true if the given string is a valid UTF-8 string.

   The checks visible below reject: a reversed range, a 0x00 byte, an
   invalid leading byte, any sequence with more than two continuation
   bytes, a sequence that runs past end_pos, a continuation byte that
   is not of the form 10xx xxxx, and an encoded zero that does not use
   exactly one continuation byte.  Overlong encodings, surrogates and
   the codepoints 0xfffe/0xffff are deliberately not rejected (see the
   disabled checks at the end of the loop).

1672 utf_ptr...points to first character
1673 end_pos...points after last character
1675 *******************************************************************************/
1677 /* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */
1679 bool is_valid_utf(char *utf_ptr, char *end_pos)
1686 if (end_pos < utf_ptr) return false;
1687 bytes = end_pos - utf_ptr;
1691 if (!c) return false; /* 0x00 is not allowed */
1692 if ((c & 0x80) == 0) continue; /* ASCII */
/* len is the number of continuation bytes announced by the leading byte */
1694 if ((c & 0xe0) == 0xc0) len = 1; /* 110x xxxx */
1695 else if ((c & 0xf0) == 0xe0) len = 2; /* 1110 xxxx */
1696 else if ((c & 0xf8) == 0xf0) len = 3; /* 1111 0xxx */
1697 else if ((c & 0xfc) == 0xf8) len = 4; /* 1111 10xx */
1698 else if ((c & 0xfe) == 0xfc) len = 5; /* 1111 110x */
1699 else return false; /* invalid leading byte */
1701 if (len > 2) return false; /* Java limitation */
/* keep only the payload bits of the leading byte */
1703 v = (unsigned long)c & (0x3f >> len);
/* NOTE(review): this test relies on bytes being a signed type -- its declaration is not visible here */
1705 if ((bytes -= len) < 0) return false; /* missing bytes */
/* consume the continuation bytes, appending six payload bits each */
1707 for (i = len; i--; ) {
1709 if ((c & 0xc0) != 0x80) /* 10xx xxxx */
1711 v = (v << 6) | (c & 0x3f);
1715 if (len != 1) return false; /* Java special */
1718 /* Sun Java seems to allow overlong UTF-8 encodings */
1720 /* if (v < min_codepoint[len]) */
1721 /* XXX throw exception? */
1724 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
1725 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
1727 /* even these seem to be allowed */
1728 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name.  Currently this only disallows empty strings, control
   characters (bytes below 0x20) and the two-byte encoding of zero
   (0xc0 0x80).

   NOTE: The string is assumed to have passed is_valid_utf!

   IN:
       utf_ptr...points to first character
       end_pos...points after last character

   RETURN VALUE:
       true......the name is acceptable
       false.....the range is empty or contains a disallowed byte

   No byte at or behind end_pos is ever read.

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters */

		/* 0xc0 0x80 encodes zero; the second byte is only inspected
		   while it still lies inside the given range, so a trailing
		   0xc0 never causes a read at end_pos */

		if (c == 0xc0 && utf_ptr < end_pos
			&& (unsigned char) *utf_ptr == 0x80)
			return false;               /* disallow zero */
	}

	return true;
}
1763 bool is_valid_name_utf(utf *u)
1765 return is_valid_name(u->text, UTF_END(u));
1769 /* utf_show ********************************************************************
1771 Writes the utf symbols in the utfhash to stdout and displays the
1772 number of external hash chains grouped according to the chainlength
1773 (for debugging purposes).

   Only compiled when NDEBUG is not defined.  The output consists of
   the slot listing, a slot/entry summary, a histogram of chain
   lengths (one row per length below CHAIN_LIMIT-1 plus one collective
   row), and the maximum and average chain length.

1775 *******************************************************************************/
1777 #if !defined(NDEBUG)
1781 #define CHAIN_LIMIT 20 /* limit for separated enumeration */
1783 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1784 u4 max_chainlength = 0; /* maximum length of the chains */
1785 u4 sum_chainlength = 0; /* sum of the chainlengths */
1786 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1789 printf("UTF-HASH:\n");
1791 /* show element of utf-hashtable */
1793 for (i = 0; i < hashtable_utf->size; i++) {
1794 utf *u = hashtable_utf->ptr[i];
1797 printf("SLOT %d: ", (int) i);
1801 utf_display_printable_ascii(u);
1809 printf("UTF-HASH: %d slots for %d entries\n",
1810 (int) hashtable_utf->size, (int) hashtable_utf->entries );
/* an empty table is special-cased; the percentages below divide by the entry count */
1812 if (hashtable_utf->entries == 0)
1815 printf("chains:\n chainlength number of chains %% of utfstrings\n");
1817 for (i=0;i<CHAIN_LIMIT;i++)
1820 /* count numbers of hashchains according to their length */
1821 for (i=0; i<hashtable_utf->size; i++) {
1823 utf *u = (utf*) hashtable_utf->ptr[i];
1824 u4 chain_length = 0;
1826 /* determine chainlength */
1832 /* update sum of all chainlengths */
1833 sum_chainlength+=chain_length;
1835 /* determine the maximum length of the chains */
1836 if (chain_length>max_chainlength)
1837 max_chainlength = chain_length;
1839 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
/* NOTE(review): the test uses >=CHAIN_LIMIT although the comments say
   >=CHAIN_LIMIT-1.  Chains of length exactly CHAIN_LIMIT-1 are counted
   in chain_count[CHAIN_LIMIT-1] but not added to beyond_limit, so the
   percentage printed for the collective row leaves them out --
   confirm which bound is intended. */
1840 if (chain_length>=CHAIN_LIMIT) {
1841 beyond_limit+=chain_length;
1842 chain_length=CHAIN_LIMIT-1;
1845 /* update number of hashchains of current length */
1846 chain_count[chain_length]++;
1849 /* display results */
/* one row per chain length; chain_count[i]*i is the number of symbols in chains of that length */
1850 for (i=1;i<CHAIN_LIMIT-1;i++)
1851 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
/* collective row for the last bucket */
1853 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1856 printf("max. chainlength:%5d\n",max_chainlength);
1858 /* avg. chainlength = sum of chainlengths / number of chains */
/* chain_count[0] is the number of slots whose chain length is zero, so they are excluded from the divisor */
1859 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1861 #endif /* !defined(NDEBUG) */
1865 * These are local overrides for various environment variables in Emacs.
1866 * Please do not remove this and leave it at the end of the file, where
1867 * Emacs will automagically detect them.
1868 * ---------------------------------------------------------------------
1871 * indent-tabs-mode: t
1875 * vim:noexpandtab:sw=4:ts=4: