1 /* src/vmcore/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006, 2007, 2008
4 CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
6 This file is part of CACAO.
8 This program is free software; you can redistribute it and/or
9 modify it under the terms of the GNU General Public License as
10 published by the Free Software Foundation; either version 2, or (at
11 your option) any later version.
13 This program is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include "mm/memory.h"
35 #include "threads/lock-common.h"
37 #include "toolbox/hashtable.h"
39 #include "vm/exceptions.h"
41 #include "vmcore/options.h"
43 #if defined(ENABLE_STATISTICS)
44 # include "vmcore/statistics.h"
47 #include "vmcore/utf8.h"
50 /* global variables ***********************************************************/
52 /* hashsize must be a power of 2 */
54 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
56 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
59 /* utf-symbols for pointer comparison of frequently used strings **************/
61 utf *utf_java_lang_Object;
63 utf *utf_java_lang_Class;
64 utf *utf_java_lang_ClassLoader;
65 utf *utf_java_lang_Cloneable;
66 utf *utf_java_lang_SecurityManager;
67 utf *utf_java_lang_String;
68 utf *utf_java_lang_ThreadGroup;
69 utf *utf_java_lang_ref_SoftReference;
70 utf *utf_java_lang_ref_WeakReference;
71 utf *utf_java_lang_ref_PhantomReference;
72 utf *utf_java_io_Serializable;
74 utf *utf_java_lang_Throwable;
75 utf *utf_java_lang_Error;
77 utf *utf_java_lang_AbstractMethodError;
78 utf *utf_java_lang_ClassCircularityError;
79 utf *utf_java_lang_ClassFormatError;
80 utf *utf_java_lang_ExceptionInInitializerError;
81 utf *utf_java_lang_IncompatibleClassChangeError;
82 utf *utf_java_lang_InstantiationError;
83 utf *utf_java_lang_InternalError;
84 utf *utf_java_lang_LinkageError;
85 utf *utf_java_lang_NoClassDefFoundError;
86 utf *utf_java_lang_NoSuchFieldError;
87 utf *utf_java_lang_NoSuchMethodError;
88 utf *utf_java_lang_OutOfMemoryError;
89 utf *utf_java_lang_UnsatisfiedLinkError;
90 utf *utf_java_lang_UnsupportedClassVersionError;
91 utf *utf_java_lang_VerifyError;
92 utf *utf_java_lang_VirtualMachineError;
94 utf *utf_java_lang_Exception;
96 utf *utf_java_lang_ArithmeticException;
97 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
98 utf *utf_java_lang_ArrayStoreException;
99 utf *utf_java_lang_ClassCastException;
100 utf *utf_java_lang_ClassNotFoundException;
101 utf *utf_java_lang_CloneNotSupportedException;
102 utf *utf_java_lang_IllegalAccessException;
103 utf *utf_java_lang_IllegalArgumentException;
104 utf *utf_java_lang_IllegalMonitorStateException;
105 utf *utf_java_lang_InstantiationException;
106 utf *utf_java_lang_InterruptedException;
107 utf *utf_java_lang_NegativeArraySizeException;
108 utf *utf_java_lang_NullPointerException;
109 utf *utf_java_lang_RuntimeException;
110 utf *utf_java_lang_StringIndexOutOfBoundsException;
112 utf *utf_java_lang_reflect_InvocationTargetException;
114 utf *utf_java_security_PrivilegedActionException;
116 #if defined(ENABLE_JAVASE)
117 utf* utf_java_lang_Void;
120 utf* utf_java_lang_Boolean;
121 utf* utf_java_lang_Byte;
122 utf* utf_java_lang_Character;
123 utf* utf_java_lang_Short;
124 utf* utf_java_lang_Integer;
125 utf* utf_java_lang_Long;
126 utf* utf_java_lang_Float;
127 utf* utf_java_lang_Double;
129 #if defined(ENABLE_JAVASE)
130 utf *utf_java_lang_StackTraceElement;
131 utf *utf_java_lang_reflect_Constructor;
132 utf *utf_java_lang_reflect_Field;
133 utf *utf_java_lang_reflect_Method;
135 # if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
136 utf *utf_java_lang_reflect_VMConstructor;
137 utf *utf_java_lang_reflect_VMField;
138 utf *utf_java_lang_reflect_VMMethod;
141 utf *utf_java_util_Vector;
144 utf *utf_InnerClasses; /* InnerClasses */
145 utf *utf_ConstantValue; /* ConstantValue */
146 utf *utf_Code; /* Code */
147 utf *utf_Exceptions; /* Exceptions */
148 utf *utf_LineNumberTable; /* LineNumberTable */
149 utf *utf_SourceFile; /* SourceFile */
151 #if defined(ENABLE_JAVASE)
152 utf *utf_EnclosingMethod;
154 utf *utf_StackMapTable;
156 #if defined(ENABLE_ANNOTATIONS)
157 utf *utf_RuntimeVisibleAnnotations; /* RuntimeVisibleAnnotations */
158 utf *utf_RuntimeInvisibleAnnotations; /* RuntimeInvisibleAnnotations */
159 utf *utf_RuntimeVisibleParameterAnnotations; /* RuntimeVisibleParameterAnnotations */
160 utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
161 utf *utf_AnnotationDefault; /* AnnotationDefault */
165 utf *utf_init; /* <init> */
166 utf *utf_clinit; /* <clinit> */
167 utf *utf_clone; /* clone */
168 utf *utf_finalize; /* finalize */
171 utf *utf_run; /* run */
176 utf *utf_removeThread;
179 utf *utf_uncaughtException;
182 utf *utf_fillInStackTrace;
184 utf *utf_getSystemClassLoader;
187 utf *utf_loadClassInternal;
188 utf *utf_printStackTrace;
190 utf *utf_division_by_zero;
201 utf *utf_void__void; /* ()V */
202 utf *utf_boolean__void; /* (Z)V */
203 utf *utf_byte__void; /* (B)V */
204 utf *utf_char__void; /* (C)V */
205 utf *utf_short__void; /* (S)V */
206 utf *utf_int__void; /* (I)V */
207 utf *utf_long__void; /* (J)V */
208 utf *utf_float__void; /* (F)V */
209 utf *utf_double__void; /* (D)V */
211 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
212 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
213 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
214 utf *utf_java_lang_ClassLoader_java_lang_String__J;
215 utf *utf_java_lang_Exception__V; /* (Ljava/lang/Exception;)V */
216 utf *utf_java_lang_Object__java_lang_Object;
217 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
218 utf *utf_java_lang_String__java_lang_Class;
219 utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */
220 utf *utf_java_lang_Thread_java_lang_Throwable__V;
221 utf *utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V;
222 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
223 utf *utf_java_lang_Throwable__java_lang_Throwable;
225 utf *utf_not_named_yet; /* special name for unnamed classes */
227 utf *array_packagename;
230 /* utf_init ********************************************************************
232 Initializes the utf8 subsystem.
234 *******************************************************************************/
238 TRACESUBSYSTEMINITIALIZATION("utf8_init");
240 /* create utf8 hashtable */
242 hashtable_utf = NEW(hashtable);
244 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
246 #if defined(ENABLE_STATISTICS)
248 count_utf_len += sizeof(utf*) * hashtable_utf->size;
251 /* create utf-symbols for pointer comparison of frequently used strings */
253 utf_java_lang_Object = utf_new_char("java/lang/Object");
255 utf_java_lang_Class = utf_new_char("java/lang/Class");
256 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
257 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
258 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
259 utf_java_lang_String = utf_new_char("java/lang/String");
260 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
262 utf_java_lang_ref_SoftReference =
263 utf_new_char("java/lang/ref/SoftReference");
265 utf_java_lang_ref_WeakReference =
266 utf_new_char("java/lang/ref/WeakReference");
268 utf_java_lang_ref_PhantomReference =
269 utf_new_char("java/lang/ref/PhantomReference");
271 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
273 utf_java_lang_Throwable = utf_new_char("java/lang/Throwable");
274 utf_java_lang_Error = utf_new_char("java/lang/Error");
276 utf_java_lang_ClassCircularityError =
277 utf_new_char("java/lang/ClassCircularityError");
279 utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
281 utf_java_lang_ExceptionInInitializerError =
282 utf_new_char("java/lang/ExceptionInInitializerError");
284 utf_java_lang_IncompatibleClassChangeError =
285 utf_new_char("java/lang/IncompatibleClassChangeError");
287 utf_java_lang_InstantiationError =
288 utf_new_char("java/lang/InstantiationError");
290 utf_java_lang_InternalError = utf_new_char("java/lang/InternalError");
291 utf_java_lang_LinkageError = utf_new_char("java/lang/LinkageError");
293 utf_java_lang_NoClassDefFoundError =
294 utf_new_char("java/lang/NoClassDefFoundError");
296 utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
298 utf_java_lang_UnsatisfiedLinkError =
299 utf_new_char("java/lang/UnsatisfiedLinkError");
301 utf_java_lang_UnsupportedClassVersionError =
302 utf_new_char("java/lang/UnsupportedClassVersionError");
304 utf_java_lang_VerifyError = utf_new_char("java/lang/VerifyError");
306 utf_java_lang_VirtualMachineError =
307 utf_new_char("java/lang/VirtualMachineError");
309 #if defined(ENABLE_JAVASE)
310 utf_java_lang_AbstractMethodError =
311 utf_new_char("java/lang/AbstractMethodError");
313 utf_java_lang_NoSuchFieldError =
314 utf_new_char("java/lang/NoSuchFieldError");
316 utf_java_lang_NoSuchMethodError =
317 utf_new_char("java/lang/NoSuchMethodError");
320 utf_java_lang_Exception = utf_new_char("java/lang/Exception");
322 utf_java_lang_ArithmeticException =
323 utf_new_char("java/lang/ArithmeticException");
325 utf_java_lang_ArrayIndexOutOfBoundsException =
326 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
328 utf_java_lang_ArrayStoreException =
329 utf_new_char("java/lang/ArrayStoreException");
331 utf_java_lang_ClassCastException =
332 utf_new_char("java/lang/ClassCastException");
334 utf_java_lang_ClassNotFoundException =
335 utf_new_char("java/lang/ClassNotFoundException");
337 utf_java_lang_CloneNotSupportedException =
338 utf_new_char("java/lang/CloneNotSupportedException");
340 utf_java_lang_IllegalAccessException =
341 utf_new_char("java/lang/IllegalAccessException");
343 utf_java_lang_IllegalArgumentException =
344 utf_new_char("java/lang/IllegalArgumentException");
346 utf_java_lang_IllegalMonitorStateException =
347 utf_new_char("java/lang/IllegalMonitorStateException");
349 utf_java_lang_InstantiationException =
350 utf_new_char("java/lang/InstantiationException");
352 utf_java_lang_InterruptedException =
353 utf_new_char("java/lang/InterruptedException");
355 utf_java_lang_NegativeArraySizeException =
356 utf_new_char("java/lang/NegativeArraySizeException");
358 utf_java_lang_NullPointerException =
359 utf_new_char("java/lang/NullPointerException");
361 utf_java_lang_RuntimeException =
362 utf_new_char("java/lang/RuntimeException");
364 utf_java_lang_StringIndexOutOfBoundsException =
365 utf_new_char("java/lang/StringIndexOutOfBoundsException");
367 utf_java_lang_reflect_InvocationTargetException =
368 utf_new_char("java/lang/reflect/InvocationTargetException");
370 utf_java_security_PrivilegedActionException =
371 utf_new_char("java/security/PrivilegedActionException");
373 #if defined(ENABLE_JAVASE)
374 utf_java_lang_Void = utf_new_char("java/lang/Void");
377 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
378 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
379 utf_java_lang_Character = utf_new_char("java/lang/Character");
380 utf_java_lang_Short = utf_new_char("java/lang/Short");
381 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
382 utf_java_lang_Long = utf_new_char("java/lang/Long");
383 utf_java_lang_Float = utf_new_char("java/lang/Float");
384 utf_java_lang_Double = utf_new_char("java/lang/Double");
386 #if defined(ENABLE_JAVASE)
387 utf_java_lang_StackTraceElement =
388 utf_new_char("java/lang/StackTraceElement");
390 utf_java_lang_reflect_Constructor =
391 utf_new_char("java/lang/reflect/Constructor");
393 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
394 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
396 # if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
397 utf_java_lang_reflect_VMConstructor = utf_new_char("java/lang/reflect/VMConstructor");
398 utf_java_lang_reflect_VMField = utf_new_char("java/lang/reflect/VMField");
399 utf_java_lang_reflect_VMMethod = utf_new_char("java/lang/reflect/VMMethod");
402 utf_java_util_Vector = utf_new_char("java/util/Vector");
405 utf_InnerClasses = utf_new_char("InnerClasses");
406 utf_ConstantValue = utf_new_char("ConstantValue");
407 utf_Code = utf_new_char("Code");
408 utf_Exceptions = utf_new_char("Exceptions");
409 utf_LineNumberTable = utf_new_char("LineNumberTable");
410 utf_SourceFile = utf_new_char("SourceFile");
412 #if defined(ENABLE_JAVASE)
413 utf_EnclosingMethod = utf_new_char("EnclosingMethod");
414 utf_Signature = utf_new_char("Signature");
415 utf_StackMapTable = utf_new_char("StackMapTable");
417 # if defined(ENABLE_ANNOTATIONS)
418 utf_RuntimeVisibleAnnotations = utf_new_char("RuntimeVisibleAnnotations");
419 utf_RuntimeInvisibleAnnotations = utf_new_char("RuntimeInvisibleAnnotations");
420 utf_RuntimeVisibleParameterAnnotations = utf_new_char("RuntimeVisibleParameterAnnotations");
421 utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
422 utf_AnnotationDefault = utf_new_char("AnnotationDefault");
426 utf_init = utf_new_char("<init>");
427 utf_clinit = utf_new_char("<clinit>");
428 utf_clone = utf_new_char("clone");
429 utf_finalize = utf_new_char("finalize");
430 utf_invoke = utf_new_char("invoke");
431 utf_main = utf_new_char("main");
432 utf_run = utf_new_char("run");
434 utf_add = utf_new_char("add");
435 utf_remove = utf_new_char("remove");
436 utf_addThread = utf_new_char("addThread");
437 utf_removeThread = utf_new_char("removeThread");
438 utf_put = utf_new_char("put");
439 utf_get = utf_new_char("get");
440 utf_uncaughtException = utf_new_char("uncaughtException");
441 utf_value = utf_new_char("value");
443 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
444 utf_findNative = utf_new_char("findNative");
445 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
446 utf_initCause = utf_new_char("initCause");
447 utf_loadClass = utf_new_char("loadClass");
448 utf_loadClassInternal = utf_new_char("loadClassInternal");
449 utf_printStackTrace = utf_new_char("printStackTrace");
451 utf_division_by_zero = utf_new_char("/ by zero");
453 utf_Z = utf_new_char("Z");
454 utf_B = utf_new_char("B");
455 utf_C = utf_new_char("C");
456 utf_S = utf_new_char("S");
457 utf_I = utf_new_char("I");
458 utf_J = utf_new_char("J");
459 utf_F = utf_new_char("F");
460 utf_D = utf_new_char("D");
462 utf_void__void = utf_new_char("()V");
463 utf_boolean__void = utf_new_char("(Z)V");
464 utf_byte__void = utf_new_char("(B)V");
465 utf_char__void = utf_new_char("(C)V");
466 utf_short__void = utf_new_char("(S)V");
467 utf_int__void = utf_new_char("(I)V");
468 utf_long__void = utf_new_char("(J)V");
469 utf_float__void = utf_new_char("(F)V");
470 utf_double__void = utf_new_char("(D)V");
471 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
472 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
474 utf_void__java_lang_ClassLoader =
475 utf_new_char("()Ljava/lang/ClassLoader;");
477 utf_java_lang_ClassLoader_java_lang_String__J =
478 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
480 utf_java_lang_Exception__V = utf_new_char("(Ljava/lang/Exception;)V");
482 utf_java_lang_Object__java_lang_Object =
483 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
485 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
487 utf_java_lang_String__java_lang_Class =
488 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
490 utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V");
492 utf_java_lang_Thread_java_lang_Throwable__V =
493 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
495 utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V =
496 utf_new_char("(Ljava/lang/ThreadGroup;Ljava/lang/String;)V");
498 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
500 utf_java_lang_Throwable__java_lang_Throwable =
501 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
503 utf_null = utf_new_char("null");
504 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
505 array_packagename = utf_new_char("\t<the array package>");
509 /* utf_hashkey *****************************************************************
511 The hashkey is computed from the utf-text by using up to 8
512 characters. For utf-symbols longer than 15 characters 3 characters
513 are taken from the beginning and the end, 2 characters are taken
516 *******************************************************************************/
/* Byte-fetch helpers used by utf_hashkey. Both expand to an expression that
   dereferences a pointer named `text`, which must be in scope where the
   macro is used; nbs() additionally pre-increments `text` as a side effect.
   `val` is substituted without parentheses, so pass a simple shift count.
   NOTE(review): utf_hashkey combines several of these in one expression,
   e.g. fbs(0) ^ nbs(3) ^ nbs(5), which changes `text` more than once with no
   sequence point in between -- confirm the evaluation order is not relied
   on (see C11 6.5p2).
   NOTE(review): `text` is a `const char *` in utf_hashkey; if plain char is
   signed, bytes >= 0x80 would be sign-extended by the (u4) cast -- verify. */
518 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
519 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
521 u4 utf_hashkey(const char *text, u4 length)
523 const char *start_pos = text; /* pointer to utf text */
527 case 0: /* empty string */
530 case 1: return fbs(0);
531 case 2: return fbs(0) ^ nbs(3);
532 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
533 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
534 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
535 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
536 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
537 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
544 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
553 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
562 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
574 return a ^ nbs(9) ^ nbs(10);
586 return a ^ nbs(9) ^ nbs(10);
597 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
608 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
610 default: /* 3 characters from beginning */
616 /* 2 characters from middle */
617 text = start_pos + (length / 2);
622 /* 3 characters from end */
623 text = start_pos + length - 4;
628 return a ^ nbs(10) ^ nbs(11);
632 /* utf_full_hashkey ************************************************************
634 This function computes a hash value using all bytes in the string.
636 The algorithm is the "One-at-a-time" algorithm as published
637 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
639 *******************************************************************************/
641 u4 utf_full_hashkey(const char *text, u4 length)
643 register const unsigned char *p = (const unsigned char *) text;
651 hash += (hash << 10);
655 hash ^= (hash >> 11);
656 hash += (hash << 15);
661 /* unicode_hashkey *************************************************************
663 Compute the hashkey of a unicode string.
665 *******************************************************************************/
667 u4 unicode_hashkey(u2 *text, u2 len)
669 return utf_hashkey((char *) text, len);
673 /* utf_new *********************************************************************
675 Creates a new utf-symbol, the text of the symbol is passed as a
676 u1-array. The function searches the utf-hashtable for a utf-symbol
677 with this text. On success the element is returned, otherwise a new
678 hashtable element is created.
680 If the number of entries in the hashtable exceeds twice the size of
681 the hashtable slots a reorganization of the hashtable is done and
682 the utf symbols are copied to a new hashtable with doubled size.
684 *******************************************************************************/
686 utf *utf_new(const char *text, u2 length)
688 u4 key; /* hashkey computed from utf-text */
689 u4 slot; /* slot in hashtable */
690 utf *u; /* hashtable element */
693 LOCK_MONITOR_ENTER(hashtable_utf->header);
695 #if defined(ENABLE_STATISTICS)
700 key = utf_hashkey(text, length);
701 slot = key & (hashtable_utf->size - 1);
702 u = hashtable_utf->ptr[slot];
704 /* search external hash chain for utf-symbol */
707 if (u->blength == length) {
708 /* compare text of hashtable elements */
710 for (i = 0; i < length; i++)
711 if (text[i] != u->text[i])
714 #if defined(ENABLE_STATISTICS)
716 count_utf_new_found++;
719 /* symbol found in hashtable */
721 LOCK_MONITOR_EXIT(hashtable_utf->header);
727 u = u->hashlink; /* next element in external chain */
730 /* location in hashtable found, create new utf element */
734 u->blength = length; /* length in bytes of utfstring */
735 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
736 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
738 memcpy(u->text, text, length); /* copy utf-text */
739 u->text[length] = '\0';
741 #if defined(ENABLE_STATISTICS)
743 count_utf_len += sizeof(utf) + length + 1;
746 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
747 hashtable_utf->entries++; /* update number of entries */
749 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
751 /* reorganization of hashtable, average length of the external
752 chains is approx. 2 */
754 hashtable *newhash; /* the new hashtable */
760 /* create new hashtable, double the size */
762 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
764 #if defined(ENABLE_STATISTICS)
766 count_utf_len += sizeof(utf*) * hashtable_utf->size;
769 /* transfer elements to new hashtable */
771 for (i = 0; i < hashtable_utf->size; i++) {
772 u = hashtable_utf->ptr[i];
776 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
778 u->hashlink = (utf *) newhash->ptr[slot];
779 newhash->ptr[slot] = u;
781 /* follow link in external hash chain */
787 /* dispose old table */
789 hashtable_free(hashtable_utf);
791 hashtable_utf = newhash;
794 LOCK_MONITOR_EXIT(hashtable_utf->header);
800 /* utf_new_u2 ******************************************************************
802 Make utf symbol from u2 array, if isclassname is true '.' is
805 *******************************************************************************/
807 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
809 char *buffer; /* memory buffer for unicode characters */
810 char *pos; /* pointer to current position in buffer */
811 u4 left; /* unicode characters left */
812 u4 buflength; /* utf length in bytes of the u2 array */
813 utf *result; /* resulting utf-string */
816 /* determine utf length in bytes and allocate memory */
818 buflength = u2_utflength(unicode_pos, unicode_length);
819 buffer = MNEW(char, buflength);
824 for (i = 0; i++ < unicode_length; unicode_pos++) {
825 /* next unicode character */
828 if ((c != 0) && (c < 0x80)) {
831 if ((int) left < 0) break;
832 /* convert classname */
833 if (isclassname && c == '.')
838 } else if (c < 0x800) {
840 unsigned char high = c >> 6;
841 unsigned char low = c & 0x3F;
843 if ((int) left < 0) break;
844 *pos++ = high | 0xC0;
850 char mid = (c >> 6) & 0x3F;
853 if ((int) left < 0) break;
854 *pos++ = high | 0xE0;
860 /* insert utf-string into symbol-table */
861 result = utf_new(buffer,buflength);
863 MFREE(buffer, char, buflength);
869 /* utf_new_char ****************************************************************
871 Creates a new utf symbol, the text for this symbol is passed as a
872 c-string ( = char* ).
874 *******************************************************************************/
876 utf *utf_new_char(const char *text)
878 return utf_new(text, strlen(text));
882 /* utf_new_char_classname ******************************************************
884 Creates a new utf symbol, the text for this symbol is passed as a
885 c-string ( = char* ) "." characters are going to be replaced by
886 "/". Since the above function is used often, this is a separate
887 function, instead of an if.
889 *******************************************************************************/
891 utf *utf_new_char_classname(const char *text)
893 if (strchr(text, '.')) {
894 char *txt = strdup(text);
895 char *end = txt + strlen(txt);
899 for (c = txt; c < end; c++)
900 if (*c == '.') *c = '/';
902 tmpRes = utf_new(txt, strlen(txt));
908 return utf_new(text, strlen(text));
912 /* utf_nextu2 ******************************************************************
914 Read the next unicode character from the utf string and increment
915 the utf-string pointer accordingly.
917 CAUTION: This function is unsafe for input that was not checked
920 *******************************************************************************/
922 u2 utf_nextu2(char **utf_ptr)
924 /* uncompressed unicode character */
926 /* current position in utf text */
927 unsigned char *utf = (unsigned char *) (*utf_ptr);
928 /* bytes representing the unicode character */
929 unsigned char ch1, ch2, ch3;
930 /* number of bytes used to represent the unicode character */
933 switch ((ch1 = utf[0]) >> 4) {
934 default: /* 1 byte */
938 case 0xD: /* 2 bytes */
939 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
940 unsigned char high = ch1 & 0x1F;
941 unsigned char low = ch2 & 0x3F;
942 unicode_char = (high << 6) + low;
947 case 0xE: /* 2 or 3 bytes */
948 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
949 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
950 unsigned char low = ch3 & 0x3f;
951 unsigned char mid = ch2 & 0x3f;
952 unsigned char high = ch1 & 0x0f;
953 unicode_char = (((high << 6) + mid) << 6) + low;
961 /* update position in utf-text */
962 *utf_ptr = (char *) (utf + len);
968 /* utf_bytes *******************************************************************
970 Determine number of bytes (aka. octets) in the utf string.
973 u............utf string
976 The number of octets of this utf string.
977 There is _no_ terminating zero included in this count.
979 *******************************************************************************/
987 /* utf_get_number_of_u2s_for_buffer ********************************************
989 Determine number of UTF-16 u2s in the given UTF-8 buffer
991 CAUTION: This function is unsafe for input that was not checked
994 CAUTION: Use this function *only* when you want to convert a UTF-8 buffer
995 to an array of u2s (UTF-16) and want to know how many of them you will get.
996 All other uses of this function are probably wrong.
999 buffer........points to first char in buffer
1000 blength.......number of _bytes_ in the buffer
1003 the number of u2s needed to hold this string in UTF-16 encoding.
1004 There is _no_ terminating zero included in this count.
1006 NOTE: Unlike utf_get_number_of_u2s, this function never throws an
1009 *******************************************************************************/
1011 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
1013 const char *endpos; /* points behind utf string */
1014 const char *utf_ptr; /* current position in utf text */
1015 u4 len = 0; /* number of unicode characters */
1018 endpos = utf_ptr + blength;
1020 while (utf_ptr < endpos) {
1022 /* next unicode character */
1023 utf_nextu2((char **)&utf_ptr);
1026 assert(utf_ptr == endpos);
1032 /* utf_get_number_of_u2s *******************************************************
1034 Determine number of UTF-16 u2s in the utf string.
1036 CAUTION: This function is unsafe for input that was not checked
1039 CAUTION: Use this function *only* when you want to convert a utf string
1040 to an array of u2s and want to know how many of them you will get.
1041 All other uses of this function are probably wrong.
1044 u............utf string
1047 the number of u2s needed to hold this string in UTF-16 encoding.
1048 There is _no_ terminating zero included in this count.
1049 XXX 0 if a NullPointerException has been thrown (see below)
1051 *******************************************************************************/
1053 u4 utf_get_number_of_u2s(utf *u)
1055 char *endpos; /* points behind utf string */
1056 char *utf_ptr; /* current position in utf text */
1057 u4 len = 0; /* number of unicode characters */
1059 /* XXX this is probably not checked by most callers! Review this after */
1060 /* the invalid uses of this function have been eliminated */
1062 exceptions_throw_nullpointerexception();
1066 endpos = UTF_END(u);
1069 while (utf_ptr < endpos) {
1071 /* next unicode character */
1072 utf_nextu2(&utf_ptr);
1075 if (utf_ptr != endpos) {
1076 /* string ended abruptly */
1077 exceptions_throw_internalerror("Illegal utf8 string");
1085 /* utf8_safe_number_of_u2s *****************************************************
1087 Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1088 (For invalid UTF-8 the U+fffd replacement character will be counted.)
1090 This function is safe even for invalid UTF-8 strings.
1093 text..........zero-terminated(!) UTF-8 string (may be invalid)
1095 nbytes........strlen(text). (This is needed to completely emulate
1099 the number of u2s needed to hold this string in UTF-16 encoding.
1100 There is _no_ terminating zero included in this count.
1102 *******************************************************************************/
1104 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1105 register const unsigned char *t;
1108 register const unsigned char *tlimit;
1116 assert(nbytes >= 0);
1119 t = (const unsigned char *) text;
1120 tlimit = t + nbytes;
1122 /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1128 /* highest bit set, non-ASCII character */
1130 if ((byte & 0xe0) == 0xc0) {
1131 /* 2-byte: should be 110..... 10...... ? */
1133 if ((*t++ & 0xc0) == 0x80)
1134 ; /* valid 2-byte */
1138 else if ((byte & 0xf0) == 0xe0) {
1139 /* 3-byte: should be 1110.... 10...... 10...... */
1143 return len + 1; /* invalid, stop here */
1145 if ((*t++ & 0xc0) == 0x80) {
1146 if ((*t++ & 0xc0) == 0x80)
1147 ; /* valid 3-byte */
1154 else if ((byte & 0xf8) == 0xf0) {
1155 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1159 return len + 1; /* invalid, stop here */
1161 if (((byte1 = *t++) & 0xc0) == 0x80) {
1162 if (((byte2 = *t++) & 0xc0) == 0x80) {
1163 if (((byte3 = *t++) & 0xc0) == 0x80) {
1164 /* valid 4-byte UTF-8? */
1165 value = ((byte & 0x07) << 18)
1166 | ((byte1 & 0x3f) << 12)
1167 | ((byte2 & 0x3f) << 6)
1168 | ((byte3 & 0x3f) );
1170 if (value > 0x10FFFF)
1172 else if (value > 0xFFFF)
1173 len += 1; /* we need surrogates */
1175 ; /* 16bit suffice */
1186 else if ((byte & 0xfc) == 0xf8) {
1187 /* invalid 5-byte */
1189 return len + 1; /* invalid, stop here */
1192 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1195 else if ((byte & 0xfe) == 0xfc) {
1196 /* invalid 6-byte */
1198 return len + 1; /* invalid, stop here */
1201 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1213 /* ASCII character, common case */
1223 /* utf8_safe_convert_to_u2s ****************************************************
1225 Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1226 (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1227 Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1229 This function is safe even for invalid UTF-8 strings.
     Decoded 4-byte values above 0xFFFF are written as a surrogate pair
     (two u2s); 4-byte values above 0x10FFFF and the 5- and 6-byte forms
     are replaced by U+fffd.
1232 text..........zero-terminated(!) UTF-8 string (may be invalid)
1234 nbytes........strlen(text). (This is needed to completely emulate
     [the rest of this sentence is missing from this listing].) The code
     asserts nbytes >= 0 and uses text + nbytes as the end of the input.
1236 buffer........a preallocated array of u2s to receive the decoded
1237 string. Use utf8_safe_number_of_u2s to get the
1238 required number of u2s for allocating this.
1240 *******************************************************************************/
/* UTF-16 code unit stored in place of every sequence the decoder rejects */
1242 #define UNICODE_REPLACEMENT 0xfffd
1244 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1245 register const unsigned char *t;
1247 register const unsigned char *tlimit;
1255 assert(nbytes >= 0);
/* bytes are read through an unsigned pointer so the mask tests below see 0..255 */
1257 t = (const unsigned char *) text;
/* one past the last input byte */
1258 tlimit = t + nbytes;
1260 /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
/* NOTE(review): the loop header and the read of the lead byte into `byte` are not visible in this listing */
1266 /* highest bit set, non-ASCII character */
1268 if ((byte & 0xe0) == 0xc0) {
1269 /* 2-byte: should be 110..... 10...... */
1271 if (((byte1 = *t++) & 0xc0) == 0x80) {
1272 /* valid 2-byte UTF-8 */
/* 5 payload bits from the lead byte, 6 from the continuation byte */
1273 *buffer++ = ((byte & 0x1f) << 6)
1274 | ((byte1 & 0x3f) );
1277 *buffer++ = UNICODE_REPLACEMENT;
1281 else if ((byte & 0xf0) == 0xe0) {
1282 /* 3-byte: should be 1110.... 10...... 10...... */
/* fewer than two bytes remain before tlimit: the sequence is truncated */
1284 if (t + 2 > tlimit) {
1285 *buffer++ = UNICODE_REPLACEMENT;
1289 if (((byte1 = *t++) & 0xc0) == 0x80) {
1290 if (((byte2 = *t++) & 0xc0) == 0x80) {
1291 /* valid 3-byte UTF-8 */
/* 4 + 6 + 6 payload bits give one 16-bit value */
1292 *buffer++ = ((byte & 0x0f) << 12)
1293 | ((byte1 & 0x3f) << 6)
1294 | ((byte2 & 0x3f) );
1297 *buffer++ = UNICODE_REPLACEMENT;
1302 *buffer++ = UNICODE_REPLACEMENT;
1306 else if ((byte & 0xf8) == 0xf0) {
1307 /* 4-byte: should be 11110... 10...... 10...... 10...... */
/* fewer than three bytes remain before tlimit: the sequence is truncated */
1309 if (t + 3 > tlimit) {
1310 *buffer++ = UNICODE_REPLACEMENT;
1314 if (((byte1 = *t++) & 0xc0) == 0x80) {
1315 if (((byte2 = *t++) & 0xc0) == 0x80) {
1316 if (((byte3 = *t++) & 0xc0) == 0x80) {
1317 /* valid 4-byte UTF-8? */
/* 3 + 6 + 6 + 6 payload bits; the result can exceed the Unicode range, hence the check below */
1318 value = ((byte & 0x07) << 18)
1319 | ((byte1 & 0x3f) << 12)
1320 | ((byte2 & 0x3f) << 6)
1321 | ((byte3 & 0x3f) );
1323 if (value > 0x10FFFF) {
1324 *buffer++ = UNICODE_REPLACEMENT;
1326 else if (value > 0xFFFF) {
1327 /* we need surrogates */
/* high surrogate: (value >> 10) - 0x40 equals (value - 0x10000) >> 10, the upper ten offset bits */
1328 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
/* low surrogate: the lower ten bits, unchanged by subtracting 0x10000 */
1329 *buffer++ = 0xdc00 | (value & 0x03ff);
1332 *buffer++ = value; /* 16bit suffice */
1335 *buffer++ = UNICODE_REPLACEMENT;
1340 *buffer++ = UNICODE_REPLACEMENT;
1345 *buffer++ = UNICODE_REPLACEMENT;
/* 5-byte form (111110xx): never decoded, only replaced */
1349 else if ((byte & 0xfc) == 0xf8) {
1350 if (t + 4 > tlimit) {
1351 *buffer++ = UNICODE_REPLACEMENT;
/* walks over at most `skip` following bytes of the form 10xxxxxx; the initial value of skip and the loop body are not visible in this listing */
1356 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1358 *buffer++ = UNICODE_REPLACEMENT;
/* 6-byte form (1111110x): handled like the 5-byte form, with one more byte required */
1360 else if ((byte & 0xfe) == 0xfc) {
1361 if (t + 5 > tlimit) {
1362 *buffer++ = UNICODE_REPLACEMENT;
1367 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1369 *buffer++ = UNICODE_REPLACEMENT;
/* NOTE(review): presumably the final else for lead bytes matching none of the patterns above - confirm against the full source */
1372 *buffer++ = UNICODE_REPLACEMENT;
1380 /* ASCII character, common case */
1388 /* u2_utflength ****************************************************************
1390 Returns the utf length in bytes of a u2 array.
     text..........the u2 array to measure
     u2_length.....number of u2 elements the loop iterates over
     NOTE(review): the statements that read the character, add to
     result_len and return it are not visible in this listing; confirm the
     per-character byte counts against the full source.
1392 *******************************************************************************/
1394 u4 u2_utflength(u2 *text, u4 u2_length)
1396 u4 result_len = 0; /* utf length in bytes */
1397 u2 ch; /* current unicode character */
1400 for (len = 0; len < u2_length; len++) {
1401 /* next unicode character */
1404 /* determine bytes required to store unicode character as utf */
/* a zero character fails `ch &&`, so it is classified by the `< 0x800` branch below rather than as a plain ASCII byte */
1405 if (ch && (ch < 0x80))
1407 else if (ch < 0x800)
1417 /* utf_copy ********************************************************************
1419 Copy the given utf string byte-for-byte to a buffer.
     The element count passed to MCOPY is u->blength + 1, i.e. the
     terminating zero is copied too, so the buffer needs room for at least
     that many chars.
1422 buffer.......the buffer
1423 u............the utf string
1425 *******************************************************************************/
1427 void utf_copy(char *buffer, utf *u)
1429 /* our utf strings are zero-terminated (done by utf_new) */
/* MCOPY is a project macro, presumably a typed memory copy (dest, src, type, count) - confirm */
1430 MCOPY(buffer, u->text, char, u->blength + 1);
1434 /* utf_cat *********************************************************************
1436 Append the given utf string byte-for-byte to a buffer.
     The write position is buffer + strlen(buffer), so the buffer must
     already hold a zero-terminated string. u->blength + 1 chars are
     copied there, which includes the new terminating zero.
1439 buffer.......the buffer
1440 u............the utf string
1442 *******************************************************************************/
1444 void utf_cat(char *buffer, utf *u)
1446 /* our utf strings are zero-terminated (done by utf_new) */
/* the copy starts on the old terminator and overwrites it */
1447 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1451 /* utf_copy_classname **********************************************************
1453 Copy the given utf classname byte-for-byte to a buffer.
1454 '/' is replaced by '.'
1457 buffer.......the buffer
1458 u............the utf string
     NOTE(review): the declarations and the loop body are not visible in
     this listing; the replacement of '/' is stated here from the original
     description only.
1460 *******************************************************************************/
1462 void utf_copy_classname(char *buffer, utf *u)
/* endptr is one past UTF_END(u), so the loop below also visits the byte at UTF_END(u), i.e. the terminator */
1471 endptr = UTF_END(u) + 1; /* utfs are zero-terminated by utf_new */
1473 while (srcptr != endptr) {
1482 /* utf_cat_classname ***********************************************************
1484 Append the given utf classname byte-for-byte to a buffer.
1485 '/' is replaced by '.'
     Implemented by calling utf_copy_classname on the position of the
     buffer's current terminating zero, so the buffer must already hold a
     zero-terminated string.
1488 buffer.......the buffer
1489 u............the utf string
1491 *******************************************************************************/
1493 void utf_cat_classname(char *buffer, utf *u)
/* strlen locates the existing terminator; the copy starts there */
1495 utf_copy_classname(buffer + strlen(buffer), u);
1498 /* utf_display_printable_ascii *************************************************
1500 Write utf symbol to stdout (for debugging purposes).
1501 Non-printable and non-ASCII characters are printed as '?'.
     u.............the utf string to display
     NOTE(review): the statements guarded by the range test, and any
     handling of a NULL u, are not visible in this listing.
1503 *******************************************************************************/
1505 void utf_display_printable_ascii(utf *u)
1507 char *endpos; /* points behind utf string */
1508 char *utf_ptr; /* current position in utf text */
1516 endpos = UTF_END(u);
1519 while (utf_ptr < endpos) {
1520 /* read next unicode character */
/* utf_nextu2 receives the address of utf_ptr, presumably to advance it past the decoded character - confirm */
1522 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the range includes 127 (DEL), which is not a printable character - confirm whether `< 127` was intended */
1524 if ((c >= 32) && (c <= 127))
1534 /* utf_display_printable_ascii_classname ***************************************
1536 Write utf symbol to stdout with `/' converted to `.' (for debugging
     purposes).
1538 Non-printable and non-ASCII characters are printed as '?'.
     u.............the utf string to display
     NOTE(review): the conversion of `/' and the statements guarded by the
     range test are not visible in this listing.
1540 *******************************************************************************/
1542 void utf_display_printable_ascii_classname(utf *u)
1544 char *endpos; /* points behind utf string */
1545 char *utf_ptr; /* current position in utf text */
1553 endpos = UTF_END(u);
1556 while (utf_ptr < endpos) {
1557 /* read next unicode character */
1559 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the range includes 127 (DEL), which is not a printable character - confirm whether `< 127` was intended */
1564 if ((c >= 32) && (c <= 127))
1574 /* utf_sprint_convert_to_latin1 ************************************************
1576 Write utf symbol into c-string (for debugging purposes).
1577 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
     whatever the u2-to-char assignment in the copy loop produces; the
     value is neither checked nor replaced.
     buffer........destination c-string; no size is passed, so the caller
                   must provide enough room
     u.............the utf string to convert
1580 *******************************************************************************/
1582 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1584 char *endpos; /* points behind utf string */
1585 char *utf_ptr; /* current position in utf text */
/* NOTE(review): pos is a u2 (presumably 16 bits), so the index would wrap for very long strings - confirm */
1586 u2 pos = 0; /* position in c-string */
/* fallback text; the guarding condition is not visible in this listing (presumably u == NULL) */
1589 strcpy(buffer, "NULL");
1593 endpos = UTF_END(u);
1596 while (utf_ptr < endpos)
1597 /* copy next unicode character */
/* the decoded u2 is stored directly into one char element */
1598 buffer[pos++] = utf_nextu2(&utf_ptr);
1600 /* terminate string */
1605 /* utf_sprint_convert_to_latin1_classname **************************************
1607 Write utf symbol into c-string with `/' converted to `.' (for debugging
     purposes).
1609 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
     unchecked results (NOTE(review): the store into buffer is not visible
     in this listing; presumably the same u2-to-char narrowing as in
     utf_sprint_convert_to_latin1 - confirm).
     buffer........destination c-string; no size is passed, so the caller
                   must provide enough room
     u.............the utf string to convert
1612 *******************************************************************************/
1614 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1616 char *endpos; /* points behind utf string */
1617 char *utf_ptr; /* current position in utf text */
/* NOTE(review): pos is a u2 (presumably 16 bits), so the index would wrap for very long strings - confirm */
1618 u2 pos = 0; /* position in c-string */
/* fallback text; the guarding condition is not visible in this listing (presumably u == NULL) */
1621 strcpy(buffer, "NULL");
1625 endpos = UTF_END(u);
1628 while (utf_ptr < endpos) {
1629 /* copy next unicode character */
1630 u2 c = utf_nextu2(&utf_ptr);
/* the package separator of the internal class name becomes a dot */
1631 if (c == '/') c = '.';
1635 /* terminate string */
1640 /* utf_strcat_convert_to_latin1 ************************************************
1642 Like libc strcat, but uses an utf8 string.
1643 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
     whatever utf_sprint_convert_to_latin1 produces for them, since this
     function only delegates to it.
     buffer........zero-terminated c-string to append to
     u.............the utf string to append
1646 *******************************************************************************/
1648 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
/* output starts on the current terminator of buffer, located with strlen */
1650 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1654 /* utf_strcat_convert_to_latin1_classname **************************************
1656 Like libc strcat, but uses an utf8 string.
1657 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
     whatever utf_sprint_convert_to_latin1_classname produces for them,
     since this function only delegates to it.
     buffer........zero-terminated c-string to append to
     u.............the utf string to append
1660 *******************************************************************************/
1662 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
/* output starts on the current terminator of buffer, located with strlen */
1664 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1668 /* utf_fprint_printable_ascii **************************************************
1670 Write utf symbol into file.
1671 Non-printable and non-ASCII characters are printed as '?'.
     file..........stream the characters are written to with fprintf
     u.............the utf string to print
     NOTE(review): any handling of a NULL u is not visible in this listing.
1673 *******************************************************************************/
1675 void utf_fprint_printable_ascii(FILE *file, utf *u)
1677 char *endpos; /* points behind utf string */
1678 char *utf_ptr; /* current position in utf text */
1683 endpos = UTF_END(u);
1686 while (utf_ptr < endpos) {
1687 /* read next unicode character */
1688 u2 c = utf_nextu2(&utf_ptr);
/* values 32..127 are written unchanged, everything else as '?' */
/* NOTE(review): 127 (DEL) is inside the accepted range although it is not printable - confirm whether `< 127` was intended */
1690 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1691 else fprintf(file, "?");
1696 /* utf_fprint_printable_ascii_classname ****************************************
1698 Write utf symbol into file with `/' converted to `.'.
1699 Non-printable and non-ASCII characters are printed as '?'.
     file..........stream the characters are written to with fprintf
     u.............the utf string to print
     NOTE(review): any handling of a NULL u is not visible in this listing.
1701 *******************************************************************************/
1703 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1705 char *endpos; /* points behind utf string */
1706 char *utf_ptr; /* current position in utf text */
1711 endpos = UTF_END(u);
1714 while (utf_ptr < endpos) {
1715 /* read next unicode character */
1716 u2 c = utf_nextu2(&utf_ptr);
/* the conversion happens before the range test, so the dot is what gets printed */
1717 if (c == '/') c = '.';
/* values 32..127 are written unchanged, everything else as '?' */
/* NOTE(review): 127 (DEL) is inside the accepted range although it is not printable - confirm whether `< 127` was intended */
1719 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1720 else fprintf(file, "?");
1725 /* is_valid_utf ****************************************************************
1727 Return true if the given string is a valid UTF-8 string.
     As far as this listing shows, the rules are: end_pos must not lie
     before utf_ptr; a 0x00 byte is rejected; a lead byte must match one
     of the multi-byte patterns; lead bytes announcing more than two
     continuation bytes are rejected ("Java limitation"); the announced
     continuation bytes must fit before end_pos; the continuation bytes
     are tested against the pattern 10xx xxxx. Overlong encodings,
     surrogates and 0xfffe/0xffff are deliberately not rejected (those
     checks are commented out).
1729 utf_ptr...points to first character
1730 end_pos...points after last character
1732 *******************************************************************************/
/* minimum code point per sequence length; only referenced by the disabled overlong check below */
1734 /* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */
1736 bool is_valid_utf(char *utf_ptr, char *end_pos)
1743 if (end_pos < utf_ptr) return false;
/* number of input bytes still unaccounted for */
1744 bytes = end_pos - utf_ptr;
/* NOTE(review): the loop header and the read of the next byte into c are not visible in this listing */
1748 if (!c) return false; /* 0x00 is not allowed */
1749 if ((c & 0x80) == 0) continue; /* ASCII */
/* len is the number of continuation bytes announced by the lead byte */
1751 if ((c & 0xe0) == 0xc0) len = 1; /* 110x xxxx */
1752 else if ((c & 0xf0) == 0xe0) len = 2; /* 1110 xxxx */
1753 else if ((c & 0xf8) == 0xf0) len = 3; /* 1111 0xxx */
1754 else if ((c & 0xfc) == 0xf8) len = 4; /* 1111 10xx */
1755 else if ((c & 0xfe) == 0xfc) len = 5; /* 1111 110x */
1756 else return false; /* invalid leading byte */
1758 if (len > 2) return false; /* Java limitation */
/* keep only the payload bits of the lead byte: mask 0x1f for len 1, 0x0f for len 2 */
1760 v = (unsigned long)c & (0x3f >> len);
1762 if ((bytes -= len) < 0) return false; /* missing bytes */
/* runs len times, once per continuation byte */
1764 for (i = len; i--; ) {
1766 if ((c & 0xc0) != 0x80) /* 10xx xxxx */
/* append the six payload bits of the continuation byte */
1768 v = (v << 6) | (c & 0x3f);
/* NOTE(review): the condition guarding the next line is not visible in this listing */
1772 if (len != 1) return false; /* Java special */
1775 /* Sun Java seems to allow overlong UTF-8 encodings */
1777 /* if (v < min_codepoint[len]) */
1778 /* XXX throw exception? */
1781 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
1782 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
1784 /* even these seem to be allowed */
1785 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
1792 /* is_valid_name ***************************************************************
1794 Return true if the given string may be used as a class/field/method
1795 name. (Currently this only disallows empty strings and control
     characters, including a zero character encoded as the byte pair
     0xc0 0x80.)
1798 NOTE: The string is assumed to have passed is_valid_utf!
1800 utf_ptr...points to first character
1801 end_pos...points after last character
1803 *******************************************************************************/
1805 bool is_valid_name(char *utf_ptr, char *end_pos)
1807 if (end_pos <= utf_ptr) return false; /* disallow empty names */
1809 while (utf_ptr < end_pos) {
/* read as unsigned so bytes >= 0x80 do not compare as negative in the tests below */
1810 unsigned char c = *utf_ptr++;
1812 if (c < 0x20) return false; /* disallow control characters */
/* 0xc0 0x80 is the two-byte form whose payload bits are all zero */
/* NOTE(review): *utf_ptr is read without comparing against end_pos; when c is the last byte this reads the byte at end_pos (presumably the terminator) - confirm */
1813 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80) /* disallow zero */
/* is_valid_name_utf: convenience wrapper that applies is_valid_name to the
   bytes of the utf string u, from u->text up to UTF_END(u), and returns its
   result. */
1820 bool is_valid_name_utf(utf *u)
1822 return is_valid_name(u->text, UTF_END(u));
1826 /* utf_show ********************************************************************
1828 Writes the utf symbols in the utfhash to stdout and displays the
1829 number of external hash chains grouped according to the chainlength
1830 (for debugging purposes).
     Only compiled when NDEBUG is not defined. The statistics part prints
     one row per chain length below CHAIN_LIMIT-1, one collective row for
     the longer chains, then the maximum and the average chain length.
     NOTE(review): the function header, the chain-walking statements and
     several braces are not visible in this listing.
1832 *******************************************************************************/
1834 #if !defined(NDEBUG)
1838 #define CHAIN_LIMIT 20 /* limit for separated enumeration */
1840 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1841 u4 max_chainlength = 0; /* maximum length of the chains */
1842 u4 sum_chainlength = 0; /* sum of the chainlengths */
1843 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT */
1846 printf("UTF-HASH:\n");
1848 /* show element of utf-hashtable */
1850 for (i = 0; i < hashtable_utf->size; i++) {
1851 utf *u = hashtable_utf->ptr[i];
1854 printf("SLOT %d: ", (int) i);
1858 utf_display_printable_ascii(u);
1866 printf("UTF-HASH: %d slots for %d entries\n",
1867 (int) hashtable_utf->size, (int) hashtable_utf->entries );
/* the percentages below divide by entries; the statement guarded by this test is not visible (presumably an early return) */
1869 if (hashtable_utf->entries == 0)
1872 printf("chains:\n chainlength number of chains %% of utfstrings\n");
/* the loop body is not visible in this listing; presumably it clears chain_count */
1874 for (i=0;i<CHAIN_LIMIT;i++)
1877 /* count numbers of hashchains according to their length */
1878 for (i=0; i<hashtable_utf->size; i++) {
1880 utf *u = (utf*) hashtable_utf->ptr[i];
1881 u4 chain_length = 0;
1883 /* determine chainlength */
1889 /* update sum of all chainlengths */
1890 sum_chainlength+=chain_length;
1892 /* determine the maximum length of the chains */
1893 if (chain_length>max_chainlength)
1894 max_chainlength = chain_length;
1896 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT */
/* NOTE(review): chains of exactly CHAIN_LIMIT-1 elements are counted in the last bucket below but never added to beyond_limit, so the percentage of the ">=" row omits them - confirm intent */
1897 if (chain_length>=CHAIN_LIMIT) {
1898 beyond_limit+=chain_length;
/* clamp so that all long chains share the last bucket */
1899 chain_length=CHAIN_LIMIT-1;
1902 /* update number of hashchains of current length */
1903 chain_count[chain_length]++;
1906 /* display results */
/* one row per exact length; a chain of length i holds i symbols, hence the factor i in the percentage */
/* NOTE(review): chain_count values are u4 but printed with %d - confirm u4 is compatible with int for this format */
1907 for (i=1;i<CHAIN_LIMIT-1;i++)
1908 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
/* collective row for the last bucket */
1910 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1913 printf("max. chainlength:%5d\n",max_chainlength);
1915 /* avg. chainlength = sum of chainlengths / number of chains */
/* the divisor is the slot count minus the slots counted with chain length 0 */
1916 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1918 #endif /* !defined(NDEBUG) */
1922 * These are local overrides for various environment variables in Emacs.
1923 * Please do not remove this and leave it at the end of the file, where
1924 * Emacs will automagically detect them.
1925 * ---------------------------------------------------------------------
1928 * indent-tabs-mode: t
1932 * vim:noexpandtab:sw=4:ts=4: