1 /* src/vmcore/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
4 C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5 E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6 J. Wenninger, Institut f. Computersprachen - TU Wien
8 This file is part of CACAO.
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License as
12 published by the Free Software Foundation; either version 2, or (at
13 your option) any later version.
15 This program is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
35 #include "mm/memory.h"
37 #include "threads/lock-common.h"
39 #include "toolbox/hashtable.h"
41 #include "vm/exceptions.h"
43 #include "vmcore/options.h"
45 #if defined(ENABLE_STATISTICS)
46 # include "vmcore/statistics.h"
49 #include "vmcore/utf8.h"
52 /* global variables ***********************************************************/
54 /* hashsize must be a power of 2: utf_new picks a slot with key & (size - 1) */
56 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
58 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
61 /* utf-symbols for pointer comparison of frequently used strings **************/
/* Every pointer defined below receives the result of a utf_new_char() call
   in the utf8 initialization code of this file. */
/* class and interface names */
63 utf *utf_java_lang_Object;
65 utf *utf_java_lang_Class;
66 utf *utf_java_lang_ClassLoader;
67 utf *utf_java_lang_Cloneable;
68 utf *utf_java_lang_SecurityManager;
69 utf *utf_java_lang_String;
70 utf *utf_java_lang_System;
71 utf *utf_java_lang_ThreadGroup;
72 utf *utf_java_lang_ref_SoftReference;
73 utf *utf_java_lang_ref_WeakReference;
74 utf *utf_java_lang_ref_PhantomReference;
75 utf *utf_java_io_Serializable;
/* java/lang/Throwable, java/lang/Error and the java/lang/...Error names */
77 utf *utf_java_lang_Throwable;
78 utf *utf_java_lang_Error;
80 utf *utf_java_lang_AbstractMethodError;
81 utf *utf_java_lang_ClassCircularityError;
82 utf *utf_java_lang_ClassFormatError;
83 utf *utf_java_lang_ExceptionInInitializerError;
84 utf *utf_java_lang_IncompatibleClassChangeError;
85 utf *utf_java_lang_InstantiationError;
86 utf *utf_java_lang_InternalError;
87 utf *utf_java_lang_LinkageError;
88 utf *utf_java_lang_NoClassDefFoundError;
89 utf *utf_java_lang_NoSuchFieldError;
90 utf *utf_java_lang_NoSuchMethodError;
91 utf *utf_java_lang_OutOfMemoryError;
92 utf *utf_java_lang_UnsatisfiedLinkError;
93 utf *utf_java_lang_UnsupportedClassVersionError;
94 utf *utf_java_lang_VerifyError;
95 utf *utf_java_lang_VirtualMachineError;
97 #if defined(WITH_CLASSPATH_GNU)
98 utf *utf_java_lang_VMThrowable;
/* java/lang/Exception and the ...Exception names */
101 utf *utf_java_lang_Exception;
103 utf *utf_java_lang_ArithmeticException;
104 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
105 utf *utf_java_lang_ArrayStoreException;
106 utf *utf_java_lang_ClassCastException;
107 utf *utf_java_lang_ClassNotFoundException;
108 utf *utf_java_lang_CloneNotSupportedException;
109 utf *utf_java_lang_IllegalAccessException;
110 utf *utf_java_lang_IllegalArgumentException;
111 utf *utf_java_lang_IllegalMonitorStateException;
112 utf *utf_java_lang_InstantiationException;
113 utf *utf_java_lang_InterruptedException;
114 utf *utf_java_lang_NegativeArraySizeException;
115 utf *utf_java_lang_NullPointerException;
116 utf *utf_java_lang_StringIndexOutOfBoundsException;
118 utf *utf_java_lang_reflect_InvocationTargetException;
120 utf *utf_java_security_PrivilegedActionException;
122 #if defined(ENABLE_JAVASE)
123 utf* utf_java_lang_Void;
/* java/lang class names of the primitive wrapper types */
126 utf* utf_java_lang_Boolean;
127 utf* utf_java_lang_Byte;
128 utf* utf_java_lang_Character;
129 utf* utf_java_lang_Short;
130 utf* utf_java_lang_Integer;
131 utf* utf_java_lang_Long;
132 utf* utf_java_lang_Float;
133 utf* utf_java_lang_Double;
135 #if defined(ENABLE_JAVASE)
136 utf *utf_java_lang_StackTraceElement;
137 utf *utf_java_lang_reflect_Constructor;
138 utf *utf_java_lang_reflect_Field;
139 utf *utf_java_lang_reflect_Method;
140 utf *utf_java_util_Vector;
/* class file attribute names */
143 utf *utf_InnerClasses; /* InnerClasses */
144 utf *utf_ConstantValue; /* ConstantValue */
145 utf *utf_Code; /* Code */
146 utf *utf_Exceptions; /* Exceptions */
147 utf *utf_LineNumberTable; /* LineNumberTable */
148 utf *utf_SourceFile; /* SourceFile */
150 #if defined(ENABLE_JAVASE)
151 utf *utf_EnclosingMethod; /* EnclosingMethod */
153 utf *utf_StackMapTable; /* StackMapTable */
155 #if defined(ENABLE_ANNOTATIONS)
156 utf *utf_RuntimeVisibleAnnotations; /* RuntimeVisibleAnnotations */
157 utf *utf_RuntimeInvisibleAnnotations; /* RuntimeInvisibleAnnotations */
158 utf *utf_RuntimeVisibleParameterAnnotations; /* RuntimeVisibleParameterAnnotations */
159 utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
160 utf *utf_AnnotationDefault; /* AnnotationDefault */
/* method names */
164 utf *utf_init; /* <init> */
165 utf *utf_clinit; /* <clinit> */
166 utf *utf_clone; /* clone */
167 utf *utf_finalize; /* finalize */
168 utf *utf_run; /* run */
173 utf *utf_removeThread; /* removeThread */
176 utf *utf_uncaughtException; /* uncaughtException */
179 utf *utf_fillInStackTrace; /* fillInStackTrace */
181 utf *utf_getSystemClassLoader; /* getSystemClassLoader */
184 utf *utf_loadClassInternal; /* loadClassInternal */
185 utf *utf_printStackTrace; /* printStackTrace */
187 utf *utf_division_by_zero; /* "/ by zero" */
/* method descriptors */
198 utf *utf_void__void; /* ()V */
199 utf *utf_boolean__void; /* (Z)V */
200 utf *utf_byte__void; /* (B)V */
201 utf *utf_char__void; /* (C)V */
202 utf *utf_short__void; /* (S)V */
203 utf *utf_int__void; /* (I)V */
204 utf *utf_long__void; /* (J)V */
205 utf *utf_float__void; /* (F)V */
206 utf *utf_double__void; /* (D)V */
208 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
209 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
210 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
211 utf *utf_java_lang_ClassLoader_java_lang_String__J; /* (Ljava/lang/ClassLoader;Ljava/lang/String;)J */
212 utf *utf_java_lang_Exception__V; /* (Ljava/lang/Exception;)V */
213 utf *utf_java_lang_Object__java_lang_Object; /* (Ljava/lang/Object;)Ljava/lang/Object; */
214 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
215 utf *utf_java_lang_String__java_lang_Class; /* (Ljava/lang/String;)Ljava/lang/Class; */
216 utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */
217 utf *utf_java_lang_Thread_java_lang_Throwable__V; /* (Ljava/lang/Thread;Ljava/lang/Throwable;)V */
218 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
219 utf *utf_java_lang_Throwable__java_lang_Throwable; /* (Ljava/lang/Throwable;)Ljava/lang/Throwable; */
221 utf *utf_not_named_yet; /* special name for unnamed classes */
223 utf *array_packagename; /* "\t<the array package>" */
226 /* utf_init ********************************************************************
228 Initializes the utf8 subsystem.

Creates the global utf hashtable with HASHTABLE_UTF_SIZE slots and then
interns every frequently used string through utf_new_char(), storing the
resulting symbol in the matching global pointer so that later code can
compare symbols by pointer.

NOTE(review): utf_init is also the name of the global that holds the
"<init>" symbol (assigned further down), so this banner presumably does
not carry the function's real name - confirm and correct it.
230 *******************************************************************************/
234 /* create utf8 hashtable */
236 hashtable_utf = NEW(hashtable);
238 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
240 #if defined(ENABLE_STATISTICS)
/* add the size of the slot array (one utf pointer per slot) to the counter */
242 count_utf_len += sizeof(utf*) * hashtable_utf->size;
245 /* create utf-symbols for pointer comparison of frequently used strings */
/* class and interface names */
247 utf_java_lang_Object = utf_new_char("java/lang/Object");
249 utf_java_lang_Class = utf_new_char("java/lang/Class");
250 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
251 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
252 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
253 utf_java_lang_String = utf_new_char("java/lang/String");
254 utf_java_lang_System = utf_new_char("java/lang/System");
255 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
257 utf_java_lang_ref_SoftReference =
258 utf_new_char("java/lang/ref/SoftReference");
260 utf_java_lang_ref_WeakReference =
261 utf_new_char("java/lang/ref/WeakReference");
263 utf_java_lang_ref_PhantomReference =
264 utf_new_char("java/lang/ref/PhantomReference");
266 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
/* java/lang/Throwable, java/lang/Error and the ...Error names */
268 utf_java_lang_Throwable = utf_new_char("java/lang/Throwable");
269 utf_java_lang_Error = utf_new_char("java/lang/Error");
271 utf_java_lang_ClassCircularityError =
272 utf_new_char("java/lang/ClassCircularityError");
274 utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
276 utf_java_lang_ExceptionInInitializerError =
277 utf_new_char("java/lang/ExceptionInInitializerError");
279 utf_java_lang_IncompatibleClassChangeError =
280 utf_new_char("java/lang/IncompatibleClassChangeError");
282 utf_java_lang_InstantiationError =
283 utf_new_char("java/lang/InstantiationError");
285 utf_java_lang_InternalError = utf_new_char("java/lang/InternalError");
286 utf_java_lang_LinkageError = utf_new_char("java/lang/LinkageError");
288 utf_java_lang_NoClassDefFoundError =
289 utf_new_char("java/lang/NoClassDefFoundError");
291 utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
293 utf_java_lang_UnsatisfiedLinkError =
294 utf_new_char("java/lang/UnsatisfiedLinkError");
296 utf_java_lang_UnsupportedClassVersionError =
297 utf_new_char("java/lang/UnsupportedClassVersionError");
299 utf_java_lang_VerifyError = utf_new_char("java/lang/VerifyError");
301 utf_java_lang_VirtualMachineError =
302 utf_new_char("java/lang/VirtualMachineError");
304 #if defined(ENABLE_JAVASE)
305 utf_java_lang_AbstractMethodError =
306 utf_new_char("java/lang/AbstractMethodError");
308 utf_java_lang_NoSuchFieldError =
309 utf_new_char("java/lang/NoSuchFieldError");
311 utf_java_lang_NoSuchMethodError =
312 utf_new_char("java/lang/NoSuchMethodError");
315 #if defined(WITH_CLASSPATH_GNU)
316 utf_java_lang_VMThrowable = utf_new_char("java/lang/VMThrowable");
/* java/lang/Exception and the ...Exception names */
319 utf_java_lang_Exception = utf_new_char("java/lang/Exception");
321 utf_java_lang_ArithmeticException =
322 utf_new_char("java/lang/ArithmeticException");
324 utf_java_lang_ArrayIndexOutOfBoundsException =
325 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
327 utf_java_lang_ArrayStoreException =
328 utf_new_char("java/lang/ArrayStoreException");
330 utf_java_lang_ClassCastException =
331 utf_new_char("java/lang/ClassCastException");
333 utf_java_lang_ClassNotFoundException =
334 utf_new_char("java/lang/ClassNotFoundException");
336 utf_java_lang_CloneNotSupportedException =
337 utf_new_char("java/lang/CloneNotSupportedException");
339 utf_java_lang_IllegalAccessException =
340 utf_new_char("java/lang/IllegalAccessException");
342 utf_java_lang_IllegalArgumentException =
343 utf_new_char("java/lang/IllegalArgumentException");
345 utf_java_lang_IllegalMonitorStateException =
346 utf_new_char("java/lang/IllegalMonitorStateException");
348 utf_java_lang_InstantiationException =
349 utf_new_char("java/lang/InstantiationException");
351 utf_java_lang_InterruptedException =
352 utf_new_char("java/lang/InterruptedException");
354 utf_java_lang_NegativeArraySizeException =
355 utf_new_char("java/lang/NegativeArraySizeException");
357 utf_java_lang_NullPointerException =
358 utf_new_char("java/lang/NullPointerException");
360 utf_java_lang_StringIndexOutOfBoundsException =
361 utf_new_char("java/lang/StringIndexOutOfBoundsException");
363 utf_java_lang_reflect_InvocationTargetException =
364 utf_new_char("java/lang/reflect/InvocationTargetException");
366 utf_java_security_PrivilegedActionException =
367 utf_new_char("java/security/PrivilegedActionException");
369 #if defined(ENABLE_JAVASE)
370 utf_java_lang_Void = utf_new_char("java/lang/Void");
/* java/lang class names of the primitive wrapper types */
373 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
374 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
375 utf_java_lang_Character = utf_new_char("java/lang/Character");
376 utf_java_lang_Short = utf_new_char("java/lang/Short");
377 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
378 utf_java_lang_Long = utf_new_char("java/lang/Long");
379 utf_java_lang_Float = utf_new_char("java/lang/Float");
380 utf_java_lang_Double = utf_new_char("java/lang/Double");
382 #if defined(ENABLE_JAVASE)
383 utf_java_lang_StackTraceElement =
384 utf_new_char("java/lang/StackTraceElement");
386 utf_java_lang_reflect_Constructor =
387 utf_new_char("java/lang/reflect/Constructor");
389 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
390 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
391 utf_java_util_Vector = utf_new_char("java/util/Vector");
/* class file attribute names */
394 utf_InnerClasses = utf_new_char("InnerClasses");
395 utf_ConstantValue = utf_new_char("ConstantValue");
396 utf_Code = utf_new_char("Code");
397 utf_Exceptions = utf_new_char("Exceptions");
398 utf_LineNumberTable = utf_new_char("LineNumberTable");
399 utf_SourceFile = utf_new_char("SourceFile");
401 #if defined(ENABLE_JAVASE)
402 utf_EnclosingMethod = utf_new_char("EnclosingMethod");
403 utf_Signature = utf_new_char("Signature");
404 utf_StackMapTable = utf_new_char("StackMapTable");
406 #if defined(ENABLE_ANNOTATIONS)
407 utf_RuntimeVisibleAnnotations = utf_new_char("RuntimeVisibleAnnotations");
408 utf_RuntimeInvisibleAnnotations = utf_new_char("RuntimeInvisibleAnnotations");
409 utf_RuntimeVisibleParameterAnnotations = utf_new_char("RuntimeVisibleParameterAnnotations");
410 utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
411 utf_AnnotationDefault = utf_new_char("AnnotationDefault");
/* method and field names */
415 utf_init = utf_new_char("<init>");
416 utf_clinit = utf_new_char("<clinit>");
417 utf_clone = utf_new_char("clone");
418 utf_finalize = utf_new_char("finalize");
419 utf_run = utf_new_char("run");
421 utf_add = utf_new_char("add");
422 utf_remove = utf_new_char("remove");
423 utf_addThread = utf_new_char("addThread");
424 utf_removeThread = utf_new_char("removeThread");
425 utf_put = utf_new_char("put");
426 utf_get = utf_new_char("get");
427 utf_uncaughtException = utf_new_char("uncaughtException");
428 utf_value = utf_new_char("value");
430 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
431 utf_findNative = utf_new_char("findNative");
432 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
433 utf_initCause = utf_new_char("initCause");
434 utf_loadClass = utf_new_char("loadClass");
435 utf_loadClassInternal = utf_new_char("loadClassInternal");
436 utf_printStackTrace = utf_new_char("printStackTrace");
/* presumably the detail message of a division-by-zero exception - confirm
   against the callers */
438 utf_division_by_zero = utf_new_char("/ by zero");
/* one-letter type descriptors */
440 utf_Z = utf_new_char("Z");
441 utf_B = utf_new_char("B");
442 utf_C = utf_new_char("C");
443 utf_S = utf_new_char("S");
444 utf_I = utf_new_char("I");
445 utf_J = utf_new_char("J");
446 utf_F = utf_new_char("F");
447 utf_D = utf_new_char("D");
/* method descriptors */
449 utf_void__void = utf_new_char("()V");
450 utf_boolean__void = utf_new_char("(Z)V");
451 utf_byte__void = utf_new_char("(B)V");
452 utf_char__void = utf_new_char("(C)V");
453 utf_short__void = utf_new_char("(S)V");
454 utf_int__void = utf_new_char("(I)V");
455 utf_long__void = utf_new_char("(J)V");
456 utf_float__void = utf_new_char("(F)V");
457 utf_double__void = utf_new_char("(D)V");
458 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
459 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
461 utf_void__java_lang_ClassLoader =
462 utf_new_char("()Ljava/lang/ClassLoader;");
464 utf_java_lang_ClassLoader_java_lang_String__J =
465 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
467 utf_java_lang_Exception__V = utf_new_char("(Ljava/lang/Exception;)V");
469 utf_java_lang_Object__java_lang_Object =
470 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
472 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
474 utf_java_lang_String__java_lang_Class =
475 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
477 utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V");
479 utf_java_lang_Thread_java_lang_Throwable__V =
480 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
482 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
484 utf_java_lang_Throwable__java_lang_Throwable =
485 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
/* special-purpose names; the last two start with a tab character, presumably
   so that they cannot collide with a real class or package name - confirm */
487 utf_null = utf_new_char("null");
488 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
489 array_packagename = utf_new_char("\t<the array package>");
491 /* everything's ok */
497 /* utf_hashkey *****************************************************************
499 The hashkey is computed from the utf-text by using up to 8
500 characters. For utf-symbols longer than 15 characters 3 characters
501 are taken from the beginning and the end, 2 characters are taken
from the middle.

IN:
   text.........first byte of the utf text
   length.......number of bytes in the utf text

RETURN VALUE:
   the hashkey; utf_new reduces it to a slot with key & (size - 1)

NOTE(review): several return expressions below expand nbs() more than
once. Every expansion increments `text`, and C does not sequence those
increments within a single expression, so the result relies on the
compiler evaluating the operands left to right. Consider splitting the
expressions into separate statements.

NOTE(review): `text` points to plain char, so where char is signed a
byte >= 0x80 is sign-extended by the (u4) cast - confirm this is
acceptable.
504 *******************************************************************************/
/* Both macros read through the local variable `text`; nbs() also advances it
   as a side effect. `val` is not parenthesized, so only simple constants
   should be passed. */
506 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
507 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
509 u4 utf_hashkey(const char *text, u4 length)
/* remembered so that the default case can reposition `text` to the middle
   and to the end of the string */
511 const char *start_pos = text; /* pointer to utf text */
515 case 0: /* empty string */
/* short strings: every byte contributes, each with its own shift */
518 case 1: return fbs(0);
519 case 2: return fbs(0) ^ nbs(3);
520 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
521 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
522 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
523 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
524 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
525 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
/* medium lengths: a partial value `a` is combined with further bytes */
532 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
541 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
550 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
562 return a ^ nbs(9) ^ nbs(10);
574 return a ^ nbs(9) ^ nbs(10);
585 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
596 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
/* long strings: sample the beginning, the middle and the end */
598 default: /* 3 characters from beginning */
604 /* 2 characters from middle */
605 text = start_pos + (length / 2);
610 /* 3 characters from end */
/* four bytes before the end; nbs() pre-increments before it reads */
611 text = start_pos + length - 4;
616 return a ^ nbs(10) ^ nbs(11);
620 /* utf_full_hashkey ************************************************************
622 This function computes a hash value using all bytes in the string.
624 The algorithm is the "One-at-a-time" algorithm as published
625 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
627 *******************************************************************************/
629 u4 utf_full_hashkey(const char *text, u4 length)
631 register const unsigned char *p = (const unsigned char *) text;
639 hash += (hash << 10);
643 hash ^= (hash >> 11);
644 hash += (hash << 15);
649 /* unicode_hashkey *************************************************************
651 Compute the hashkey of a unicode string.
653 *******************************************************************************/
655 u4 unicode_hashkey(u2 *text, u2 len)
657 return utf_hashkey((char *) text, len);
661 /* utf_new *********************************************************************
663 Creates a new utf-symbol, the text of the symbol is passed as a
664 u1-array. The function searches the utf-hashtable for a utf-symbol
665 with this text. On success the element is returned, otherwise a new
666 hashtable element is created.
668 If the number of entries in the hashtable exceeds twice the size of
669 the hashtable slots a reorganization of the hashtable is done and
670 the utf symbols are copied to a new hashtable with doubled size.

IN:
   text.........bytes of the symbol; no terminating zero is required,
                exactly length bytes are compared and copied
   length.......number of bytes in text

The whole operation runs between LOCK_MONITOR_ENTER and LOCK_MONITOR_EXIT
on hashtable_utf->header.
672 *******************************************************************************/
674 utf *utf_new(const char *text, u2 length)
676 u4 key; /* hashkey computed from utf-text */
677 u4 slot; /* slot in hashtable */
678 utf *u; /* hashtable element */
681 LOCK_MONITOR_ENTER(hashtable_utf->header);
683 #if defined(ENABLE_STATISTICS)
688 key = utf_hashkey(text, length);
/* masking instead of a modulo: only correct while size is a power of 2 */
689 slot = key & (hashtable_utf->size - 1);
690 u = hashtable_utf->ptr[slot];
692 /* search external hash chain for utf-symbol */
/* cheap length test first, the byte-wise comparison only runs on a match */
695 if (u->blength == length) {
696 /* compare text of hashtable elements */
698 for (i = 0; i < length; i++)
699 if (text[i] != u->text[i])
702 #if defined(ENABLE_STATISTICS)
704 count_utf_new_found++;
707 /* symbol found in hashtable */
709 LOCK_MONITOR_EXIT(hashtable_utf->header);
715 u = u->hashlink; /* next element in external chain */
718 /* location in hashtable found, create new utf element */
722 u->blength = length; /* length in bytes of utfstring */
/* the new element becomes the head of the chain of its slot */
723 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
/* one extra byte for the terminating zero written below */
724 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
726 memcpy(u->text, text, length); /* copy utf-text */
727 u->text[length] = '\0';
729 #if defined(ENABLE_STATISTICS)
731 count_utf_len += sizeof(utf) + length + 1;
734 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
735 hashtable_utf->entries++; /* update number of entries */
737 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
739 /* reorganization of hashtable, average length of the external
740 chains is approx. 2 */
742 hashtable *newhash; /* the new hashtable */
748 /* create new hashtable, double the size */
750 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
752 #if defined(ENABLE_STATISTICS)
754 count_utf_len += sizeof(utf*) * hashtable_utf->size;
757 /* transfer elements to new hashtable */
759 for (i = 0; i < hashtable_utf->size; i++) {
760 u = hashtable_utf->ptr[i];
/* the slot depends on the table size, so it is recomputed for the new table */
764 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
766 u->hashlink = (utf *) newhash->ptr[slot];
767 newhash->ptr[slot] = u;
769 /* follow link in external hash chain */
775 /* dispose old table */
777 hashtable_free(hashtable_utf);
779 hashtable_utf = newhash;
/* NOTE(review): after a reorganization hashtable_utf points to the new table,
   so this exit only pairs with the enter at the top if hashtable_resize
   carries the lock header over to the new table - confirm. */
782 LOCK_MONITOR_EXIT(hashtable_utf->header);
788 /* utf_new_u2 ******************************************************************
790 Make utf symbol from u2 array, if isclassname is true '.' is
converted (presumably to '/', as utf_new_char_classname does - confirm).

IN:
   unicode_pos......first u2 of the array
   unicode_length...number of u2s in the array
   isclassname......enables the '.' conversion for 1-byte characters

The u2s are encoded into a temporary buffer of u2_utflength() bytes, the
buffer is interned with utf_new and released again.

NOTE(review): buflength is a u4, but the length parameter of utf_new is a
u2, so an encoded length above 65535 would be narrowed silently - confirm
that callers cannot reach this.
793 *******************************************************************************/
795 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
797 char *buffer; /* memory buffer for unicode characters */
798 char *pos; /* pointer to current position in buffer */
799 u4 left; /* unicode characters left */
800 u4 buflength; /* utf length in bytes of the u2 array */
801 utf *result; /* resulting utf-string */
804 /* determine utf length in bytes and allocate memory */
806 buflength = u2_utflength(unicode_pos, unicode_length);
807 buffer = MNEW(char, buflength);
/* i counts the u2s handled so far, unicode_pos walks the input */
812 for (i = 0; i++ < unicode_length; unicode_pos++) {
813 /* next unicode character */
/* 1-byte encoding; a zero character is excluded here and therefore takes
   the 2-byte branch below */
816 if ((c != 0) && (c < 0x80)) {
/* left is unsigned, the cast to int detects that it dropped below zero */
819 if ((int) left < 0) break;
820 /* convert classname */
821 if (isclassname && c == '.')
/* 2-byte encoding: lead byte 110xxxxx */
826 } else if (c < 0x800) {
828 unsigned char high = c >> 6;
829 unsigned char low = c & 0x3F;
831 if ((int) left < 0) break;
832 *pos++ = high | 0xC0;
/* 3-byte encoding: lead byte 1110xxxx */
838 char mid = (c >> 6) & 0x3F;
841 if ((int) left < 0) break;
842 *pos++ = high | 0xE0;
848 /* insert utf-string into symbol-table */
849 result = utf_new(buffer,buflength);
/* utf_new keeps its own copy of the text, the buffer can be released */
851 MFREE(buffer, char, buflength);
857 /* utf_new_char ****************************************************************
859 Creates a new utf symbol, the text for this symbol is passed as a
860 c-string ( = char* ).
862 *******************************************************************************/
864 utf *utf_new_char(const char *text)
866 return utf_new(text, strlen(text));
870 /* utf_new_char_classname ******************************************************
872 Creates a new utf symbol, the text for this symbol is passed as a
873 c-string ( = char* ) "." characters are going to be replaced by
874 "/". Since the above function is used often, this is a separate
875 function, instead of an if.

IN:
   text.........zero-terminated class name; it is not modified, the
                replacement is done on a private copy

NOTE(review): the strdup() result is used without a NULL check, and
strlen() is evaluated again for a string whose end is already known
(`end`) - consider checking the copy and reusing end - txt.
877 *******************************************************************************/
879 utf *utf_new_char_classname(const char *text)
/* only strings that really contain a '.' pay for the copy */
881 if (strchr(text, '.')) {
882 char *txt = strdup(text);
883 char *end = txt + strlen(txt);
/* rewrite every '.' of the copy in place */
887 for (c = txt; c < end; c++)
888 if (*c == '.') *c = '/';
890 tmpRes = utf_new(txt, strlen(txt));
/* no '.' present: intern the caller's string unchanged */
896 return utf_new(text, strlen(text));
900 /* utf_nextu2 ******************************************************************
902 Read the next unicode character from the utf string and increment
903 the utf-string pointer accordingly.
905 CAUTION: This function is unsafe for input that was not checked

The function receives no length information: depending on the first
byte it also reads utf[1] and utf[2], so truncated or malformed input can
make it read beyond the end of the caller's buffer.

IN/OUT:
   utf_ptr......address of the current read position; updated on return
908 *******************************************************************************/
910 u2 utf_nextu2(char **utf_ptr)
912 /* uncompressed unicode character */
914 /* current position in utf text */
/* NOTE(review): this local is named `utf` and hides the type name utf that
   the rest of the file uses, within this function */
915 unsigned char *utf = (unsigned char *) (*utf_ptr);
916 /* bytes representing the unicode character */
917 unsigned char ch1, ch2, ch3;
918 /* number of bytes used to represent the unicode character */
/* the upper four bits of the first byte select the encoding length */
921 switch ((ch1 = utf[0]) >> 4) {
922 default: /* 1 byte */
926 case 0xD: /* 2 bytes */
/* the second byte must be a continuation byte (10xxxxxx) */
927 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
928 unsigned char high = ch1 & 0x1F;
929 unsigned char low = ch2 & 0x3F;
930 unicode_char = (high << 6) + low;
935 case 0xE: /* 2 or 3 bytes */
936 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
937 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
938 unsigned char low = ch3 & 0x3f;
939 unsigned char mid = ch2 & 0x3f;
940 unsigned char high = ch1 & 0x0f;
/* 4 + 6 + 6 payload bits */
941 unicode_char = (((high << 6) + mid) << 6) + low;
949 /* update position in utf-text */
950 *utf_ptr = (char *) (utf + len);
956 /* utf_bytes *******************************************************************
958 Determine number of bytes (aka. octets) in the utf string.
961 u............utf string
964 The number of octets of this utf string.
965 There is _no_ terminating zero included in this count.
967 *******************************************************************************/
975 /* utf_get_number_of_u2s_for_buffer ********************************************
977 Determine number of UTF-16 u2s in the given UTF-8 buffer
979 CAUTION: This function is unsafe for input that was not checked
982 CAUTION: Use this function *only* when you want to convert a UTF-8 buffer
983 to an array of u2s (UTF-16) and want to know how many of them you will get.
984 All other uses of this function are probably wrong.

IN:
987 buffer........points to first char in buffer
988 blength.......number of _bytes_ in the buffer

RETURN VALUE:
991 the number of u2s needed to hold this string in UTF-16 encoding.
992 There is _no_ terminating zero included in this count.
994 NOTE: Unlike utf_get_number_of_u2s, this function never throws an
exception; a decode position that does not end exactly at the end of
the buffer is only caught by an assert.
997 *******************************************************************************/
999 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
1001 const char *endpos; /* points behind utf string */
1002 const char *utf_ptr; /* current position in utf text */
1003 u4 len = 0; /* number of unicode characters */
1006 endpos = utf_ptr + blength;
1008 while (utf_ptr < endpos) {
1010 /* next unicode character */
/* utf_nextu2 takes a char **, hence the cast that drops const; the decoded
   character itself is not needed here */
1011 utf_nextu2((char **)&utf_ptr);
/* decoding must stop exactly at the end of the buffer; this is verified
   only when assertions are compiled in */
1014 assert(utf_ptr == endpos);
1020 /* utf_get_number_of_u2s *******************************************************
1022 Determine number of UTF-16 u2s in the utf string.
1024 CAUTION: This function is unsafe for input that was not checked
1027 CAUTION: Use this function *only* when you want to convert a utf string
1028 to an array of u2s and want to know how many of them you will get.
1029 All other uses of this function are probably wrong.

IN:
1032 u............utf string

RETURN VALUE:
1035 the number of u2s needed to hold this string in UTF-16 encoding.
1036 There is _no_ terminating zero included in this count.
1037 XXX 0 if a NullPointerException has been thrown (see below)

An InternalError ("Illegal utf8 string") is thrown when decoding does
not stop exactly at the end of the string.
1039 *******************************************************************************/
1041 u4 utf_get_number_of_u2s(utf *u)
1043 char *endpos; /* points behind utf string */
1044 char *utf_ptr; /* current position in utf text */
1045 u4 len = 0; /* number of unicode characters */
1047 /* XXX this is probably not checked by most callers! Review this after */
1048 /* the invalid uses of this function have been eliminated */
1050 exceptions_throw_nullpointerexception();
1054 endpos = UTF_END(u);
1057 while (utf_ptr < endpos) {
1059 /* next unicode character */
/* only the advance of utf_ptr matters, the decoded character is dropped */
1060 utf_nextu2(&utf_ptr);
/* a multi-byte sequence that reaches past the end leaves utf_ptr != endpos */
1063 if (utf_ptr != endpos) {
1064 /* string ended abruptly */
1065 exceptions_throw_internalerror("Illegal utf8 string");
1073 /* utf8_safe_number_of_u2s *****************************************************
1075 Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1076 (For invalid UTF-8 the U+fffd replacement character will be counted.)
1078 This function is safe even for invalid UTF-8 strings.

IN:
1081 text..........zero-terminated(!) UTF-8 string (may be invalid)
1083 nbytes........strlen(text). (This is needed to completely emulate

RETURN VALUE:
1087 the number of u2s needed to hold this string in UTF-16 encoding.
1088 There is _no_ terminating zero included in this count.

The branch structure mirrors utf8_safe_convert_to_u2s: wherever that
function stores u2s, this one only counts them.
1090 *******************************************************************************/
1092 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1093 register const unsigned char *t;
1096 register const unsigned char *tlimit;
1104 assert(nbytes >= 0);
/* bytes are handled as unsigned so that the bit tests below are well defined */
1107 t = (const unsigned char *) text;
1108 tlimit = t + nbytes;
1110 /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1116 /* highest bit set, non-ASCII character */
1118 if ((byte & 0xe0) == 0xc0) {
1119 /* 2-byte: should be 110..... 10...... ? */
/* NOTE(review): no limit test is visible before this read; it presumably
   relies on the zero terminator the header demands - confirm */
1121 if ((*t++ & 0xc0) == 0x80)
1122 ; /* valid 2-byte */
1126 else if ((byte & 0xf0) == 0xe0) {
1127 /* 3-byte: should be 1110.... 10...... 10...... */
/* a sequence cut off by the end of the input counts as one more u2 */
1131 return len + 1; /* invalid, stop here */
1133 if ((*t++ & 0xc0) == 0x80) {
1134 if ((*t++ & 0xc0) == 0x80)
1135 ; /* valid 3-byte */
1142 else if ((byte & 0xf8) == 0xf0) {
1143 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1147 return len + 1; /* invalid, stop here */
1149 if (((byte1 = *t++) & 0xc0) == 0x80) {
1150 if (((byte2 = *t++) & 0xc0) == 0x80) {
1151 if (((byte3 = *t++) & 0xc0) == 0x80) {
1152 /* valid 4-byte UTF-8? */
/* 3 + 6 + 6 + 6 payload bits form the code point */
1153 value = ((byte & 0x07) << 18)
1154 | ((byte1 & 0x3f) << 12)
1155 | ((byte2 & 0x3f) << 6)
1156 | ((byte3 & 0x3f) );
/* 0x10FFFF is the largest code point that is accepted */
1158 if (value > 0x10FFFF)
1160 else if (value > 0xFFFF)
1161 len += 1; /* we need surrogates */
1163 ; /* 16bit suffice */
1174 else if ((byte & 0xfc) == 0xf8) {
1175 /* invalid 5-byte */
1177 return len + 1; /* invalid, stop here */
/* step over the continuation bytes (10xxxxxx) that follow the lead byte */
1180 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1183 else if ((byte & 0xfe) == 0xfc) {
1184 /* invalid 6-byte */
1186 return len + 1; /* invalid, stop here */
1189 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1201 /* ASCII character, common case */
1211 /* utf8_safe_convert_to_u2s ****************************************************
1213 Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1214 (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1215 Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1217 This function is safe even for invalid UTF-8 strings.

IN:
1220 text..........zero-terminated(!) UTF-8 string (may be invalid)
1222 nbytes........strlen(text). (This is needed to completely emulate
1224 buffer........a preallocated array of u2s to receive the decoded
1225 string. Use utf8_safe_number_of_u2s to get the
1226 required number of u2s for allocating this.

No terminating zero is appended to buffer by the code shown here, and
the function has no way to check the capacity of buffer.
1228 *******************************************************************************/
/* U+FFFD, stored for every invalid or truncated sequence */
1230 #define UNICODE_REPLACEMENT 0xfffd
1232 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1233 register const unsigned char *t;
1235 register const unsigned char *tlimit;
1243 assert(nbytes >= 0);
1245 t = (const unsigned char *) text;
1246 tlimit = t + nbytes;
1248 /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1254 /* highest bit set, non-ASCII character */
1256 if ((byte & 0xe0) == 0xc0) {
1257 /* 2-byte: should be 110..... 10...... */
/* NOTE(review): no limit test is visible before this read; it presumably
   relies on the zero terminator the header demands - confirm */
1259 if (((byte1 = *t++) & 0xc0) == 0x80) {
1260 /* valid 2-byte UTF-8 */
1261 *buffer++ = ((byte & 0x1f) << 6)
1262 | ((byte1 & 0x3f) );
1265 *buffer++ = UNICODE_REPLACEMENT;
1269 else if ((byte & 0xf0) == 0xe0) {
1270 /* 3-byte: should be 1110.... 10...... 10...... */
/* fewer than two bytes left: the sequence is cut off by the end of input */
1272 if (t + 2 > tlimit) {
1273 *buffer++ = UNICODE_REPLACEMENT;
1277 if (((byte1 = *t++) & 0xc0) == 0x80) {
1278 if (((byte2 = *t++) & 0xc0) == 0x80) {
1279 /* valid 3-byte UTF-8 */
1280 *buffer++ = ((byte & 0x0f) << 12)
1281 | ((byte1 & 0x3f) << 6)
1282 | ((byte2 & 0x3f) );
1285 *buffer++ = UNICODE_REPLACEMENT;
1290 *buffer++ = UNICODE_REPLACEMENT;
1294 else if ((byte & 0xf8) == 0xf0) {
1295 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1297 if (t + 3 > tlimit) {
1298 *buffer++ = UNICODE_REPLACEMENT;
1302 if (((byte1 = *t++) & 0xc0) == 0x80) {
1303 if (((byte2 = *t++) & 0xc0) == 0x80) {
1304 if (((byte3 = *t++) & 0xc0) == 0x80) {
1305 /* valid 4-byte UTF-8? */
1306 value = ((byte & 0x07) << 18)
1307 | ((byte1 & 0x3f) << 12)
1308 | ((byte2 & 0x3f) << 6)
1309 | ((byte3 & 0x3f) );
1311 if (value > 0x10FFFF) {
1312 *buffer++ = UNICODE_REPLACEMENT;
1314 else if (value > 0xFFFF) {
1315 /* we need surrogates */
/* (value >> 10) - 0x40 equals (value - 0x10000) >> 10, the high ten bits */
1316 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
/* the low ten bits go into the second surrogate */
1317 *buffer++ = 0xdc00 | (value & 0x03ff);
1320 *buffer++ = value; /* 16bit suffice */
1323 *buffer++ = UNICODE_REPLACEMENT;
1328 *buffer++ = UNICODE_REPLACEMENT;
1333 *buffer++ = UNICODE_REPLACEMENT;
/* 5-byte form: never valid, it always yields a replacement character */
1337 else if ((byte & 0xfc) == 0xf8) {
1338 if (t + 4 > tlimit) {
1339 *buffer++ = UNICODE_REPLACEMENT;
/* step over the continuation bytes (10xxxxxx) that follow the lead byte */
1344 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1346 *buffer++ = UNICODE_REPLACEMENT;
/* 6-byte form: never valid either */
1348 else if ((byte & 0xfe) == 0xfc) {
1349 if (t + 5 > tlimit) {
1350 *buffer++ = UNICODE_REPLACEMENT;
1355 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1357 *buffer++ = UNICODE_REPLACEMENT;
1360 *buffer++ = UNICODE_REPLACEMENT;
1368 /* ASCII character, common case */
1376 /* u2_utflength ****************************************************************
1378 Returns the utf length in bytes of a u2 array.
1380 *******************************************************************************/
/* Accumulates in result_len the number of bytes needed to store the
   u2_length code units of `text` as utf. */
1382 u4 u2_utflength(u2 *text, u4 u2_length)
1384 u4 result_len = 0; /* utf length in bytes */
1385 u2 ch; /* current unicode character */
1388 for (len = 0; len < u2_length; len++) {
1389 /* next unicode character */
1392 /* determine bytes required to store unicode character as utf */
/* ch == 0 fails this test and is classified by the `< 0x800` branch, so a
   zero character is not counted like the other values below 0x80
   (presumably because zero is stored as the two bytes c0 80 -- confirm) */
1393 if (ch && (ch < 0x80))
1395 else if (ch < 0x800)
1405 /* utf_copy ********************************************************************
1407 Copy the given utf string byte-for-byte to a buffer.
1410 buffer.......the buffer
1411 u............the utf string
1413 *******************************************************************************/
1415 void utf_copy(char *buffer, utf *u)
1417 /* our utf strings are zero-terminated (done by utf_new) */
1418 MCOPY(buffer, u->text, char, u->blength + 1);
1422 /* utf_cat *********************************************************************
1424 Append the given utf string byte-for-byte to a buffer.
1427 buffer.......the buffer
1428 u............the utf string
1430 *******************************************************************************/
1432 void utf_cat(char *buffer, utf *u)
1434 /* our utf strings are zero-terminated (done by utf_new) */
1435 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1439 /* utf_copy_classname **********************************************************
1441 Copy the given utf classname byte-for-byte to a buffer.
1442 '/' is replaced by '.'
1445 buffer.......the buffer
1446 u............the utf string
1448 *******************************************************************************/
1450 void utf_copy_classname(char *buffer, utf *u)
/* endptr is placed one byte behind UTF_END(u), so the loop below also
   visits the terminating zero byte */
1459 endptr = UTF_END(u) + 1; /* utfs are zero-terminated by utf_new */
1461 while (srcptr != endptr) {
1470 /* utf_cat_classname ***********************************************************
1472 Append the given utf classname byte-for-byte to a buffer.
1473 '/' is replaced by '.'
1476 buffer.......the buffer
1477 u............the utf string
1479 *******************************************************************************/
1481 void utf_cat_classname(char *buffer, utf *u)
1483 utf_copy_classname(buffer + strlen(buffer), u);
1486 /* utf_display_printable_ascii *************************************************
1488 Write utf symbol to stdout (for debugging purposes).
1489 Non-printable and non-ASCII characters are printed as '?'.
1491 *******************************************************************************/
1493 void utf_display_printable_ascii(utf *u)
1495 char *endpos; /* points behind utf string */
1496 char *utf_ptr; /* current position in utf text */
1504 endpos = UTF_END(u);
1507 while (utf_ptr < endpos) {
1508 /* read next unicode character */
/* utf_nextu2 receives the address of utf_ptr (presumably to advance it past
   the decoded character -- confirm) */
1510 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the accepted range includes 127 (DEL), which is a control
   character although the function is documented to print only printable
   ASCII -- confirm whether `c < 127` was intended */
1512 if ((c >= 32) && (c <= 127))
1522 /* utf_display_printable_ascii_classname ***************************************
1524 Write utf symbol to stdout with `/' converted to `.' (for debugging
1526 Non-printable and non-ASCII characters are printed as '?'.
1528 *******************************************************************************/
1530 void utf_display_printable_ascii_classname(utf *u)
1532 char *endpos; /* points behind utf string */
1533 char *utf_ptr; /* current position in utf text */
1541 endpos = UTF_END(u);
1544 while (utf_ptr < endpos) {
1545 /* read next unicode character */
/* utf_nextu2 receives the address of utf_ptr (presumably to advance it past
   the decoded character -- confirm) */
1547 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the accepted range includes 127 (DEL), which is a control
   character -- confirm whether `c < 127` was intended */
1552 if ((c >= 32) && (c <= 127))
1562 /* utf_sprint_convert_to_latin1 ************************************************
1564 Write utf symbol into c-string (for debugging purposes).
1565 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1568 *******************************************************************************/
/* NOTE(review): no buffer size is passed in, so the caller has to provide a
   buffer that is large enough for the converted string. */
1570 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1572 char *endpos; /* points behind utf string */
1573 char *utf_ptr; /* current position in utf text */
/* NOTE(review): pos is a u2, so the index would wrap around if more
   characters than a u2 can count were written -- confirm inputs are bounded */
1574 u2 pos = 0; /* position in c-string */
/* the literal text "NULL" is written instead of a converted string
   (presumably when u is NULL -- confirm) */
1577 strcpy(buffer, "NULL");
1581 endpos = UTF_END(u);
1584 while (utf_ptr < endpos)
1585 /* copy next unicode character */
/* the u2 returned by utf_nextu2 is narrowed to char by this assignment,
   which is why characters outside Latin-1 do not survive the conversion */
1586 buffer[pos++] = utf_nextu2(&utf_ptr);
1588 /* terminate string */
1593 /* utf_sprint_convert_to_latin1_classname **************************************
1595 Write utf symbol into c-string with `/' converted to `.' (for debugging
1597 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1600 *******************************************************************************/
/* NOTE(review): no buffer size is passed in, so the caller has to provide a
   buffer that is large enough for the converted string. */
1602 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1604 char *endpos; /* points behind utf string */
1605 char *utf_ptr; /* current position in utf text */
/* NOTE(review): pos is a u2, so the index would wrap around if more
   characters than a u2 can count were written -- confirm inputs are bounded */
1606 u2 pos = 0; /* position in c-string */
/* the literal text "NULL" is written instead of a converted string
   (presumably when u is NULL -- confirm) */
1609 strcpy(buffer, "NULL");
1613 endpos = UTF_END(u);
1616 while (utf_ptr < endpos) {
1617 /* copy next unicode character */
1618 u2 c = utf_nextu2(&utf_ptr);
/* package separators are shown in the dotted class name notation */
1619 if (c == '/') c = '.';
1623 /* terminate string */
1628 /* utf_strcat_convert_to_latin1 ************************************************
1630 Like libc strcat, but uses an utf8 string.
1631 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1634 *******************************************************************************/
1636 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1638 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1642 /* utf_strcat_convert_to_latin1_classname **************************************
1644 Like libc strcat, but uses an utf8 string.
1645 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1648 *******************************************************************************/
1650 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1652 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1656 /* utf_fprint_printable_ascii **************************************************
1658 Write utf symbol into file.
1659 Non-printable and non-ASCII characters are printed as '?'.
1661 *******************************************************************************/
1663 void utf_fprint_printable_ascii(FILE *file, utf *u)
1665 char *endpos; /* points behind utf string */
1666 char *utf_ptr; /* current position in utf text */
1671 endpos = UTF_END(u);
1674 while (utf_ptr < endpos) {
1675 /* read next unicode character */
1676 u2 c = utf_nextu2(&utf_ptr);
/* characters in 32..127 are written unchanged, everything else as '?'
   NOTE(review): 127 (DEL) is a control character -- confirm whether
   `c < 127` was intended */
1678 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1679 else fprintf(file, "?");
1684 /* utf_fprint_printable_ascii_classname ****************************************
1686 Write utf symbol into file with `/' converted to `.'.
1687 Non-printable and non-ASCII characters are printed as '?'.
1689 *******************************************************************************/
1691 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1693 char *endpos; /* points behind utf string */
1694 char *utf_ptr; /* current position in utf text */
1699 endpos = UTF_END(u);
1702 while (utf_ptr < endpos) {
1703 /* read next unicode character */
1704 u2 c = utf_nextu2(&utf_ptr);
/* package separators are shown in the dotted class name notation */
1705 if (c == '/') c = '.';
/* characters in 32..127 are written unchanged, everything else as '?'
   NOTE(review): 127 (DEL) is a control character -- confirm whether
   `c < 127` was intended */
1707 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1708 else fprintf(file, "?");
1713 /* is_valid_utf ****************************************************************
1715 Return true if the given string is a valid UTF-8 string.
1717 utf_ptr...points to first character
1718 end_pos...points after last character
1720 *******************************************************************************/
1722 /* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */
1724 bool is_valid_utf(char *utf_ptr, char *end_pos)
/* a reversed range is rejected; otherwise `bytes` holds the number of bytes
   that still have to be checked */
1731 if (end_pos < utf_ptr) return false;
1732 bytes = end_pos - utf_ptr;
1736 if (!c) return false; /* 0x00 is not allowed */
1737 if ((c & 0x80) == 0) continue; /* ASCII */
/* classify the lead byte: len is the number of continuation bytes that
   have to follow it */
1739 if ((c & 0xe0) == 0xc0) len = 1; /* 110x xxxx */
1740 else if ((c & 0xf0) == 0xe0) len = 2; /* 1110 xxxx */
1741 else if ((c & 0xf8) == 0xf0) len = 3; /* 1111 0xxx */
1742 else if ((c & 0xfc) == 0xf8) len = 4; /* 1111 10xx */
1743 else if ((c & 0xfe) == 0xfc) len = 5; /* 1111 110x */
1744 else return false; /* invalid leading byte */
/* only sequences with at most two continuation bytes are accepted */
1746 if (len > 2) return false; /* Java limitation */
/* keep the payload bits of the lead byte: 0x3f >> len is 0x1f for len 1
   and 0x0f for len 2 */
1748 v = (unsigned long)c & (0x3f >> len);
/* the continuation bytes are taken off the remaining byte count up front */
1750 if ((bytes -= len) < 0) return false; /* missing bytes */
1752 for (i = len; i--; ) {
1754 if ((c & 0xc0) != 0x80) /* 10xx xxxx */
/* append the six payload bits of the continuation byte */
1756 v = (v << 6) | (c & 0x3f);
/* NOTE(review): the guarding condition is not visible here; presumably this
   applies to a decoded value of zero, which must use the two-byte form
   c0 80 -- confirm */
1760 if (len != 1) return false; /* Java special */
1763 /* Sun Java seems to allow overlong UTF-8 encodings */
1765 /* if (v < min_codepoint[len]) */
1766 /* XXX throw exception? */
1769 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
1770 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
1772 /* even these seem to be allowed */
1773 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
1780 /* is_valid_name ***************************************************************
1782 Return true if the given string may be used as a class/field/method
1783 name. (Currently this only disallows empty strings and control
1786 NOTE: The string is assumed to have passed is_valid_utf!
1788 utf_ptr...points to first character
1789 end_pos...points after last character
1791 *******************************************************************************/
/* Checks whether the byte range [utf_ptr, end_pos) may serve as a
   class/field/method name: it has to be non-empty, must not contain a byte
   below 0x20 and must not contain the two-byte encoding c0 80 of the zero
   character. The input is expected to have passed is_valid_utf.
   Returns true if the name is acceptable, false otherwise. */
bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters */

		/* look at the following byte only while it is still inside the
		   range, so a trailing 0xc0 never causes a read at end_pos */
		if (c == 0xc0 && utf_ptr < end_pos
			&& (unsigned char) *utf_ptr == 0x80)
			return false;               /* disallow zero */
	}

	return true;
}
1808 bool is_valid_name_utf(utf *u)
1810 return is_valid_name(u->text, UTF_END(u));
1814 /* utf_show ********************************************************************
1816 Writes the utf symbols in the utfhash to stdout and displays the
1817 number of external hash chains grouped according to the chainlength
1818 (for debugging purposes).
1820 *******************************************************************************/
1822 #if !defined(NDEBUG)
1826 #define CHAIN_LIMIT 20 /* limit for separated enumeration */
/* chain_count[k] is the number of hash slots whose chain has length k; the
   last element collects all longer chains */
1828 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1829 u4 max_chainlength = 0; /* maximum length of the chains */
1830 u4 sum_chainlength = 0; /* sum of the chainlengths */
1831 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1834 printf("UTF-HASH:\n");
1836 /* show element of utf-hashtable */
1838 for (i = 0; i < hashtable_utf->size; i++) {
1839 utf *u = hashtable_utf->ptr[i];
1842 printf("SLOT %d: ", (int) i);
1846 utf_display_printable_ascii(u);
1854 printf("UTF-HASH: %d slots for %d entries\n",
1855 (int) hashtable_utf->size, (int) hashtable_utf->entries );
/* the percentages printed further down divide by hashtable_utf->entries, so
   an empty table is treated separately here */
1857 if (hashtable_utf->entries == 0)
1860 printf("chains:\n chainlength number of chains %% of utfstrings\n");
1862 for (i=0;i<CHAIN_LIMIT;i++)
1865 /* count numbers of hashchains according to their length */
1866 for (i=0; i<hashtable_utf->size; i++) {
1868 utf *u = (utf*) hashtable_utf->ptr[i];
1869 u4 chain_length = 0;
1871 /* determine chainlength */
1877 /* update sum of all chainlengths */
1878 sum_chainlength+=chain_length;
1880 /* determine the maximum length of the chains */
1881 if (chain_length>max_chainlength)
1882 max_chainlength = chain_length;
1884 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
/* NOTE(review): the test uses >=CHAIN_LIMIT, so chains of length exactly
   CHAIN_LIMIT-1 are counted in chain_count[CHAIN_LIMIT-1] but their symbols
   are not added to beyond_limit; the percentage printed for the last row is
   therefore too small whenever such chains exist -- confirm the intended
   bound */
1885 if (chain_length>=CHAIN_LIMIT) {
1886 beyond_limit+=chain_length;
1887 chain_length=CHAIN_LIMIT-1;
1890 /* update number of hashchains of current length */
1891 chain_count[chain_length]++;
1894 /* display results */
/* a chain of length i holds i symbols, hence chain_count[i]*i symbols in
   total for that row
   NOTE(review): chain_count[] and max_chainlength are u4 but are printed
   with %d -- confirm this is acceptable for the expected value range */
1895 for (i=1;i<CHAIN_LIMIT-1;i++)
1896 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1898 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1901 printf("max. chainlength:%5d\n",max_chainlength);
1903 /* avg. chainlength = sum of chainlengths / number of chains */
/* the number of chains is the slot count minus the slots counted in
   chain_count[0], i.e. the empty ones */
1904 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1906 #endif /* !defined(NDEBUG) */
1910 * These are local overrides for various environment variables in Emacs.
1911 * Please do not remove this and leave it at the end of the file, where
1912 * Emacs will automagically detect them.
1913 * ---------------------------------------------------------------------
1916 * indent-tabs-mode: t
1920 * vim:noexpandtab:sw=4:ts=4: