1 /* src/vm/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006, 2007, 2008
4 CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
6 This file is part of CACAO.
8 This program is free software; you can redistribute it and/or
9 modify it under the terms of the GNU General Public License as
10 published by the Free Software Foundation; either version 2, or (at
11 your option) any later version.
13 This program is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include "mm/memory.hpp"
35 #include "threads/mutex.hpp"
37 #include "toolbox/hashtable.h"
39 #include "vm/exceptions.hpp"
40 #include "vm/options.h"
42 #if defined(ENABLE_STATISTICS)
43 # include "vm/statistics.h"
49 /* global variables ***********************************************************/
51 /* hashsize must be power of 2 */
/* Initial number of slots of the utf8 symbol table. utf_new selects a slot
   with key & (size - 1), which is why the value has to be a power of two. */
53 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
/* The symbol table itself. It is allocated by the initialisation code of
   this file and replaced by a table of twice the size whenever utf_new
   finds more than two entries per slot on average. */
55 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
58 /* utf-symbols for pointer comparison of frequently used strings **************/
/* Names of core classes and interfaces. Each pointer is assigned once by the
   initialisation code of this file from a slash-separated class name passed
   to utf_new_char. */
60 utf *utf_java_lang_Object;
62 utf *utf_java_lang_Class;
63 utf *utf_java_lang_ClassLoader;
64 utf *utf_java_lang_Cloneable;
65 utf *utf_java_lang_SecurityManager;
66 utf *utf_java_lang_String;
67 utf *utf_java_lang_ThreadGroup;
68 utf *utf_java_lang_ref_SoftReference;
69 utf *utf_java_lang_ref_WeakReference;
70 utf *utf_java_lang_ref_PhantomReference;
71 utf *utf_java_io_Serializable;
/* Throwable and the Error class names. */
73 utf *utf_java_lang_Throwable;
74 utf *utf_java_lang_Error;
76 utf *utf_java_lang_AbstractMethodError;
77 utf *utf_java_lang_ClassCircularityError;
78 utf *utf_java_lang_ClassFormatError;
79 utf *utf_java_lang_ExceptionInInitializerError;
80 utf *utf_java_lang_IncompatibleClassChangeError;
81 utf *utf_java_lang_InstantiationError;
82 utf *utf_java_lang_InternalError;
83 utf *utf_java_lang_LinkageError;
84 utf *utf_java_lang_NoClassDefFoundError;
85 utf *utf_java_lang_NoSuchFieldError;
86 utf *utf_java_lang_NoSuchMethodError;
87 utf *utf_java_lang_OutOfMemoryError;
88 utf *utf_java_lang_UnsatisfiedLinkError;
89 utf *utf_java_lang_UnsupportedClassVersionError;
90 utf *utf_java_lang_VerifyError;
91 utf *utf_java_lang_VirtualMachineError;
/* Exception class names. */
93 utf *utf_java_lang_Exception;
95 utf *utf_java_lang_ArithmeticException;
96 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
97 utf *utf_java_lang_ArrayStoreException;
98 utf *utf_java_lang_ClassCastException;
99 utf *utf_java_lang_ClassNotFoundException;
100 utf *utf_java_lang_CloneNotSupportedException;
101 utf *utf_java_lang_IllegalAccessException;
102 utf *utf_java_lang_IllegalArgumentException;
103 utf *utf_java_lang_IllegalMonitorStateException;
104 utf *utf_java_lang_InstantiationException;
105 utf *utf_java_lang_InterruptedException;
106 utf *utf_java_lang_NegativeArraySizeException;
107 utf *utf_java_lang_NullPointerException;
108 utf *utf_java_lang_RuntimeException;
109 utf *utf_java_lang_StringIndexOutOfBoundsException;
111 utf *utf_java_lang_reflect_InvocationTargetException;
113 utf *utf_java_security_PrivilegedActionException;
/* java/lang/Void and the java/lang wrapper class names. */
115 #if defined(ENABLE_JAVASE)
116 utf* utf_java_lang_Void;
119 utf* utf_java_lang_Boolean;
120 utf* utf_java_lang_Byte;
121 utf* utf_java_lang_Character;
122 utf* utf_java_lang_Short;
123 utf* utf_java_lang_Integer;
124 utf* utf_java_lang_Long;
125 utf* utf_java_lang_Float;
126 utf* utf_java_lang_Double;
/* Stack trace, reflection and utility class names; which of them exist
   depends on the build options tested below. */
128 #if defined(ENABLE_JAVASE)
129 utf *utf_java_lang_StackTraceElement;
130 utf *utf_java_lang_reflect_Constructor;
131 utf *utf_java_lang_reflect_Field;
132 utf *utf_java_lang_reflect_Method;
134 # if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
135 utf *utf_java_lang_reflect_VMConstructor;
136 utf *utf_java_lang_reflect_VMField;
137 utf *utf_java_lang_reflect_VMMethod;
140 utf *utf_java_util_Vector;
/* Symbols whose text is the literal shown in the trailing comment. */
143 utf *utf_InnerClasses; /* InnerClasses */
144 utf *utf_ConstantValue; /* ConstantValue */
145 utf *utf_Code; /* Code */
146 utf *utf_Exceptions; /* Exceptions */
147 utf *utf_LineNumberTable; /* LineNumberTable */
148 utf *utf_SourceFile; /* SourceFile */
150 #if defined(ENABLE_JAVASE)
151 utf *utf_EnclosingMethod;
153 utf *utf_StackMapTable;
155 #if defined(ENABLE_ANNOTATIONS)
156 utf *utf_RuntimeVisibleAnnotations; /* RuntimeVisibleAnnotations */
157 utf *utf_RuntimeInvisibleAnnotations; /* RuntimeInvisibleAnnotations */
158 utf *utf_RuntimeVisibleParameterAnnotations; /* RuntimeVisibleParameterAnnotations */
159 utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
160 utf *utf_AnnotationDefault; /* AnnotationDefault */
/* Method names. */
164 utf *utf_init; /* <init> */
165 utf *utf_clinit; /* <clinit> */
166 utf *utf_clone; /* clone */
167 utf *utf_finalize; /* finalize */
170 utf *utf_run; /* run */
176 utf *utf_removeThread;
179 utf *utf_uncaughtException;
182 utf *utf_fillInStackTrace;
184 utf *utf_getSystemClassLoader;
187 utf *utf_loadClassInternal;
188 utf *utf_printStackTrace;
/* Message text "/ by zero". */
190 utf *utf_division_by_zero;
/* Method descriptors; where present, the trailing comment gives the text. */
201 utf *utf_void__void; /* ()V */
202 utf *utf_boolean__void; /* (Z)V */
203 utf *utf_byte__void; /* (B)V */
204 utf *utf_char__void; /* (C)V */
205 utf *utf_short__void; /* (S)V */
206 utf *utf_int__void; /* (I)V */
207 utf *utf_long__void; /* (J)V */
208 utf *utf_float__void; /* (F)V */
209 utf *utf_double__void; /* (D)V */
211 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
212 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
213 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
214 utf *utf_java_lang_ClassLoader_java_lang_String__J;
215 utf *utf_java_lang_Exception__V; /* (Ljava/lang/Exception;)V */
216 utf *utf_java_lang_Object__java_lang_Object;
217 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
218 utf *utf_java_lang_String__java_lang_Class;
219 utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */
220 utf *utf_java_lang_Thread_java_lang_Throwable__V;
221 utf *utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V;
222 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
223 utf *utf_java_lang_Throwable__java_lang_Throwable;
/* Placeholder names; the text of both starts with a tab character. */
225 utf *utf_not_named_yet; /* special name for unnamed classes */
227 utf *array_packagename;
230 /* utf_init ********************************************************************
232 Initializes the utf8 subsystem.
234 *******************************************************************************/
238 TRACESUBSYSTEMINITIALIZATION("utf8_init");
240 /* create utf8 hashtable */
242 hashtable_utf = NEW(hashtable);
244 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
246 #if defined(ENABLE_STATISTICS)
248 count_utf_len += sizeof(utf*) * hashtable_utf->size;
251 /* create utf-symbols for pointer comparison of frequently used strings */
253 utf_java_lang_Object = utf_new_char("java/lang/Object");
255 utf_java_lang_Class = utf_new_char("java/lang/Class");
256 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
257 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
258 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
259 utf_java_lang_String = utf_new_char("java/lang/String");
260 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
262 utf_java_lang_ref_SoftReference =
263 utf_new_char("java/lang/ref/SoftReference");
265 utf_java_lang_ref_WeakReference =
266 utf_new_char("java/lang/ref/WeakReference");
268 utf_java_lang_ref_PhantomReference =
269 utf_new_char("java/lang/ref/PhantomReference");
271 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
273 utf_java_lang_Throwable = utf_new_char("java/lang/Throwable");
274 utf_java_lang_Error = utf_new_char("java/lang/Error");
276 utf_java_lang_ClassCircularityError =
277 utf_new_char("java/lang/ClassCircularityError");
279 utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
281 utf_java_lang_ExceptionInInitializerError =
282 utf_new_char("java/lang/ExceptionInInitializerError");
284 utf_java_lang_IncompatibleClassChangeError =
285 utf_new_char("java/lang/IncompatibleClassChangeError");
287 utf_java_lang_InstantiationError =
288 utf_new_char("java/lang/InstantiationError");
290 utf_java_lang_InternalError = utf_new_char("java/lang/InternalError");
291 utf_java_lang_LinkageError = utf_new_char("java/lang/LinkageError");
293 utf_java_lang_NoClassDefFoundError =
294 utf_new_char("java/lang/NoClassDefFoundError");
296 utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
298 utf_java_lang_UnsatisfiedLinkError =
299 utf_new_char("java/lang/UnsatisfiedLinkError");
301 utf_java_lang_UnsupportedClassVersionError =
302 utf_new_char("java/lang/UnsupportedClassVersionError");
304 utf_java_lang_VerifyError = utf_new_char("java/lang/VerifyError");
306 utf_java_lang_VirtualMachineError =
307 utf_new_char("java/lang/VirtualMachineError");
309 #if defined(ENABLE_JAVASE)
310 utf_java_lang_AbstractMethodError =
311 utf_new_char("java/lang/AbstractMethodError");
313 utf_java_lang_NoSuchFieldError =
314 utf_new_char("java/lang/NoSuchFieldError");
316 utf_java_lang_NoSuchMethodError =
317 utf_new_char("java/lang/NoSuchMethodError");
320 utf_java_lang_Exception = utf_new_char("java/lang/Exception");
322 utf_java_lang_ArithmeticException =
323 utf_new_char("java/lang/ArithmeticException");
325 utf_java_lang_ArrayIndexOutOfBoundsException =
326 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
328 utf_java_lang_ArrayStoreException =
329 utf_new_char("java/lang/ArrayStoreException");
331 utf_java_lang_ClassCastException =
332 utf_new_char("java/lang/ClassCastException");
334 utf_java_lang_ClassNotFoundException =
335 utf_new_char("java/lang/ClassNotFoundException");
337 utf_java_lang_CloneNotSupportedException =
338 utf_new_char("java/lang/CloneNotSupportedException");
340 utf_java_lang_IllegalAccessException =
341 utf_new_char("java/lang/IllegalAccessException");
343 utf_java_lang_IllegalArgumentException =
344 utf_new_char("java/lang/IllegalArgumentException");
346 utf_java_lang_IllegalMonitorStateException =
347 utf_new_char("java/lang/IllegalMonitorStateException");
349 utf_java_lang_InstantiationException =
350 utf_new_char("java/lang/InstantiationException");
352 utf_java_lang_InterruptedException =
353 utf_new_char("java/lang/InterruptedException");
355 utf_java_lang_NegativeArraySizeException =
356 utf_new_char("java/lang/NegativeArraySizeException");
358 utf_java_lang_NullPointerException =
359 utf_new_char("java/lang/NullPointerException");
361 utf_java_lang_RuntimeException =
362 utf_new_char("java/lang/RuntimeException");
364 utf_java_lang_StringIndexOutOfBoundsException =
365 utf_new_char("java/lang/StringIndexOutOfBoundsException");
367 utf_java_lang_reflect_InvocationTargetException =
368 utf_new_char("java/lang/reflect/InvocationTargetException");
370 utf_java_security_PrivilegedActionException =
371 utf_new_char("java/security/PrivilegedActionException");
373 #if defined(ENABLE_JAVASE)
374 utf_java_lang_Void = utf_new_char("java/lang/Void");
377 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
378 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
379 utf_java_lang_Character = utf_new_char("java/lang/Character");
380 utf_java_lang_Short = utf_new_char("java/lang/Short");
381 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
382 utf_java_lang_Long = utf_new_char("java/lang/Long");
383 utf_java_lang_Float = utf_new_char("java/lang/Float");
384 utf_java_lang_Double = utf_new_char("java/lang/Double");
386 #if defined(ENABLE_JAVASE)
387 utf_java_lang_StackTraceElement =
388 utf_new_char("java/lang/StackTraceElement");
390 utf_java_lang_reflect_Constructor =
391 utf_new_char("java/lang/reflect/Constructor");
393 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
394 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
396 # if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
397 utf_java_lang_reflect_VMConstructor = utf_new_char("java/lang/reflect/VMConstructor");
398 utf_java_lang_reflect_VMField = utf_new_char("java/lang/reflect/VMField");
399 utf_java_lang_reflect_VMMethod = utf_new_char("java/lang/reflect/VMMethod");
402 utf_java_util_Vector = utf_new_char("java/util/Vector");
405 utf_InnerClasses = utf_new_char("InnerClasses");
406 utf_ConstantValue = utf_new_char("ConstantValue");
407 utf_Code = utf_new_char("Code");
408 utf_Exceptions = utf_new_char("Exceptions");
409 utf_LineNumberTable = utf_new_char("LineNumberTable");
410 utf_SourceFile = utf_new_char("SourceFile");
412 #if defined(ENABLE_JAVASE)
413 utf_EnclosingMethod = utf_new_char("EnclosingMethod");
414 utf_Signature = utf_new_char("Signature");
415 utf_StackMapTable = utf_new_char("StackMapTable");
417 # if defined(ENABLE_ANNOTATIONS)
418 utf_RuntimeVisibleAnnotations = utf_new_char("RuntimeVisibleAnnotations");
419 utf_RuntimeInvisibleAnnotations = utf_new_char("RuntimeInvisibleAnnotations");
420 utf_RuntimeVisibleParameterAnnotations = utf_new_char("RuntimeVisibleParameterAnnotations");
421 utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
422 utf_AnnotationDefault = utf_new_char("AnnotationDefault");
426 utf_init = utf_new_char("<init>");
427 utf_clinit = utf_new_char("<clinit>");
428 utf_clone = utf_new_char("clone");
429 utf_finalize = utf_new_char("finalize");
430 utf_invoke = utf_new_char("invoke");
431 utf_main = utf_new_char("main");
432 utf_run = utf_new_char("run");
434 utf_add = utf_new_char("add");
435 utf_dispatch = utf_new_char("dispatch");
436 utf_remove = utf_new_char("remove");
437 utf_addThread = utf_new_char("addThread");
438 utf_removeThread = utf_new_char("removeThread");
439 utf_put = utf_new_char("put");
440 utf_get = utf_new_char("get");
441 utf_uncaughtException = utf_new_char("uncaughtException");
442 utf_value = utf_new_char("value");
444 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
445 utf_findNative = utf_new_char("findNative");
446 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
447 utf_initCause = utf_new_char("initCause");
448 utf_loadClass = utf_new_char("loadClass");
449 utf_loadClassInternal = utf_new_char("loadClassInternal");
450 utf_printStackTrace = utf_new_char("printStackTrace");
452 utf_division_by_zero = utf_new_char("/ by zero");
454 utf_Z = utf_new_char("Z");
455 utf_B = utf_new_char("B");
456 utf_C = utf_new_char("C");
457 utf_S = utf_new_char("S");
458 utf_I = utf_new_char("I");
459 utf_J = utf_new_char("J");
460 utf_F = utf_new_char("F");
461 utf_D = utf_new_char("D");
463 utf_void__void = utf_new_char("()V");
464 utf_boolean__void = utf_new_char("(Z)V");
465 utf_byte__void = utf_new_char("(B)V");
466 utf_char__void = utf_new_char("(C)V");
467 utf_short__void = utf_new_char("(S)V");
468 utf_int__void = utf_new_char("(I)V");
469 utf_long__void = utf_new_char("(J)V");
470 utf_float__void = utf_new_char("(F)V");
471 utf_double__void = utf_new_char("(D)V");
472 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
473 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
475 utf_void__java_lang_ClassLoader =
476 utf_new_char("()Ljava/lang/ClassLoader;");
478 utf_java_lang_ClassLoader_java_lang_String__J =
479 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
481 utf_java_lang_Exception__V = utf_new_char("(Ljava/lang/Exception;)V");
483 utf_java_lang_Object__java_lang_Object =
484 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
486 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
488 utf_java_lang_String__java_lang_Class =
489 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
491 utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V");
493 utf_java_lang_Thread_java_lang_Throwable__V =
494 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
496 utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V =
497 utf_new_char("(Ljava/lang/ThreadGroup;Ljava/lang/String;)V");
499 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
501 utf_java_lang_Throwable__java_lang_Throwable =
502 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
504 utf_null = utf_new_char("null");
505 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
506 array_packagename = utf_new_char("\t<the array package>");
510 /* utf_hashkey *****************************************************************
512 The hashkey is computed from the utf-text by using up to 8
513 characters. For utf-symbols longer than 15 characters 3 characters
514 are taken from the beginning and the end, 2 characters are taken from the middle.
517 *******************************************************************************/
/* Byte fetch helpers for utf_hashkey. Both expand to an expression over the
   enclosing function's text pointer: fbs reads the byte text points at, nbs
   advances text first and then reads, so every use of nbs moves the cursor.
   The val argument is pasted unparenthesised; only plain constants are
   passed in this file. */
519 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
520 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
/* Computes a hashkey for the length bytes at text from a bounded sample of
   those bytes: there is one dedicated case per short length, and the default
   case samples the beginning, the middle and the end of the string.

   NOTE(review): the return expressions use several nbs() terms in a single
   expression. Each term modifies text and the operands of ^ are unsequenced
   relative to each other, so the result formally depends on the evaluation
   order chosen by the compiler - confirm the targeted compilers evaluate
   left to right.
   NOTE(review): text is plain char; where char is signed, a byte >= 0x80 is
   sign-extended by the (u4) cast - confirm this is intended. */
522 u4 utf_hashkey(const char *text, u4 length)
/* start of the string, kept so the default case can reposition text */
524 const char *start_pos = text; /* pointer to utf text */
528 case 0: /* empty string */
531 case 1: return fbs(0);
532 case 2: return fbs(0) ^ nbs(3);
533 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
534 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
535 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
536 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
537 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
538 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
/* the following cases continue from a partial value kept in a */
545 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
554 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
563 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
575 return a ^ nbs(9) ^ nbs(10);
587 return a ^ nbs(9) ^ nbs(10);
598 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
609 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
611 default: /* 3 characters from beginning */
617 /* 2 characters from middle */
618 text = start_pos + (length / 2);
623 /* 3 characters from end */
/* nbs pre-increments, so the first byte read after this is start_pos + length - 3 */
624 text = start_pos + length - 4;
629 return a ^ nbs(10) ^ nbs(11);
633 /* utf_full_hashkey ************************************************************
635 This function computes a hash value using all bytes in the string.
637 The algorithm is the "One-at-a-time" algorithm as published
638 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
640 *******************************************************************************/
u4 utf_full_hashkey(const char *text, u4 length)
{
	const unsigned char *p;             /* current byte, read as unsigned     */
	const unsigned char *end;           /* one past the last byte to hash     */
	u4                   hash;          /* running hash value                 */

	/* Bytes are read through an unsigned char pointer so that values
	   >= 0x80 contribute 128..255 regardless of the signedness of char. */

	p    = (const unsigned char *) text;
	end  = p + length;
	hash = 0;

	/* one-at-a-time mixing step, applied once per input byte */

	while (p < end) {
		hash += *p++;
		hash += (hash << 10);
		hash ^= (hash >> 6);
	}

	/* final avalanche; an empty input therefore hashes to 0 */

	hash += (hash << 3);
	hash ^= (hash >> 11);
	hash += (hash << 15);

	return hash;
}
662 /* unicode_hashkey *************************************************************
664 Compute the hashkey of a unicode string.
666 *******************************************************************************/
668 u4 unicode_hashkey(u2 *text, u2 len)
670 return utf_hashkey((char *) text, len);
674 /* utf_new *********************************************************************
676 Creates a new utf-symbol, the text of the symbol is passed as a
677 u1-array. The function searches the utf-hashtable for a utf-symbol
678 with this text. On success the element is returned, otherwise a new
679 hashtable element is created.
681 If the number of entries in the hashtable exceeds twice the size of
682 the hashtable slots a reorganization of the hashtable is done and
683 the utf symbols are copied to a new hashtable with doubled size.
685 *******************************************************************************/
/* Interns the byte sequence text[0 .. length-1]: returns the element of
   hashtable_utf that already holds exactly these bytes, or creates, inserts
   and returns a new one. The table is accessed between Mutex_lock and
   Mutex_unlock on hashtable_utf->mutex. */
687 utf *utf_new(const char *text, u2 length)
689 u4 key; /* hashkey computed from utf-text */
690 u4 slot; /* slot in hashtable */
691 utf *u; /* hashtable element */
694 Mutex_lock(hashtable_utf->mutex);
696 #if defined(ENABLE_STATISTICS)
701 key = utf_hashkey(text, length);
/* the table size is a power of two, so masking the key selects the slot */
702 slot = key & (hashtable_utf->size - 1);
703 u = hashtable_utf->ptr[slot];
705 /* search external hash chain for utf-symbol */
/* cheap length test first, byte-wise comparison only for equal lengths */
708 if (u->blength == length) {
709 /* compare text of hashtable elements */
711 for (i = 0; i < length; i++)
712 if (text[i] != u->text[i])
715 #if defined(ENABLE_STATISTICS)
717 count_utf_new_found++;
720 /* symbol found in hashtable */
722 Mutex_unlock(hashtable_utf->mutex);
728 u = u->hashlink; /* next element in external chain */
731 /* location in hashtable found, create new utf element */
735 u->blength = length; /* length in bytes of utfstring */
/* the new element becomes the head of the chain of its slot */
736 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
/* one extra byte for the terminating zero written below */
737 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
739 memcpy(u->text, text, length); /* copy utf-text */
740 u->text[length] = '\0';
742 #if defined(ENABLE_STATISTICS)
744 count_utf_len += sizeof(utf) + length + 1;
747 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
748 hashtable_utf->entries++; /* update number of entries */
750 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
752 /* reorganization of hashtable, average length of the external
753 chains is approx. 2 */
755 hashtable *newhash; /* the new hashtable */
761 /* create new hashtable, double the size */
763 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
765 #if defined(ENABLE_STATISTICS)
767 count_utf_len += sizeof(utf*) * hashtable_utf->size;
770 /* transfer elements to new hashtable */
/* u and slot are reused as scratch variables during the transfer */
772 for (i = 0; i < hashtable_utf->size; i++) {
773 u = hashtable_utf->ptr[i];
/* the slot has to be recomputed because the mask of the new table differs */
777 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
779 u->hashlink = (utf *) newhash->ptr[slot];
780 newhash->ptr[slot] = u;
782 /* follow link in external hash chain */
788 /* dispose old table */
790 hashtable_free(hashtable_utf);
792 hashtable_utf = newhash;
/* NOTE(review): after a resize hashtable_utf refers to the new table, so this
   unlock goes through the new table's mutex field. That matches the lock
   taken above only if hashtable_resize carries the mutex over - confirm. */
795 Mutex_unlock(hashtable_utf->mutex);
801 /* utf_new_u2 ******************************************************************
803 Make utf symbol from u2 array; if isclassname is true, '.' is replaced by '/'.
806 *******************************************************************************/
/* Encodes unicode_length u2 values starting at unicode_pos into a temporary
   byte buffer and interns the result with utf_new. The buffer is sized by
   u2_utflength, allocated with MNEW and released with MFREE after utf_new
   has been called. */
808 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
810 char *buffer; /* memory buffer for unicode characters */
811 char *pos; /* pointer to current position in buffer */
812 u4 left; /* unicode characters left */
813 u4 buflength; /* utf length in bytes of the u2 array */
814 utf *result; /* resulting utf-string */
817 /* determine utf length in bytes and allocate memory */
819 buflength = u2_utflength(unicode_pos, unicode_length);
820 buffer = MNEW(char, buflength);
/* i counts the characters consumed; the post-increment inside the condition
   makes the body run exactly unicode_length times */
825 for (i = 0; i++ < unicode_length; unicode_pos++) {
826 /* next unicode character */
/* one byte for 0x01..0x7F; a zero character fails this test and is handled
   by the two-byte branch below */
829 if ((c != 0) && (c < 0x80)) {
/* left is unsigned, the cast makes an underflow visible as a negative value */
832 if ((int) left < 0) break;
833 /* convert classname */
834 if (isclassname && c == '.')
/* two bytes for values below 0x800 */
839 } else if (c < 0x800) {
841 unsigned char high = c >> 6;
842 unsigned char low = c & 0x3F;
844 if ((int) left < 0) break;
845 *pos++ = high | 0xC0;
/* remaining values take three bytes */
851 char mid = (c >> 6) & 0x3F;
854 if ((int) left < 0) break;
855 *pos++ = high | 0xE0;
861 /* insert utf-string into symbol-table */
/* NOTE(review): buflength is a u4 while utf_new takes a u2 length, so the
   value is narrowed at this call - confirm longer results cannot occur. */
862 result = utf_new(buffer,buflength);
864 MFREE(buffer, char, buflength);
870 /* utf_new_char ****************************************************************
872 Creates a new utf symbol, the text for this symbol is passed as a
873 c-string ( = char* ).
875 *******************************************************************************/
877 utf *utf_new_char(const char *text)
879 return utf_new(text, strlen(text));
883 /* utf_new_char_classname ******************************************************
885 Creates a new utf symbol, the text for this symbol is passed as a
886 c-string ( = char* ) "." characters are going to be replaced by
887 "/". Since the above function is used often, this is a separate
888 function, instead of an if.
890 *******************************************************************************/
892 utf *utf_new_char_classname(const char *text)
894 if (strchr(text, '.')) {
895 char *txt = strdup(text);
896 char *end = txt + strlen(txt);
900 for (c = txt; c < end; c++)
901 if (*c == '.') *c = '/';
903 tmpRes = utf_new(txt, strlen(txt));
909 return utf_new(text, strlen(text));
913 /* utf_nextu2 ******************************************************************
915 Read the next unicode character from the utf string and increment
916 the utf-string pointer accordingly.
918 CAUTION: This function is unsafe for input that was not checked for UTF-8 validity.
921 *******************************************************************************/
/* Decodes the character that starts at *utf_ptr and moves *utf_ptr forward
   by the number of bytes that were consumed. No length is passed in: the
   continuation bytes utf[1] and utf[2] are read whenever the lead byte asks
   for them, so the caller has to guarantee that they are readable. */
923 u2 utf_nextu2(char **utf_ptr)
925 /* uncompressed unicode character */
927 /* current position in utf text */
928 unsigned char *utf = (unsigned char *) (*utf_ptr);
929 /* bytes representing the unicode character */
930 unsigned char ch1, ch2, ch3;
931 /* number of bytes used to represent the unicode character */
/* the upper four bits of the lead byte select the encoding length */
934 switch ((ch1 = utf[0]) >> 4) {
935 default: /* 1 byte */
939 case 0xD: /* 2 bytes */
/* a continuation byte must have the form 10xxxxxx */
940 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
/* 5 payload bits from the lead byte, 6 from the continuation byte */
941 unsigned char high = ch1 & 0x1F;
942 unsigned char low = ch2 & 0x3F;
943 unicode_char = (high << 6) + low;
948 case 0xE: /* 2 or 3 bytes */
949 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
950 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
/* 4 + 6 + 6 payload bits */
951 unsigned char low = ch3 & 0x3f;
952 unsigned char mid = ch2 & 0x3f;
953 unsigned char high = ch1 & 0x0f;
954 unicode_char = (((high << 6) + mid) << 6) + low;
962 /* update position in utf-text */
963 *utf_ptr = (char *) (utf + len);
969 /* utf_bytes *******************************************************************
971 Determine number of bytes (aka. octets) in the utf string.
974 u............utf string
977 The number of octets of this utf string.
978 There is _no_ terminating zero included in this count.
980 *******************************************************************************/
988 /* utf_get_number_of_u2s_for_buffer ********************************************
990 Determine number of UTF-16 u2s in the given UTF-8 buffer
992 CAUTION: This function is unsafe for input that was not checked for UTF-8 validity.
995 CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
996 to an array of u2s (UTF-16) and want to know how many of them you will get.
997 All other uses of this function are probably wrong.
1000 buffer........points to first char in buffer
1001 blength.......number of _bytes_ in the buffer
1004 the number of u2s needed to hold this string in UTF-16 encoding.
1005 There is _no_ terminating zero included in this count.
1007 NOTE: Unlike utf_get_number_of_u2s, this function never throws an exception.
1010 *******************************************************************************/
/* Counts the u2 units of the UTF-8 data in buffer by stepping through it
   with utf_nextu2 until the cursor reaches endpos, which lies blength bytes
   behind the start position. */
1012 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
1014 const char *endpos; /* points behind utf string */
1015 const char *utf_ptr; /* current position in utf text */
1016 u4 len = 0; /* number of unicode characters */
1019 endpos = utf_ptr + blength;
1021 while (utf_ptr < endpos) {
1023 /* next unicode character */
/* utf_nextu2 takes a char **, hence the cast that drops the const qualifier */
1024 utf_nextu2((char **)&utf_ptr);
/* decoding must stop exactly at the end; a multi-byte character running past
   endpos is only caught by this assert */
1027 assert(utf_ptr == endpos);
1033 /* utf_get_number_of_u2s *******************************************************
1035 Determine number of UTF-16 u2s in the utf string.
1037 CAUTION: This function is unsafe for input that was not checked for UTF-8 validity.
1040 CAUTION: Use this function *only* when you want to convert a utf string
1041 to an array of u2s and want to know how many of them you will get.
1042 All other uses of this function are probably wrong.
1045 u............utf string
1048 the number of u2s needed to hold this string in UTF-16 encoding.
1049 There is _no_ terminating zero included in this count.
1050 XXX 0 if a NullPointerException has been thrown (see below)
1052 *******************************************************************************/
/* Counts the u2 units of the utf string u by stepping through its text with
   utf_nextu2 up to UTF_END(u). Two error paths raise an exception instead of
   returning normally: the null-pointer path and the check that decoding
   stopped exactly at the end of the string. */
1054 u4 utf_get_number_of_u2s(utf *u)
1056 char *endpos; /* points behind utf string */
1057 char *utf_ptr; /* current position in utf text */
1058 u4 len = 0; /* number of unicode characters */
1060 /* XXX this is probably not checked by most callers! Review this after */
1061 /* the invalid uses of this function have been eliminated */
1063 exceptions_throw_nullpointerexception();
1067 endpos = UTF_END(u);
1070 while (utf_ptr < endpos) {
1072 /* next unicode character */
1073 utf_nextu2(&utf_ptr);
/* a cursor beyond endpos means the last character was cut off */
1076 if (utf_ptr != endpos) {
1077 /* string ended abruptly */
1078 exceptions_throw_internalerror("Illegal utf8 string");
1086 /* utf8_safe_number_of_u2s *****************************************************
1088 Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1089 (For invalid UTF-8 the U+fffd replacement character will be counted.)
1091 This function is safe even for invalid UTF-8 strings.
1094 text..........zero-terminated(!) UTF-8 string (may be invalid)
1096 nbytes........strlen(text). (This is needed to completely emulate
1100 the number of u2s needed to hold this string in UTF-16 encoding.
1101 There is _no_ terminating zero included in this count.
1103 *******************************************************************************/
/* Counts the u2 units that decoding text yields. t walks the bytes and
   tlimit marks the position nbytes behind the start. The lead byte of each
   sequence is classified by its high bits; the early "return len + 1" paths
   count one more unit and end the scan. */
1105 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1106 register const unsigned char *t;
1109 register const unsigned char *tlimit;
1117 assert(nbytes >= 0);
/* bytes are examined as unsigned values so the bit tests below are exact */
1120 t = (const unsigned char *) text;
1121 tlimit = t + nbytes;
1123 /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1129 /* highest bit set, non-ASCII character */
1131 if ((byte & 0xe0) == 0xc0) {
1132 /* 2-byte: should be 110..... 10...... ? */
1134 if ((*t++ & 0xc0) == 0x80)
1135 ; /* valid 2-byte */
1139 else if ((byte & 0xf0) == 0xe0) {
1140 /* 3-byte: should be 1110.... 10...... 10...... */
1144 return len + 1; /* invalid, stop here */
1146 if ((*t++ & 0xc0) == 0x80) {
1147 if ((*t++ & 0xc0) == 0x80)
1148 ; /* valid 3-byte */
1155 else if ((byte & 0xf8) == 0xf0) {
1156 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1160 return len + 1; /* invalid, stop here */
1162 if (((byte1 = *t++) & 0xc0) == 0x80) {
1163 if (((byte2 = *t++) & 0xc0) == 0x80) {
1164 if (((byte3 = *t++) & 0xc0) == 0x80) {
1165 /* valid 4-byte UTF-8? */
/* 3 + 6 + 6 + 6 payload bits form the code point */
1166 value = ((byte & 0x07) << 18)
1167 | ((byte1 & 0x3f) << 12)
1168 | ((byte2 & 0x3f) << 6)
1169 | ((byte3 & 0x3f) );
1171 if (value > 0x10FFFF)
/* a value above 0xFFFF does not fit into one u2 and costs one extra unit */
1173 else if (value > 0xFFFF)
1174 len += 1; /* we need surrogates */
1176 ; /* 16bit suffice */
1187 else if ((byte & 0xfc) == 0xf8) {
1188 /* invalid 5-byte */
1190 return len + 1; /* invalid, stop here */
/* step over at most skip bytes of the form 10xxxxxx */
1193 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1196 else if ((byte & 0xfe) == 0xfc) {
1197 /* invalid 6-byte */
1199 return len + 1; /* invalid, stop here */
1202 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1214 /* ASCII character, common case */
1224 /* utf8_safe_convert_to_u2s ****************************************************
1226 Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1227 (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1228 Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1230 This function is safe even for invalid UTF-8 strings.
1233 text..........zero-terminated(!) UTF-8 string (may be invalid)
1235 nbytes........strlen(text). (This is needed to completely emulate
1237 buffer........a preallocated array of u2s to receive the decoded
1238 string. Use utf8_safe_number_of_u2s to get the
1239 required number of u2s for allocating this.
1241 *******************************************************************************/
/* value stored for every byte sequence that cannot be decoded */
1243 #define UNICODE_REPLACEMENT 0xfffd
/* Decodes text into buffer. t walks the bytes and tlimit marks the position
   nbytes behind the start. Every well-formed sequence stores its decoded
   value (two u2s for a surrogate pair), every malformed or truncated one
   stores UNICODE_REPLACEMENT. Writes to buffer are not bounds-checked, so
   the caller has to provide enough room. */
1245 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1246 register const unsigned char *t;
1248 register const unsigned char *tlimit;
1256 assert(nbytes >= 0);
1258 t = (const unsigned char *) text;
1259 tlimit = t + nbytes;
1261 /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1267 /* highest bit set, non-ASCII character */
1269 if ((byte & 0xe0) == 0xc0) {
1270 /* 2-byte: should be 110..... 10...... */
1272 if (((byte1 = *t++) & 0xc0) == 0x80) {
1273 /* valid 2-byte UTF-8 */
1274 *buffer++ = ((byte & 0x1f) << 6)
1275 | ((byte1 & 0x3f) );
1278 *buffer++ = UNICODE_REPLACEMENT;
1282 else if ((byte & 0xf0) == 0xe0) {
1283 /* 3-byte: should be 1110.... 10...... 10...... */
/* two continuation bytes are needed; fewer remaining means truncated input */
1285 if (t + 2 > tlimit) {
1286 *buffer++ = UNICODE_REPLACEMENT;
1290 if (((byte1 = *t++) & 0xc0) == 0x80) {
1291 if (((byte2 = *t++) & 0xc0) == 0x80) {
1292 /* valid 3-byte UTF-8 */
1293 *buffer++ = ((byte & 0x0f) << 12)
1294 | ((byte1 & 0x3f) << 6)
1295 | ((byte2 & 0x3f) );
1298 *buffer++ = UNICODE_REPLACEMENT;
1303 *buffer++ = UNICODE_REPLACEMENT;
1307 else if ((byte & 0xf8) == 0xf0) {
1308 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1310 if (t + 3 > tlimit) {
1311 *buffer++ = UNICODE_REPLACEMENT;
1315 if (((byte1 = *t++) & 0xc0) == 0x80) {
1316 if (((byte2 = *t++) & 0xc0) == 0x80) {
1317 if (((byte3 = *t++) & 0xc0) == 0x80) {
1318 /* valid 4-byte UTF-8? */
1319 value = ((byte & 0x07) << 18)
1320 | ((byte1 & 0x3f) << 12)
1321 | ((byte2 & 0x3f) << 6)
1322 | ((byte3 & 0x3f) );
1324 if (value > 0x10FFFF) {
1325 *buffer++ = UNICODE_REPLACEMENT;
1327 else if (value > 0xFFFF) {
1328 /* we need surrogates */
/* 0x10000 >> 10 is 0x40, so subtracting 0x40 after the shift equals
   subtracting 0x10000 before it; the result is the upper ten bits */
1329 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
/* the lower ten bits go into the second surrogate */
1330 *buffer++ = 0xdc00 | (value & 0x03ff);
1333 *buffer++ = value; /* 16bit suffice */
1336 *buffer++ = UNICODE_REPLACEMENT;
1341 *buffer++ = UNICODE_REPLACEMENT;
1346 *buffer++ = UNICODE_REPLACEMENT;
/* lead byte 111110xx: always answered with a replacement character */
1350 else if ((byte & 0xfc) == 0xf8) {
1351 if (t + 4 > tlimit) {
1352 *buffer++ = UNICODE_REPLACEMENT;
/* step over at most skip bytes of the form 10xxxxxx */
1357 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1359 *buffer++ = UNICODE_REPLACEMENT;
/* lead byte 1111110x: always answered with a replacement character */
1361 else if ((byte & 0xfe) == 0xfc) {
1362 if (t + 5 > tlimit) {
1363 *buffer++ = UNICODE_REPLACEMENT;
1368 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1370 *buffer++ = UNICODE_REPLACEMENT;
1373 *buffer++ = UNICODE_REPLACEMENT;
1381 /* ASCII character, common case */
1389 /* u2_utflength ****************************************************************
1391 Returns the utf length in bytes of a u2 array.
1393 *******************************************************************************/
/* Sum up, over the first u2_length code units of `text`, the number of bytes
   each unit needs in the utf encoding; the total is kept in result_len. */
1395 u4 u2_utflength(u2 *text, u4 u2_length)
1397 u4 result_len = 0; /* utf length in bytes */
1398 u2 ch; /* current unicode character */
1401 for (len = 0; len < u2_length; len++) {
1402 /* next unicode character */
1405 /* determine bytes required to store unicode character as utf */
/* a zero code unit fails the first test on purpose and is therefore sized
   by the `ch < 0x800` case, not by the single-byte case */
1406 if (ch && (ch < 0x80))
1408 else if (ch < 0x800)
1418 /* utf_copy ********************************************************************
1420 Copy the given utf string byte-for-byte to a buffer.
1423 buffer.......the buffer
1424 u............the utf string
1426 *******************************************************************************/
1428 void utf_copy(char *buffer, utf *u)
1430 /* our utf strings are zero-terminated (done by utf_new) */
1431 MCOPY(buffer, u->text, char, u->blength + 1);
1435 /* utf_cat *********************************************************************
1437 Append the given utf string byte-for-byte to a buffer.
1440 buffer.......the buffer
1441 u............the utf string
1443 *******************************************************************************/
1445 void utf_cat(char *buffer, utf *u)
1447 /* our utf strings are zero-terminated (done by utf_new) */
1448 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1452 /* utf_copy_classname **********************************************************
1454 Copy the given utf classname byte-for-byte to a buffer.
1455 '/' is replaced by '.'
1458 buffer.......the buffer
1459 u............the utf string
1461 *******************************************************************************/
/* Copy the text of `u` into `buffer`; per the banner above, '/' is replaced
   by '.' on the way. */
1463 void utf_copy_classname(char *buffer, utf *u)
/* the end marker is placed one past the terminating zero, so the copy loop
   below also processes the terminator */
1472 endptr = UTF_END(u) + 1; /* utfs are zero-terminated by utf_new */
1474 while (srcptr != endptr) {
1483 /* utf_cat_classname ***********************************************************
1485 Append the given utf classname byte-for-byte to a buffer.
1486 '/' is replaced by '.'
1489 buffer.......the buffer
1490 u............the utf string
1492 *******************************************************************************/
1494 void utf_cat_classname(char *buffer, utf *u)
1496 utf_copy_classname(buffer + strlen(buffer), u);
1499 /* utf_display_printable_ascii *************************************************
1501 Write utf symbol to stdout (for debugging purposes).
1502 Non-printable and non-ASCII characters are printed as '?'.
1504 *******************************************************************************/
/* Walk the text of `u` one decoded character at a time and classify each
   character by the range test below. */
1506 void utf_display_printable_ascii(utf *u)
1508 char *endpos; /* points behind utf string */
1509 char *utf_ptr; /* current position in utf text */
1517 endpos = UTF_END(u);
1520 while (utf_ptr < endpos) {
1521 /* read next unicode character */
/* utf_nextu2 receives the address of utf_ptr; presumably it advances the
   pointer past the character it decoded (TODO confirm) */
1523 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the upper bound 127 also admits DEL (0x7f), which is a
   control character */
1525 if ((c >= 32) && (c <= 127))
1535 /* utf_display_printable_ascii_classname ***************************************
1537 Write utf symbol to stdout with `/' converted to `.' (for debugging purposes).
1539 Non-printable and non-ASCII characters are printed as '?'.
1541 *******************************************************************************/
/* Classname flavour of utf_display_printable_ascii: walks the text of `u`
   one decoded character at a time and classifies each character by the
   range test below. */
1543 void utf_display_printable_ascii_classname(utf *u)
1545 char *endpos; /* points behind utf string */
1546 char *utf_ptr; /* current position in utf text */
1554 endpos = UTF_END(u);
1557 while (utf_ptr < endpos) {
1558 /* read next unicode character */
/* utf_nextu2 receives the address of utf_ptr; presumably it advances the
   pointer past the character it decoded (TODO confirm) */
1560 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the upper bound 127 also admits DEL (0x7f), which is a
   control character */
1565 if ((c >= 32) && (c <= 127))
1575 /* utf_sprint_convert_to_latin1 ************************************************
1577 Write utf symbol into c-string (for debugging purposes).
1578 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1581 *******************************************************************************/
/* Decode the text of `u` character by character and store each character in
   one char of `buffer`.  No buffer size is passed in, so the caller must
   provide enough room. */
1583 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1585 char *endpos; /* points behind utf string */
1586 char *utf_ptr; /* current position in utf text */
/* NOTE(review): a u2 index wraps around after 65535 stored characters */
1587 u2 pos = 0; /* position in c-string */
/* fallback text; presumably written when `u` is NULL (guard not visible
   here, TODO confirm) */
1590 strcpy(buffer, "NULL");
1594 endpos = UTF_END(u);
1597 while (utf_ptr < endpos)
1598 /* copy next unicode character */
/* the u2 returned by utf_nextu2 is narrowed to char by this assignment,
   which is why characters outside Latin-1 do not survive */
1599 buffer[pos++] = utf_nextu2(&utf_ptr);
1601 /* terminate string */
1606 /* utf_sprint_convert_to_latin1_classname **************************************
1608 Write utf symbol into c-string with `/' converted to `.' (for debugging purposes).
1610 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1613 *******************************************************************************/
/* Classname flavour of utf_sprint_convert_to_latin1: decodes the text of `u`
   character by character, mapping '/' to '.'.  No buffer size is passed in,
   so the caller must provide enough room. */
1615 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1617 char *endpos; /* points behind utf string */
1618 char *utf_ptr; /* current position in utf text */
/* NOTE(review): a u2 index wraps around after 65535 stored characters */
1619 u2 pos = 0; /* position in c-string */
/* fallback text; presumably written when `u` is NULL (guard not visible
   here, TODO confirm) */
1622 strcpy(buffer, "NULL");
1626 endpos = UTF_END(u);
1629 while (utf_ptr < endpos) {
1630 /* copy next unicode character */
1631 u2 c = utf_nextu2(&utf_ptr);
/* package separator of the internal class name becomes a dot */
1632 if (c == '/') c = '.';
1636 /* terminate string */
1641 /* utf_strcat_convert_to_latin1 ************************************************
1643 Like libc strcat, but uses an utf8 string.
1644 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1647 *******************************************************************************/
1649 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1651 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1655 /* utf_strcat_convert_to_latin1_classname **************************************
1657 Like libc strcat, but uses an utf8 string.
1658 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1661 *******************************************************************************/
1663 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1665 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1669 /* utf_fprint_printable_ascii **************************************************
1671 Write utf symbol into file.
1672 Non-printable and non-ASCII characters are printed as '?'.
1674 *******************************************************************************/
/* Write the text of `u` to `file`, one decoded character at a time; any
   character outside the accepted range is written as '?'. */
1676 void utf_fprint_printable_ascii(FILE *file, utf *u)
1678 char *endpos; /* points behind utf string */
1679 char *utf_ptr; /* current position in utf text */
1684 endpos = UTF_END(u);
1687 while (utf_ptr < endpos) {
1688 /* read next unicode character */
1689 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the upper bound 127 also admits DEL (0x7f), which is a
   control character */
1691 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1692 else fprintf(file, "?");
1697 /* utf_fprint_printable_ascii_classname ****************************************
1699 Write utf symbol into file with `/' converted to `.'.
1700 Non-printable and non-ASCII characters are printed as '?'.
1702 *******************************************************************************/
/* Write the text of `u` to `file`, one decoded character at a time, with
   '/' mapped to '.'; any character outside the accepted range is written
   as '?'. */
1704 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1706 char *endpos; /* points behind utf string */
1707 char *utf_ptr; /* current position in utf text */
1712 endpos = UTF_END(u);
1715 while (utf_ptr < endpos) {
1716 /* read next unicode character */
1717 u2 c = utf_nextu2(&utf_ptr);
/* package separator of the internal class name becomes a dot */
1718 if (c == '/') c = '.';
/* NOTE(review): the upper bound 127 also admits DEL (0x7f), which is a
   control character */
1720 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1721 else fprintf(file, "?");
1726 /* is_valid_utf ****************************************************************
1728 Return true if the given string is a valid UTF-8 string.
1730 utf_ptr...points to first character
1731 end_pos...points after last character
1733 *******************************************************************************/
1735 /* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */
/* Check the bytes in [utf_ptr, end_pos) against the UTF-8 subset accepted
   here: no zero byte, lead bytes with at most two continuation bytes, and
   every continuation byte of the form 10xxxxxx. */
1737 bool is_valid_utf(char *utf_ptr, char *end_pos)
/* an end pointer before the start pointer is rejected outright */
1744 if (end_pos < utf_ptr) return false;
1745 bytes = end_pos - utf_ptr;
1749 if (!c) return false; /* 0x00 is not allowed */
1750 if ((c & 0x80) == 0) continue; /* ASCII */
/* len = number of continuation bytes announced by the lead byte */
1752 if ((c & 0xe0) == 0xc0) len = 1; /* 110x xxxx */
1753 else if ((c & 0xf0) == 0xe0) len = 2; /* 1110 xxxx */
1754 else if ((c & 0xf8) == 0xf0) len = 3; /* 1111 0xxx */
1755 else if ((c & 0xfc) == 0xf8) len = 4; /* 1111 10xx */
1756 else if ((c & 0xfe) == 0xfc) len = 5; /* 1111 110x */
1757 else return false; /* invalid leading byte */
/* 4-, 5- and 6-byte forms are recognised above only to be rejected here */
1759 if (len > 2) return false; /* Java limitation */
/* keep the payload bits of the lead byte: mask 0x1f for len 1, 0x0f for
   len 2 */
1761 v = (unsigned long)c & (0x3f >> len);
/* the remaining byte budget must cover all announced continuation bytes */
1763 if ((bytes -= len) < 0) return false; /* missing bytes */
1765 for (i = len; i--; ) {
1767 if ((c & 0xc0) != 0x80) /* 10xx xxxx */
/* append the 6 payload bits of the continuation byte */
1769 v = (v << 6) | (c & 0x3f);
1773 if (len != 1) return false; /* Java special */
1776 /* Sun Java seems to allow overlong UTF-8 encodings */
1778 /* if (v < min_codepoint[len]) */
1779 /* XXX throw exception? */
1782 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
1783 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
1785 /* even these seem to be allowed */
1786 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
1793 /* is_valid_name ***************************************************************
1795 Return true if the given string may be used as a class/field/method
1796 name. (Currently this only disallows empty strings and control characters.)
1799 NOTE: The string is assumed to have passed is_valid_utf!
1801 utf_ptr...points to first character
1802 end_pos...points after last character
1804 *******************************************************************************/
/* Scan the bytes in [utf_ptr, end_pos) and reject names that are empty,
   contain a byte below 0x20, or contain the byte pair 0xc0 0x80. */
1806 bool is_valid_name(char *utf_ptr, char *end_pos)
1808 if (end_pos <= utf_ptr) return false; /* disallow empty names */
1810 while (utf_ptr < end_pos) {
/* read as unsigned so that bytes >= 0x80 do not compare as negative */
1811 unsigned char c = *utf_ptr++;
1813 if (c < 0x20) return false; /* disallow control characters */
/* 0xc0 0x80 is the two-byte encoding of U+0000.  utf_ptr already points at
   the following byte here; if 0xc0 were the last byte this would read the
   byte at end_pos, which the is_valid_utf precondition rules out. */
1814 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80) /* disallow zero */
1821 bool is_valid_name_utf(utf *u)
1823 return is_valid_name(u->text, UTF_END(u));
1827 /* utf_show ********************************************************************
1829 Writes the utf symbols in the utfhash to stdout and displays the
1830 number of external hash chains grouped according to the chainlength
1831 (for debugging purposes).
1833 *******************************************************************************/
/* Debug-only code (compiled when NDEBUG is not defined): prints the slots of
   hashtable_utf and a histogram of hash chain lengths. */
1835 #if !defined(NDEBUG)
1839 #define CHAIN_LIMIT 20 /* limit for separated enumeration */
/* chain_count[k] = number of slots whose chain has k entries; the last
   element collects every chain with CHAIN_LIMIT-1 or more entries */
1841 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1842 u4 max_chainlength = 0; /* maximum length of the chains */
1843 u4 sum_chainlength = 0; /* sum of the chainlengths */
1844 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT */
1847 printf("UTF-HASH:\n");
1849 /* show element of utf-hashtable */
1851 for (i = 0; i < hashtable_utf->size; i++) {
1852 utf *u = hashtable_utf->ptr[i];
1855 printf("SLOT %d: ", (int) i);
1859 utf_display_printable_ascii(u);
1867 printf("UTF-HASH: %d slots for %d entries\n",
1868 (int) hashtable_utf->size, (int) hashtable_utf->entries );
/* the percentages below divide by the entry count, hence this check */
1870 if (hashtable_utf->entries == 0)
1873 printf("chains:\n chainlength number of chains %% of utfstrings\n");
1875 for (i=0;i<CHAIN_LIMIT;i++)
1878 /* count numbers of hashchains according to their length */
1879 for (i=0; i<hashtable_utf->size; i++) {
1881 utf *u = (utf*) hashtable_utf->ptr[i];
1882 u4 chain_length = 0;
1884 /* determine chainlength */
1890 /* update sum of all chainlengths */
1891 sum_chainlength+=chain_length;
1893 /* determine the maximum length of the chains */
1894 if (chain_length>max_chainlength)
1895 max_chainlength = chain_length;
1897 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT */
/* NOTE(review): a chain of exactly CHAIN_LIMIT-1 entries is counted in the
   last bucket below but never added to beyond_limit, so the percentage
   printed for the ">=" row leaves those entries out */
1898 if (chain_length>=CHAIN_LIMIT) {
1899 beyond_limit+=chain_length;
1900 chain_length=CHAIN_LIMIT-1;
1903 /* update number of hashchains of current length */
1904 chain_count[chain_length]++;
1907 /* display results */
/* per row: chain length, number of such chains, and the share of all
   entries that live in chains of that length (count * length / entries) */
1908 for (i=1;i<CHAIN_LIMIT-1;i++)
1909 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1911 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1914 printf("max. chainlength:%5d\n",max_chainlength);
1916 /* avg. chainlength = sum of chainlengths / number of chains */
/* number of chains = all slots minus the slots counted with length 0 */
1917 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1919 #endif /* !defined(NDEBUG) */
1923 * These are local overrides for various environment variables in Emacs.
1924 * Please do not remove this and leave it at the end of the file, where
1925 * Emacs will automagically detect them.
1926 * ---------------------------------------------------------------------
1929 * indent-tabs-mode: t
1933 * vim:noexpandtab:sw=4:ts=4: