1 /* src/vm/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006, 2007, 2008
4 CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
6 This file is part of CACAO.
8 This program is free software; you can redistribute it and/or
9 modify it under the terms of the GNU General Public License as
10 published by the Free Software Foundation; either version 2, or (at
11 your option) any later version.
13 This program is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include "mm/memory.hpp"
35 #include "threads/mutex.hpp"
37 #include "toolbox/hashtable.h"
39 #include "vm/exceptions.hpp"
40 #include "vm/options.h"
42 #if defined(ENABLE_STATISTICS)
43 # include "vm/statistics.h"
49 /* global variables ***********************************************************/
51 /* hashsize must be a power of 2 */
53 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
55 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
58 /* utf-symbols for pointer comparison of frequently used strings **************/
60 utf *utf_java_lang_Object;
62 utf *utf_java_lang_Class;
63 utf *utf_java_lang_ClassLoader;
64 utf *utf_java_lang_Cloneable;
65 utf *utf_java_lang_SecurityManager;
66 utf *utf_java_lang_String;
67 utf *utf_java_lang_ThreadGroup;
68 utf *utf_java_lang_ref_SoftReference;
69 utf *utf_java_lang_ref_WeakReference;
70 utf *utf_java_lang_ref_PhantomReference;
71 utf *utf_java_io_Serializable;
73 utf *utf_java_lang_Throwable;
74 utf *utf_java_lang_Error;
76 utf *utf_java_lang_AbstractMethodError;
77 utf *utf_java_lang_ClassCircularityError;
78 utf *utf_java_lang_ClassFormatError;
79 utf *utf_java_lang_ExceptionInInitializerError;
80 utf *utf_java_lang_IncompatibleClassChangeError;
81 utf *utf_java_lang_InstantiationError;
82 utf *utf_java_lang_InternalError;
83 utf *utf_java_lang_LinkageError;
84 utf *utf_java_lang_NoClassDefFoundError;
85 utf *utf_java_lang_NoSuchFieldError;
86 utf *utf_java_lang_NoSuchMethodError;
87 utf *utf_java_lang_OutOfMemoryError;
88 utf *utf_java_lang_UnsatisfiedLinkError;
89 utf *utf_java_lang_UnsupportedClassVersionError;
90 utf *utf_java_lang_VerifyError;
91 utf *utf_java_lang_VirtualMachineError;
93 utf *utf_java_lang_Exception;
95 utf *utf_java_lang_ArithmeticException;
96 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
97 utf *utf_java_lang_ArrayStoreException;
98 utf *utf_java_lang_ClassCastException;
99 utf *utf_java_lang_ClassNotFoundException;
100 utf *utf_java_lang_CloneNotSupportedException;
101 utf *utf_java_lang_IllegalAccessException;
102 utf *utf_java_lang_IllegalArgumentException;
103 utf *utf_java_lang_IllegalMonitorStateException;
104 utf *utf_java_lang_InstantiationException;
105 utf *utf_java_lang_InterruptedException;
106 utf *utf_java_lang_NegativeArraySizeException;
107 utf *utf_java_lang_NullPointerException;
108 utf *utf_java_lang_RuntimeException;
109 utf *utf_java_lang_StringIndexOutOfBoundsException;
111 utf *utf_java_lang_reflect_InvocationTargetException;
113 utf *utf_java_security_PrivilegedActionException;
115 #if defined(ENABLE_JAVASE)
116 utf* utf_java_lang_Void;
119 utf* utf_java_lang_Boolean;
120 utf* utf_java_lang_Byte;
121 utf* utf_java_lang_Character;
122 utf* utf_java_lang_Short;
123 utf* utf_java_lang_Integer;
124 utf* utf_java_lang_Long;
125 utf* utf_java_lang_Float;
126 utf* utf_java_lang_Double;
128 #if defined(ENABLE_JAVASE)
129 utf *utf_java_lang_StackTraceElement;
130 utf *utf_java_lang_reflect_Constructor;
131 utf *utf_java_lang_reflect_Field;
132 utf *utf_java_lang_reflect_Method;
134 # if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
135 utf *utf_java_lang_reflect_VMConstructor;
136 utf *utf_java_lang_reflect_VMField;
137 utf *utf_java_lang_reflect_VMMethod;
140 utf *utf_java_util_Vector;
143 utf *utf_InnerClasses; /* InnerClasses */
144 utf *utf_ConstantValue; /* ConstantValue */
145 utf *utf_Code; /* Code */
146 utf *utf_Exceptions; /* Exceptions */
147 utf *utf_LineNumberTable; /* LineNumberTable */
148 utf *utf_SourceFile; /* SourceFile */
150 #if defined(ENABLE_JAVASE)
151 utf *utf_EnclosingMethod;
153 utf *utf_StackMapTable;
155 # if defined(ENABLE_JVMTI)
156 utf *utf_LocalVariableTable;
159 # if defined(ENABLE_ANNOTATIONS)
160 utf *utf_RuntimeVisibleAnnotations; /* RuntimeVisibleAnnotations */
161 utf *utf_RuntimeInvisibleAnnotations; /* RuntimeInvisibleAnnotations */
162 utf *utf_RuntimeVisibleParameterAnnotations; /* RuntimeVisibleParameterAnnotations */
163 utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
164 utf *utf_AnnotationDefault; /* AnnotationDefault */
168 utf *utf_init; /* <init> */
169 utf *utf_clinit; /* <clinit> */
170 utf *utf_clone; /* clone */
171 utf *utf_finalize; /* finalize */
174 utf *utf_run; /* run */
180 utf *utf_removeThread;
183 utf *utf_uncaughtException;
186 utf *utf_fillInStackTrace;
188 utf *utf_getSystemClassLoader;
191 utf *utf_loadClassInternal;
192 utf *utf_printStackTrace;
194 utf *utf_division_by_zero;
205 utf *utf_void__void; /* ()V */
206 utf *utf_boolean__void; /* (Z)V */
207 utf *utf_byte__void; /* (B)V */
208 utf *utf_char__void; /* (C)V */
209 utf *utf_short__void; /* (S)V */
210 utf *utf_int__void; /* (I)V */
211 utf *utf_long__void; /* (J)V */
212 utf *utf_float__void; /* (F)V */
213 utf *utf_double__void; /* (D)V */
215 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
216 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
217 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
218 utf *utf_java_lang_ClassLoader_java_lang_String__J;
219 utf *utf_java_lang_Exception__V; /* (Ljava/lang/Exception;)V */
220 utf *utf_java_lang_Object__java_lang_Object;
221 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
222 utf *utf_java_lang_String__java_lang_Class;
223 utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */
224 utf *utf_java_lang_Thread_java_lang_Throwable__V;
225 utf *utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V;
226 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
227 utf *utf_java_lang_Throwable__java_lang_Throwable;
229 utf *utf_not_named_yet; /* special name for unnamed classes */
231 utf *array_packagename;
234 /* utf_init ********************************************************************
236 Initializes the utf8 subsystem.
238 *******************************************************************************/
242 TRACESUBSYSTEMINITIALIZATION("utf8_init");
244 /* create utf8 hashtable */
246 hashtable_utf = NEW(hashtable);
248 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
250 #if defined(ENABLE_STATISTICS)
252 count_utf_len += sizeof(utf*) * hashtable_utf->size;
255 /* create utf-symbols for pointer comparison of frequently used strings */
257 utf_java_lang_Object = utf_new_char("java/lang/Object");
259 utf_java_lang_Class = utf_new_char("java/lang/Class");
260 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
261 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
262 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
263 utf_java_lang_String = utf_new_char("java/lang/String");
264 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
266 utf_java_lang_ref_SoftReference =
267 utf_new_char("java/lang/ref/SoftReference");
269 utf_java_lang_ref_WeakReference =
270 utf_new_char("java/lang/ref/WeakReference");
272 utf_java_lang_ref_PhantomReference =
273 utf_new_char("java/lang/ref/PhantomReference");
275 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
277 utf_java_lang_Throwable = utf_new_char("java/lang/Throwable");
278 utf_java_lang_Error = utf_new_char("java/lang/Error");
280 utf_java_lang_ClassCircularityError =
281 utf_new_char("java/lang/ClassCircularityError");
283 utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
285 utf_java_lang_ExceptionInInitializerError =
286 utf_new_char("java/lang/ExceptionInInitializerError");
288 utf_java_lang_IncompatibleClassChangeError =
289 utf_new_char("java/lang/IncompatibleClassChangeError");
291 utf_java_lang_InstantiationError =
292 utf_new_char("java/lang/InstantiationError");
294 utf_java_lang_InternalError = utf_new_char("java/lang/InternalError");
295 utf_java_lang_LinkageError = utf_new_char("java/lang/LinkageError");
297 utf_java_lang_NoClassDefFoundError =
298 utf_new_char("java/lang/NoClassDefFoundError");
300 utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
302 utf_java_lang_UnsatisfiedLinkError =
303 utf_new_char("java/lang/UnsatisfiedLinkError");
305 utf_java_lang_UnsupportedClassVersionError =
306 utf_new_char("java/lang/UnsupportedClassVersionError");
308 utf_java_lang_VerifyError = utf_new_char("java/lang/VerifyError");
310 utf_java_lang_VirtualMachineError =
311 utf_new_char("java/lang/VirtualMachineError");
313 #if defined(ENABLE_JAVASE)
314 utf_java_lang_AbstractMethodError =
315 utf_new_char("java/lang/AbstractMethodError");
317 utf_java_lang_NoSuchFieldError =
318 utf_new_char("java/lang/NoSuchFieldError");
320 utf_java_lang_NoSuchMethodError =
321 utf_new_char("java/lang/NoSuchMethodError");
324 utf_java_lang_Exception = utf_new_char("java/lang/Exception");
326 utf_java_lang_ArithmeticException =
327 utf_new_char("java/lang/ArithmeticException");
329 utf_java_lang_ArrayIndexOutOfBoundsException =
330 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
332 utf_java_lang_ArrayStoreException =
333 utf_new_char("java/lang/ArrayStoreException");
335 utf_java_lang_ClassCastException =
336 utf_new_char("java/lang/ClassCastException");
338 utf_java_lang_ClassNotFoundException =
339 utf_new_char("java/lang/ClassNotFoundException");
341 utf_java_lang_CloneNotSupportedException =
342 utf_new_char("java/lang/CloneNotSupportedException");
344 utf_java_lang_IllegalAccessException =
345 utf_new_char("java/lang/IllegalAccessException");
347 utf_java_lang_IllegalArgumentException =
348 utf_new_char("java/lang/IllegalArgumentException");
350 utf_java_lang_IllegalMonitorStateException =
351 utf_new_char("java/lang/IllegalMonitorStateException");
353 utf_java_lang_InstantiationException =
354 utf_new_char("java/lang/InstantiationException");
356 utf_java_lang_InterruptedException =
357 utf_new_char("java/lang/InterruptedException");
359 utf_java_lang_NegativeArraySizeException =
360 utf_new_char("java/lang/NegativeArraySizeException");
362 utf_java_lang_NullPointerException =
363 utf_new_char("java/lang/NullPointerException");
365 utf_java_lang_RuntimeException =
366 utf_new_char("java/lang/RuntimeException");
368 utf_java_lang_StringIndexOutOfBoundsException =
369 utf_new_char("java/lang/StringIndexOutOfBoundsException");
371 utf_java_lang_reflect_InvocationTargetException =
372 utf_new_char("java/lang/reflect/InvocationTargetException");
374 utf_java_security_PrivilegedActionException =
375 utf_new_char("java/security/PrivilegedActionException");
377 #if defined(ENABLE_JAVASE)
378 utf_java_lang_Void = utf_new_char("java/lang/Void");
381 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
382 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
383 utf_java_lang_Character = utf_new_char("java/lang/Character");
384 utf_java_lang_Short = utf_new_char("java/lang/Short");
385 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
386 utf_java_lang_Long = utf_new_char("java/lang/Long");
387 utf_java_lang_Float = utf_new_char("java/lang/Float");
388 utf_java_lang_Double = utf_new_char("java/lang/Double");
390 #if defined(ENABLE_JAVASE)
391 utf_java_lang_StackTraceElement =
392 utf_new_char("java/lang/StackTraceElement");
394 utf_java_lang_reflect_Constructor =
395 utf_new_char("java/lang/reflect/Constructor");
397 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
398 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
400 # if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
401 utf_java_lang_reflect_VMConstructor = utf_new_char("java/lang/reflect/VMConstructor");
402 utf_java_lang_reflect_VMField = utf_new_char("java/lang/reflect/VMField");
403 utf_java_lang_reflect_VMMethod = utf_new_char("java/lang/reflect/VMMethod");
406 utf_java_util_Vector = utf_new_char("java/util/Vector");
409 utf_InnerClasses = utf_new_char("InnerClasses");
410 utf_ConstantValue = utf_new_char("ConstantValue");
411 utf_Code = utf_new_char("Code");
412 utf_Exceptions = utf_new_char("Exceptions");
413 utf_LineNumberTable = utf_new_char("LineNumberTable");
414 utf_SourceFile = utf_new_char("SourceFile");
416 #if defined(ENABLE_JAVASE)
417 utf_EnclosingMethod = utf_new_char("EnclosingMethod");
418 utf_Signature = utf_new_char("Signature");
419 utf_StackMapTable = utf_new_char("StackMapTable");
421 # if defined(ENABLE_JVMTI)
422 utf_LocalVariableTable = utf_new_char("LocalVariableTable");
425 # if defined(ENABLE_ANNOTATIONS)
426 utf_RuntimeVisibleAnnotations = utf_new_char("RuntimeVisibleAnnotations");
427 utf_RuntimeInvisibleAnnotations = utf_new_char("RuntimeInvisibleAnnotations");
428 utf_RuntimeVisibleParameterAnnotations = utf_new_char("RuntimeVisibleParameterAnnotations");
429 utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
430 utf_AnnotationDefault = utf_new_char("AnnotationDefault");
434 utf_init = utf_new_char("<init>");
435 utf_clinit = utf_new_char("<clinit>");
436 utf_clone = utf_new_char("clone");
437 utf_finalize = utf_new_char("finalize");
438 utf_invoke = utf_new_char("invoke");
439 utf_main = utf_new_char("main");
440 utf_run = utf_new_char("run");
442 utf_add = utf_new_char("add");
443 utf_dispatch = utf_new_char("dispatch");
444 utf_remove = utf_new_char("remove");
445 utf_addThread = utf_new_char("addThread");
446 utf_removeThread = utf_new_char("removeThread");
447 utf_put = utf_new_char("put");
448 utf_get = utf_new_char("get");
449 utf_uncaughtException = utf_new_char("uncaughtException");
450 utf_value = utf_new_char("value");
452 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
453 utf_findNative = utf_new_char("findNative");
454 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
455 utf_initCause = utf_new_char("initCause");
456 utf_loadClass = utf_new_char("loadClass");
457 utf_loadClassInternal = utf_new_char("loadClassInternal");
458 utf_printStackTrace = utf_new_char("printStackTrace");
460 utf_division_by_zero = utf_new_char("/ by zero");
462 utf_Z = utf_new_char("Z");
463 utf_B = utf_new_char("B");
464 utf_C = utf_new_char("C");
465 utf_S = utf_new_char("S");
466 utf_I = utf_new_char("I");
467 utf_J = utf_new_char("J");
468 utf_F = utf_new_char("F");
469 utf_D = utf_new_char("D");
471 utf_void__void = utf_new_char("()V");
472 utf_boolean__void = utf_new_char("(Z)V");
473 utf_byte__void = utf_new_char("(B)V");
474 utf_char__void = utf_new_char("(C)V");
475 utf_short__void = utf_new_char("(S)V");
476 utf_int__void = utf_new_char("(I)V");
477 utf_long__void = utf_new_char("(J)V");
478 utf_float__void = utf_new_char("(F)V");
479 utf_double__void = utf_new_char("(D)V");
480 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
481 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
483 utf_void__java_lang_ClassLoader =
484 utf_new_char("()Ljava/lang/ClassLoader;");
486 utf_java_lang_ClassLoader_java_lang_String__J =
487 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
489 utf_java_lang_Exception__V = utf_new_char("(Ljava/lang/Exception;)V");
491 utf_java_lang_Object__java_lang_Object =
492 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
494 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
496 utf_java_lang_String__java_lang_Class =
497 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
499 utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V");
501 utf_java_lang_Thread_java_lang_Throwable__V =
502 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
504 utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V =
505 utf_new_char("(Ljava/lang/ThreadGroup;Ljava/lang/String;)V");
507 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
509 utf_java_lang_Throwable__java_lang_Throwable =
510 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
512 utf_null = utf_new_char("null");
513 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
514 array_packagename = utf_new_char("\t<the array package>");
518 /* utf_hashkey *****************************************************************
520 The hashkey is computed from the utf-text by using up to 8
521 characters. For utf-symbols longer than 15 characters, 3 characters
522 are taken from the beginning, 2 from the middle and 3 from the end.
525 *******************************************************************************/
527 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
528 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
530 u4 utf_hashkey(const char *text, u4 length)
532 const char *start_pos = text; /* pointer to utf text */
536 case 0: /* empty string */
539 case 1: return fbs(0);
540 case 2: return fbs(0) ^ nbs(3);
541 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
542 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
543 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
544 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
545 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
546 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
553 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
562 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
571 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
583 return a ^ nbs(9) ^ nbs(10);
595 return a ^ nbs(9) ^ nbs(10);
606 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
617 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
619 default: /* 3 characters from beginning */
625 /* 2 characters from middle */
626 text = start_pos + (length / 2);
631 /* 3 characters from end */
632 text = start_pos + length - 4;
637 return a ^ nbs(10) ^ nbs(11);
641 /* utf_full_hashkey ************************************************************
643 This function computes a hash value using all bytes in the string.
645 The algorithm is the "One-at-a-time" algorithm as published
646 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
648 *******************************************************************************/
650 u4 utf_full_hashkey(const char *text, u4 length)
652 register const unsigned char *p = (const unsigned char *) text;
660 hash += (hash << 10);
664 hash ^= (hash >> 11);
665 hash += (hash << 15);
670 /* unicode_hashkey *************************************************************
672 Compute the hashkey of a unicode string.
674 *******************************************************************************/
676 u4 unicode_hashkey(u2 *text, u2 len)
678 return utf_hashkey((char *) text, len);
682 /* utf_new *********************************************************************
684 Creates a new utf-symbol, the text of the symbol is passed as a
685 u1-array. The function searches the utf-hashtable for a utf-symbol
686 with this text. On success the element is returned, otherwise a new
687 hashtable element is created.
689 If the number of entries in the hashtable exceeds twice the number
690 of hashtable slots, the hashtable is reorganized: the utf symbols
691 are transferred to a new hashtable with doubled size.
693 *******************************************************************************/
695 utf *utf_new(const char *text, u2 length)
697 u4 key; /* hashkey computed from utf-text */
698 u4 slot; /* slot in hashtable */
699 utf *u; /* hashtable element */
702 Mutex_lock(hashtable_utf->mutex);
704 #if defined(ENABLE_STATISTICS)
709 key = utf_hashkey(text, length);
710 slot = key & (hashtable_utf->size - 1);
711 u = hashtable_utf->ptr[slot];
713 /* search external hash chain for utf-symbol */
716 if (u->blength == length) {
717 /* compare text of hashtable elements */
719 for (i = 0; i < length; i++)
720 if (text[i] != u->text[i])
723 #if defined(ENABLE_STATISTICS)
725 count_utf_new_found++;
728 /* symbol found in hashtable */
730 Mutex_unlock(hashtable_utf->mutex);
736 u = u->hashlink; /* next element in external chain */
739 /* location in hashtable found, create new utf element */
743 u->blength = length; /* length in bytes of utfstring */
744 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
745 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
747 memcpy(u->text, text, length); /* copy utf-text */
748 u->text[length] = '\0';
750 #if defined(ENABLE_STATISTICS)
752 count_utf_len += sizeof(utf) + length + 1;
755 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
756 hashtable_utf->entries++; /* update number of entries */
758 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
760 /* reorganization of hashtable, average length of the external
761 chains is approx. 2 */
763 hashtable *newhash; /* the new hashtable */
769 /* create new hashtable, double the size */
771 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
773 #if defined(ENABLE_STATISTICS)
775 count_utf_len += sizeof(utf*) * hashtable_utf->size;
778 /* transfer elements to new hashtable */
780 for (i = 0; i < hashtable_utf->size; i++) {
781 u = hashtable_utf->ptr[i];
785 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
787 u->hashlink = (utf *) newhash->ptr[slot];
788 newhash->ptr[slot] = u;
790 /* follow link in external hash chain */
796 /* dispose old table */
798 hashtable_free(hashtable_utf);
800 hashtable_utf = newhash;
803 Mutex_unlock(hashtable_utf->mutex);
809 /* utf_new_u2 ******************************************************************
811 Make utf symbol from u2 array; if isclassname is true, '.' is replaced by '/'.
814 *******************************************************************************/
816 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
818 char *buffer; /* memory buffer for unicode characters */
819 char *pos; /* pointer to current position in buffer */
820 u4 left; /* unicode characters left */
821 u4 buflength; /* utf length in bytes of the u2 array */
822 utf *result; /* resulting utf-string */
825 /* determine utf length in bytes and allocate memory */
827 buflength = u2_utflength(unicode_pos, unicode_length);
828 buffer = MNEW(char, buflength);
833 for (i = 0; i++ < unicode_length; unicode_pos++) {
834 /* next unicode character */
837 if ((c != 0) && (c < 0x80)) {
840 if ((int) left < 0) break;
841 /* convert classname */
842 if (isclassname && c == '.')
847 } else if (c < 0x800) {
849 unsigned char high = c >> 6;
850 unsigned char low = c & 0x3F;
852 if ((int) left < 0) break;
853 *pos++ = high | 0xC0;
859 char mid = (c >> 6) & 0x3F;
862 if ((int) left < 0) break;
863 *pos++ = high | 0xE0;
869 /* insert utf-string into symbol-table */
870 result = utf_new(buffer,buflength);
872 MFREE(buffer, char, buflength);
878 /* utf_new_char ****************************************************************
880 Creates a new utf symbol, the text for this symbol is passed as a
881 c-string ( = char* ).
883 *******************************************************************************/
885 utf *utf_new_char(const char *text)
887 return utf_new(text, strlen(text));
891 /* utf_new_char_classname ******************************************************
893 Creates a new utf symbol; the text for this symbol is passed as a
894 c-string ( = char* ). "." characters are replaced by
895 "/". Since the above function is used often, this is a separate
896 function instead of an if.
898 *******************************************************************************/
900 utf *utf_new_char_classname(const char *text)
902 if (strchr(text, '.')) {
903 char *txt = strdup(text);
904 char *end = txt + strlen(txt);
908 for (c = txt; c < end; c++)
909 if (*c == '.') *c = '/';
911 tmpRes = utf_new(txt, strlen(txt));
917 return utf_new(text, strlen(text));
921 /* utf_nextu2 ******************************************************************
923 Read the next unicode character from the utf string and increment
924 the utf-string pointer accordingly.
926 CAUTION: This function is unsafe for input that was not checked
929 *******************************************************************************/
931 u2 utf_nextu2(char **utf_ptr)
933 /* uncompressed unicode character */
935 /* current position in utf text */
936 unsigned char *utf = (unsigned char *) (*utf_ptr);
937 /* bytes representing the unicode character */
938 unsigned char ch1, ch2, ch3;
939 /* number of bytes used to represent the unicode character */
942 switch ((ch1 = utf[0]) >> 4) {
943 default: /* 1 byte */
947 case 0xD: /* 2 bytes */
948 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
949 unsigned char high = ch1 & 0x1F;
950 unsigned char low = ch2 & 0x3F;
951 unicode_char = (high << 6) + low;
956 case 0xE: /* 2 or 3 bytes */
957 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
958 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
959 unsigned char low = ch3 & 0x3f;
960 unsigned char mid = ch2 & 0x3f;
961 unsigned char high = ch1 & 0x0f;
962 unicode_char = (((high << 6) + mid) << 6) + low;
970 /* update position in utf-text */
971 *utf_ptr = (char *) (utf + len);
977 /* utf_bytes *******************************************************************
979 Determine number of bytes (aka. octets) in the utf string.
982 u............utf string
985 The number of octets of this utf string.
986 There is _no_ terminating zero included in this count.
988 *******************************************************************************/
996 /* utf_get_number_of_u2s_for_buffer ********************************************
998 Determine number of UTF-16 u2s in the given UTF-8 buffer
1000 CAUTION: This function is unsafe for input that was not checked
1003 CAUTION: Use this function *only* when you want to convert a UTF-8 buffer
1004 to an array of u2s (UTF-16) and want to know how many of them you will get.
1005 All other uses of this function are probably wrong.
1008 buffer........points to first char in buffer
1009 blength.......number of _bytes_ in the buffer
1012 the number of u2s needed to hold this string in UTF-16 encoding.
1013 There is _no_ terminating zero included in this count.
1015 NOTE: Unlike utf_get_number_of_u2s, this function never throws an exception.
1018 *******************************************************************************/
1020 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
1022 const char *endpos; /* points behind utf string */
1023 const char *utf_ptr; /* current position in utf text */
1024 u4 len = 0; /* number of unicode characters */
1027 endpos = utf_ptr + blength;
1029 while (utf_ptr < endpos) {
1031 /* next unicode character */
1032 utf_nextu2((char **)&utf_ptr);
1035 assert(utf_ptr == endpos);
1041 /* utf_get_number_of_u2s *******************************************************
1043 Determine number of UTF-16 u2s in the utf string.
1045 CAUTION: This function is unsafe for input that was not checked
1048 CAUTION: Use this function *only* when you want to convert a utf string
1049 to an array of u2s and want to know how many of them you will get.
1050 All other uses of this function are probably wrong.
1053 u............utf string
1056 the number of u2s needed to hold this string in UTF-16 encoding.
1057 There is _no_ terminating zero included in this count.
1058 XXX 0 if a NullPointerException has been thrown (see below)
1060 *******************************************************************************/
1062 u4 utf_get_number_of_u2s(utf *u)
1064 char *endpos; /* points behind utf string */
1065 char *utf_ptr; /* current position in utf text */
1066 u4 len = 0; /* number of unicode characters */
1068 /* XXX this is probably not checked by most callers! Review this after */
1069 /* the invalid uses of this function have been eliminated */
1071 exceptions_throw_nullpointerexception();
1075 endpos = UTF_END(u);
1078 while (utf_ptr < endpos) {
1080 /* next unicode character */
1081 utf_nextu2(&utf_ptr);
1084 if (utf_ptr != endpos) {
1085 /* string ended abruptly */
1086 exceptions_throw_internalerror("Illegal utf8 string");
1094 /* utf8_safe_number_of_u2s *****************************************************
1096 Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1097 (For invalid UTF-8 the U+fffd replacement character will be counted.)
1099 This function is safe even for invalid UTF-8 strings.
1102 text..........zero-terminated(!) UTF-8 string (may be invalid)
1104 nbytes........strlen(text). (This is needed to completely emulate
1108 the number of u2s needed to hold this string in UTF-16 encoding.
1109 There is _no_ terminating zero included in this count.
1111 *******************************************************************************/
1113 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1114 register const unsigned char *t;
1117 register const unsigned char *tlimit;
1125 assert(nbytes >= 0);
1128 t = (const unsigned char *) text;
1129 tlimit = t + nbytes;
1131 /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1137 /* highest bit set, non-ASCII character */
1139 if ((byte & 0xe0) == 0xc0) {
1140 /* 2-byte: should be 110..... 10...... ? */
1142 if ((*t++ & 0xc0) == 0x80)
1143 ; /* valid 2-byte */
1147 else if ((byte & 0xf0) == 0xe0) {
1148 /* 3-byte: should be 1110.... 10...... 10...... */
1152 return len + 1; /* invalid, stop here */
1154 if ((*t++ & 0xc0) == 0x80) {
1155 if ((*t++ & 0xc0) == 0x80)
1156 ; /* valid 3-byte */
1163 else if ((byte & 0xf8) == 0xf0) {
1164 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1168 return len + 1; /* invalid, stop here */
1170 if (((byte1 = *t++) & 0xc0) == 0x80) {
1171 if (((byte2 = *t++) & 0xc0) == 0x80) {
1172 if (((byte3 = *t++) & 0xc0) == 0x80) {
1173 /* valid 4-byte UTF-8? */
1174 value = ((byte & 0x07) << 18)
1175 | ((byte1 & 0x3f) << 12)
1176 | ((byte2 & 0x3f) << 6)
1177 | ((byte3 & 0x3f) );
1179 if (value > 0x10FFFF)
1181 else if (value > 0xFFFF)
1182 len += 1; /* we need surrogates */
1184 ; /* 16bit suffice */
1195 else if ((byte & 0xfc) == 0xf8) {
1196 /* invalid 5-byte */
1198 return len + 1; /* invalid, stop here */
1201 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1204 else if ((byte & 0xfe) == 0xfc) {
1205 /* invalid 6-byte */
1207 return len + 1; /* invalid, stop here */
1210 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1222 /* ASCII character, common case */
1232 /* utf8_safe_convert_to_u2s ****************************************************
1234 Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1235 (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1236 Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1238 This function is safe even for invalid UTF-8 strings.
1241 text..........zero-terminated(!) UTF-8 string (may be invalid)
1243 nbytes........strlen(text). (This is needed to completely emulate
1245 buffer........a preallocated array of u2s to receive the decoded
1246 string. Use utf8_safe_number_of_u2s to get the
1247 required number of u2s for allocating this.
1249 *******************************************************************************/
1251 #define UNICODE_REPLACEMENT 0xfffd
1253 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1254 register const unsigned char *t;
1256 register const unsigned char *tlimit;
1264 assert(nbytes >= 0);
1266 t = (const unsigned char *) text;
1267 tlimit = t + nbytes;
1269 /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1275 /* highest bit set, non-ASCII character */
1277 if ((byte & 0xe0) == 0xc0) {
1278 /* 2-byte: should be 110..... 10...... */
1280 if (((byte1 = *t++) & 0xc0) == 0x80) {
1281 /* valid 2-byte UTF-8 */
1282 *buffer++ = ((byte & 0x1f) << 6)
1283 | ((byte1 & 0x3f) );
1286 *buffer++ = UNICODE_REPLACEMENT;
1290 else if ((byte & 0xf0) == 0xe0) {
1291 /* 3-byte: should be 1110.... 10...... 10...... */
1293 if (t + 2 > tlimit) {
1294 *buffer++ = UNICODE_REPLACEMENT;
1298 if (((byte1 = *t++) & 0xc0) == 0x80) {
1299 if (((byte2 = *t++) & 0xc0) == 0x80) {
1300 /* valid 3-byte UTF-8 */
1301 *buffer++ = ((byte & 0x0f) << 12)
1302 | ((byte1 & 0x3f) << 6)
1303 | ((byte2 & 0x3f) );
1306 *buffer++ = UNICODE_REPLACEMENT;
1311 *buffer++ = UNICODE_REPLACEMENT;
1315 else if ((byte & 0xf8) == 0xf0) {
1316 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1318 if (t + 3 > tlimit) {
1319 *buffer++ = UNICODE_REPLACEMENT;
1323 if (((byte1 = *t++) & 0xc0) == 0x80) {
1324 if (((byte2 = *t++) & 0xc0) == 0x80) {
1325 if (((byte3 = *t++) & 0xc0) == 0x80) {
1326 /* valid 4-byte UTF-8? */
1327 value = ((byte & 0x07) << 18)
1328 | ((byte1 & 0x3f) << 12)
1329 | ((byte2 & 0x3f) << 6)
1330 | ((byte3 & 0x3f) );
1332 if (value > 0x10FFFF) {
1333 *buffer++ = UNICODE_REPLACEMENT;
1335 else if (value > 0xFFFF) {
1336 /* we need surrogates */
1337 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1338 *buffer++ = 0xdc00 | (value & 0x03ff);
1341 *buffer++ = value; /* 16bit suffice */
1344 *buffer++ = UNICODE_REPLACEMENT;
1349 *buffer++ = UNICODE_REPLACEMENT;
1354 *buffer++ = UNICODE_REPLACEMENT;
1358 else if ((byte & 0xfc) == 0xf8) {
1359 if (t + 4 > tlimit) {
1360 *buffer++ = UNICODE_REPLACEMENT;
1365 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1367 *buffer++ = UNICODE_REPLACEMENT;
1369 else if ((byte & 0xfe) == 0xfc) {
1370 if (t + 5 > tlimit) {
1371 *buffer++ = UNICODE_REPLACEMENT;
1376 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1378 *buffer++ = UNICODE_REPLACEMENT;
1381 *buffer++ = UNICODE_REPLACEMENT;
1389 /* ASCII character, common case */
1397 /* u2_utflength ****************************************************************
1399 Returns the utf length in bytes of a u2 array.
1401 *******************************************************************************/
1403 u4 u2_utflength(u2 *text, u4 u2_length)
1405 u4 result_len = 0; /* utf length in bytes */
1406 u2 ch; /* current unicode character */
1409 for (len = 0; len < u2_length; len++) {
1410 /* next unicode character */
1413 /* determine bytes required to store unicode character as utf */
1414 if (ch && (ch < 0x80))
1416 else if (ch < 0x800)
1426 /* utf_copy ********************************************************************
1428 Copy the given utf string byte-for-byte to a buffer.
1431 buffer.......the buffer
1432 u............the utf string
1434 *******************************************************************************/
1436 void utf_copy(char *buffer, utf *u)
1438 /* our utf strings are zero-terminated (done by utf_new) */
1439 MCOPY(buffer, u->text, char, u->blength + 1);
1443 /* utf_cat *********************************************************************
1445 Append the given utf string byte-for-byte to a buffer.
1448 buffer.......the buffer
1449 u............the utf string
1451 *******************************************************************************/
1453 void utf_cat(char *buffer, utf *u)
1455 /* our utf strings are zero-terminated (done by utf_new) */
1456 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1460 /* utf_copy_classname **********************************************************
1462 Copy the given utf classname byte-for-byte to a buffer.
1463 '/' is replaced by '.'
1466 buffer.......the buffer
1467 u............the utf string
1469 *******************************************************************************/
1471 void utf_copy_classname(char *buffer, utf *u)
1480 endptr = UTF_END(u) + 1; /* utfs are zero-terminated by utf_new */
1482 while (srcptr != endptr) {
1491 /* utf_cat_classname ***********************************************************
1493 Append the given utf classname byte-for-byte to a buffer.
1494 '/' is replaced by '.'
1497 buffer.......the buffer
1498 u............the utf string
1500 *******************************************************************************/
1502 void utf_cat_classname(char *buffer, utf *u)
1504 utf_copy_classname(buffer + strlen(buffer), u);
1507 /* utf_display_printable_ascii *************************************************
1509 Write utf symbol to stdout (for debugging purposes).
1510 Non-printable and non-ASCII characters are printed as '?'.
1512 *******************************************************************************/
1514 void utf_display_printable_ascii(utf *u)
1516 char *endpos; /* points behind utf string */
1517 char *utf_ptr; /* current position in utf text */
1525 endpos = UTF_END(u);
1528 while (utf_ptr < endpos) {
1529 /* read next unicode character */
1531 u2 c = utf_nextu2(&utf_ptr);
1533 if ((c >= 32) && (c <= 127))
1543 /* utf_display_printable_ascii_classname ***************************************
1545 Write utf symbol to stdout with `/' converted to `.' (for debugging
1547 Non-printable and non-ASCII characters are printed as '?'.
1549 *******************************************************************************/
1551 void utf_display_printable_ascii_classname(utf *u)
1553 char *endpos; /* points behind utf string */
1554 char *utf_ptr; /* current position in utf text */
1562 endpos = UTF_END(u);
1565 while (utf_ptr < endpos) {
1566 /* read next unicode character */
1568 u2 c = utf_nextu2(&utf_ptr);
1573 if ((c >= 32) && (c <= 127))
1583 /* utf_sprint_convert_to_latin1 ************************************************
1585 Write utf symbol into c-string (for debugging purposes).
1586 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1589 *******************************************************************************/
1591 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1593 char *endpos; /* points behind utf string */
1594 char *utf_ptr; /* current position in utf text */
1595 u2 pos = 0; /* position in c-string */
1598 strcpy(buffer, "NULL");
1602 endpos = UTF_END(u);
1605 while (utf_ptr < endpos)
1606 /* copy next unicode character */
1607 buffer[pos++] = utf_nextu2(&utf_ptr);
1609 /* terminate string */
1614 /* utf_sprint_convert_to_latin1_classname **************************************
1616 Write utf symbol into c-string with `/' converted to `.' (for debugging
1618 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1621 *******************************************************************************/
1623 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1625 char *endpos; /* points behind utf string */
1626 char *utf_ptr; /* current position in utf text */
1627 u2 pos = 0; /* position in c-string */
1630 strcpy(buffer, "NULL");
1634 endpos = UTF_END(u);
1637 while (utf_ptr < endpos) {
1638 /* copy next unicode character */
1639 u2 c = utf_nextu2(&utf_ptr);
1640 if (c == '/') c = '.';
1644 /* terminate string */
1649 /* utf_strcat_convert_to_latin1 ************************************************
1651 Like libc strcat, but uses a UTF-8 string.
1652 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1655 *******************************************************************************/
1657 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1659 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1663 /* utf_strcat_convert_to_latin1_classname **************************************
1665 Like libc strcat, but uses a UTF-8 string.
1666 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1669 *******************************************************************************/
1671 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1673 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1677 /* utf_fprint_printable_ascii **************************************************
1679 Write utf symbol into file.
1680 Non-printable and non-ASCII characters are printed as '?'.
1682 *******************************************************************************/
1684 void utf_fprint_printable_ascii(FILE *file, utf *u)
1686 char *endpos; /* points behind utf string */
1687 char *utf_ptr; /* current position in utf text */
1692 endpos = UTF_END(u);
1695 while (utf_ptr < endpos) {
1696 /* read next unicode character */
1697 u2 c = utf_nextu2(&utf_ptr);
1699 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1700 else fprintf(file, "?");
1705 /* utf_fprint_printable_ascii_classname ****************************************
1707 Write utf symbol into file with `/' converted to `.'.
1708 Non-printable and non-ASCII characters are printed as '?'.
1710 *******************************************************************************/
1712 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1714 char *endpos; /* points behind utf string */
1715 char *utf_ptr; /* current position in utf text */
1720 endpos = UTF_END(u);
1723 while (utf_ptr < endpos) {
1724 /* read next unicode character */
1725 u2 c = utf_nextu2(&utf_ptr);
1726 if (c == '/') c = '.';
1728 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1729 else fprintf(file, "?");
1734 /* is_valid_utf ****************************************************************
1736 Return true if the given string is a valid UTF-8 string.
1738 utf_ptr...points to first character
1739 end_pos...points after last character
1741 *******************************************************************************/
1743 /* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */
1745 bool is_valid_utf(char *utf_ptr, char *end_pos)
1752 if (end_pos < utf_ptr) return false;
1753 bytes = end_pos - utf_ptr;
1757 if (!c) return false; /* 0x00 is not allowed */
1758 if ((c & 0x80) == 0) continue; /* ASCII */
1760 if ((c & 0xe0) == 0xc0) len = 1; /* 110x xxxx */
1761 else if ((c & 0xf0) == 0xe0) len = 2; /* 1110 xxxx */
1762 else if ((c & 0xf8) == 0xf0) len = 3; /* 1111 0xxx */
1763 else if ((c & 0xfc) == 0xf8) len = 4; /* 1111 10xx */
1764 else if ((c & 0xfe) == 0xfc) len = 5; /* 1111 110x */
1765 else return false; /* invalid leading byte */
1767 if (len > 2) return false; /* Java limitation */
1769 v = (unsigned long)c & (0x3f >> len);
1771 if ((bytes -= len) < 0) return false; /* missing bytes */
1773 for (i = len; i--; ) {
1775 if ((c & 0xc0) != 0x80) /* 10xx xxxx */
1777 v = (v << 6) | (c & 0x3f);
1781 if (len != 1) return false; /* Java special */
1784 /* Sun Java seems to allow overlong UTF-8 encodings */
1786 /* if (v < min_codepoint[len]) */
1787 /* XXX throw exception? */
1790 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
1791 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
1793 /* even these seem to be allowed */
1794 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
1801 /* is_valid_name ***************************************************************
1803 Return true if the given string may be used as a class/field/method
1804 name. (Currently this only disallows empty strings and control
1807 NOTE: The string is assumed to have passed is_valid_utf!
1809 utf_ptr...points to first character
1810 end_pos...points after last character
1812 *******************************************************************************/
1814 bool is_valid_name(char *utf_ptr, char *end_pos)
1816 if (end_pos <= utf_ptr) return false; /* disallow empty names */
1818 while (utf_ptr < end_pos) {
1819 unsigned char c = *utf_ptr++;
1821 if (c < 0x20) return false; /* disallow control characters */
1822 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80) /* disallow zero */
1829 bool is_valid_name_utf(utf *u)
1831 return is_valid_name(u->text, UTF_END(u));
1835 /* utf_show ********************************************************************
1837 Writes the utf symbols in the utfhash to stdout and displays the
1838 number of external hash chains grouped according to the chainlength
1839 (for debugging purposes).
1841 *******************************************************************************/
1843 #if !defined(NDEBUG)
1847 #define CHAIN_LIMIT 20 /* limit for separated enumeration */
1849 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1850 u4 max_chainlength = 0; /* maximum length of the chains */
1851 u4 sum_chainlength = 0; /* sum of the chainlengths */
1852 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1855 printf("UTF-HASH:\n");
1857 /* show element of utf-hashtable */
1859 for (i = 0; i < hashtable_utf->size; i++) {
1860 utf *u = hashtable_utf->ptr[i];
1863 printf("SLOT %d: ", (int) i);
1867 utf_display_printable_ascii(u);
1875 printf("UTF-HASH: %d slots for %d entries\n",
1876 (int) hashtable_utf->size, (int) hashtable_utf->entries );
1878 if (hashtable_utf->entries == 0)
1881 printf("chains:\n chainlength number of chains %% of utfstrings\n");
1883 for (i=0;i<CHAIN_LIMIT;i++)
1886 /* count numbers of hashchains according to their length */
1887 for (i=0; i<hashtable_utf->size; i++) {
1889 utf *u = (utf*) hashtable_utf->ptr[i];
1890 u4 chain_length = 0;
1892 /* determine chainlength */
1898 /* update sum of all chainlengths */
1899 sum_chainlength+=chain_length;
1901 /* determine the maximum length of the chains */
1902 if (chain_length>max_chainlength)
1903 max_chainlength = chain_length;
1905 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 -- NOTE(review): the test below is >=CHAIN_LIMIT, so chains of exactly CHAIN_LIMIT-1 share the last bucket without being added to beyond_limit; confirm intent */
1906 if (chain_length>=CHAIN_LIMIT) {
1907 beyond_limit+=chain_length;
1908 chain_length=CHAIN_LIMIT-1;
1911 /* update number of hashchains of current length */
1912 chain_count[chain_length]++;
1915 /* display results */
1916 for (i=1;i<CHAIN_LIMIT-1;i++)
1917 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1919 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1922 printf("max. chainlength:%5d\n",max_chainlength);
1924 /* avg. chainlength = sum of chainlengths / number of chains */
1925 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1927 #endif /* !defined(NDEBUG) */
1931 * These are local overrides for various environment variables in Emacs.
1932 * Please do not remove this and leave it at the end of the file, where
1933 * Emacs will automagically detect them.
1934 * ---------------------------------------------------------------------
1937 * indent-tabs-mode: t
1941 * vim:noexpandtab:sw=4:ts=4: