1 /* src/vmcore/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
4 C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5 E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6 J. Wenninger, Institut f. Computersprachen - TU Wien
8 This file is part of CACAO.
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License as
12 published by the Free Software Foundation; either version 2, or (at
13 your option) any later version.
15 This program is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25 $Id: utf8.c 8367 2007-08-20 20:26:16Z twisti $
37 #include "mm/memory.h"
39 #include "threads/lock-common.h"
41 #include "toolbox/hashtable.h"
43 #include "vm/exceptions.h"
45 #include "vmcore/options.h"
47 #if defined(ENABLE_STATISTICS)
48 # include "vmcore/statistics.h"
51 #include "vmcore/utf8.h"
54 /* global variables ***********************************************************/
56 /* hashsize must be power of 2: utf_new selects a slot with key & (size - 1) */
58 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
60 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
63 /* utf-symbols for pointer comparison of frequently used strings **************/
/* Class and interface names. All symbols in this section receive their
   string value in the initialization routine further down in this file. */
65 utf *utf_java_lang_Object;
67 utf *utf_java_lang_Class;
68 utf *utf_java_lang_ClassLoader;
69 utf *utf_java_lang_Cloneable;
70 utf *utf_java_lang_SecurityManager;
71 utf *utf_java_lang_String;
72 utf *utf_java_lang_System;
73 utf *utf_java_lang_ThreadGroup;
74 utf *utf_java_lang_ref_SoftReference;
75 utf *utf_java_lang_ref_WeakReference;
76 utf *utf_java_lang_ref_PhantomReference;
77 utf *utf_java_io_Serializable;
/* java/lang/Throwable and the java/lang/...Error class names */
79 utf *utf_java_lang_Throwable;
80 utf *utf_java_lang_Error;
82 utf *utf_java_lang_AbstractMethodError;
83 utf *utf_java_lang_ClassCircularityError;
84 utf *utf_java_lang_ClassFormatError;
85 utf *utf_java_lang_ExceptionInInitializerError;
86 utf *utf_java_lang_IncompatibleClassChangeError;
87 utf *utf_java_lang_InstantiationError;
88 utf *utf_java_lang_InternalError;
89 utf *utf_java_lang_LinkageError;
90 utf *utf_java_lang_NoClassDefFoundError;
91 utf *utf_java_lang_NoSuchFieldError;
92 utf *utf_java_lang_NoSuchMethodError;
93 utf *utf_java_lang_OutOfMemoryError;
94 utf *utf_java_lang_UnsatisfiedLinkError;
95 utf *utf_java_lang_UnsupportedClassVersionError;
96 utf *utf_java_lang_VerifyError;
97 utf *utf_java_lang_VirtualMachineError;
99 #if defined(WITH_CLASSPATH_GNU)
100 utf *utf_java_lang_VMThrowable;
/* java/lang/Exception and the ...Exception class names */
103 utf *utf_java_lang_Exception;
105 utf *utf_java_lang_ArithmeticException;
106 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
107 utf *utf_java_lang_ArrayStoreException;
108 utf *utf_java_lang_ClassCastException;
109 utf *utf_java_lang_ClassNotFoundException;
110 utf *utf_java_lang_CloneNotSupportedException;
111 utf *utf_java_lang_IllegalAccessException;
112 utf *utf_java_lang_IllegalArgumentException;
113 utf *utf_java_lang_IllegalMonitorStateException;
114 utf *utf_java_lang_InstantiationException;
115 utf *utf_java_lang_InterruptedException;
116 utf *utf_java_lang_NegativeArraySizeException;
117 utf *utf_java_lang_NullPointerException;
118 utf *utf_java_lang_StringIndexOutOfBoundsException;
120 utf *utf_java_lang_reflect_InvocationTargetException;
122 utf *utf_java_security_PrivilegedActionException;
/* java/lang/Void, java/lang/Boolean ... java/lang/Double */
124 #if defined(ENABLE_JAVASE)
125 utf* utf_java_lang_Void;
128 utf* utf_java_lang_Boolean;
129 utf* utf_java_lang_Byte;
130 utf* utf_java_lang_Character;
131 utf* utf_java_lang_Short;
132 utf* utf_java_lang_Integer;
133 utf* utf_java_lang_Long;
134 utf* utf_java_lang_Float;
135 utf* utf_java_lang_Double;
137 #if defined(ENABLE_JAVASE)
138 utf *utf_java_lang_StackTraceElement;
139 utf *utf_java_lang_reflect_Constructor;
140 utf *utf_java_lang_reflect_Field;
141 utf *utf_java_lang_reflect_Method;
142 utf *utf_java_util_Vector;
/* class file attribute names */
145 utf *utf_InnerClasses; /* InnerClasses */
146 utf *utf_ConstantValue; /* ConstantValue */
147 utf *utf_Code; /* Code */
148 utf *utf_Exceptions; /* Exceptions */
149 utf *utf_LineNumberTable; /* LineNumberTable */
150 utf *utf_SourceFile; /* SourceFile */
152 #if defined(ENABLE_JAVASE)
153 utf *utf_EnclosingMethod;
155 utf *utf_StackMapTable;
157 #if defined(ENABLE_ANNOTATIONS)
158 utf *utf_sun_reflect_ConstantPool;
159 #if defined(WITH_CLASSPATH_GNU)
160 utf *utf_sun_reflect_annotation_AnnotationParser;
163 utf *utf_RuntimeVisibleAnnotations;
164 utf *utf_RuntimeInvisibleAnnotations;
165 utf *utf_RuntimeVisibleParameterAnnotations;
166 utf *utf_RuntimeInvisibleParameterAnnotations;
167 utf *utf_AnnotationDefault;
/* method names and message texts */
171 utf *utf_init; /* <init> */
172 utf *utf_clinit; /* <clinit> */
173 utf *utf_clone; /* clone */
174 utf *utf_finalize; /* finalize */
175 utf *utf_run; /* run */
180 utf *utf_removeThread; /* removeThread */
183 utf *utf_uncaughtException; /* uncaughtException */
186 utf *utf_fillInStackTrace; /* fillInStackTrace */
188 utf *utf_getSystemClassLoader; /* getSystemClassLoader */
191 utf *utf_printStackTrace; /* printStackTrace */
193 utf *utf_division_by_zero; /* "/ by zero" */
/* method descriptors */
204 utf *utf_void__void; /* ()V */
205 utf *utf_boolean__void; /* (Z)V */
206 utf *utf_byte__void; /* (B)V */
207 utf *utf_char__void; /* (C)V */
208 utf *utf_short__void; /* (S)V */
209 utf *utf_int__void; /* (I)V */
210 utf *utf_long__void; /* (J)V */
211 utf *utf_float__void; /* (F)V */
212 utf *utf_double__void; /* (D)V */
214 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
215 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
216 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
217 utf *utf_java_lang_ClassLoader_java_lang_String__J; /* (Ljava/lang/ClassLoader;Ljava/lang/String;)J */
218 utf *utf_java_lang_Exception__V; /* (Ljava/lang/Exception;)V */
219 utf *utf_java_lang_Object__java_lang_Object; /* (Ljava/lang/Object;)Ljava/lang/Object; */
220 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
221 utf *utf_java_lang_String__java_lang_Class; /* (Ljava/lang/String;)Ljava/lang/Class; */
222 utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */
223 utf *utf_java_lang_Thread_java_lang_Throwable__V; /* (Ljava/lang/Thread;Ljava/lang/Throwable;)V */
224 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
225 utf *utf_java_lang_Throwable__java_lang_Throwable; /* (Ljava/lang/Throwable;)Ljava/lang/Throwable; */
227 utf *utf_not_named_yet; /* special name for unnamed classes */
229 utf *array_packagename; /* "\t<the array package>" */
232 /* utf_init ********************************************************************
234 Initializes the utf8 subsystem.

Creates the global utf hashtable with HASHTABLE_UTF_SIZE slots and then
interns every well-known symbol through utf_new_char(), storing the
resulting pointers in the global utf* variables so that later code can
compare symbols by pointer.

NOTE(review): the function signature is not visible in this excerpt. The
name used in this header, utf_init, is also the variable that receives the
"<init>" symbol below, so the function itself presumably carries a
different name -- confirm.
236 *******************************************************************************/
240 /* create utf8 hashtable */
242 hashtable_utf = NEW(hashtable);
244 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
246 #if defined(ENABLE_STATISTICS)
/* account for the slot array: one utf pointer per slot */
248 count_utf_len += sizeof(utf*) * hashtable_utf->size;
251 /* create utf-symbols for pointer comparison of frequently used strings */
253 utf_java_lang_Object = utf_new_char("java/lang/Object");
255 utf_java_lang_Class = utf_new_char("java/lang/Class");
256 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
257 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
258 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
259 utf_java_lang_String = utf_new_char("java/lang/String");
260 utf_java_lang_System = utf_new_char("java/lang/System");
261 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
263 utf_java_lang_ref_SoftReference =
264 utf_new_char("java/lang/ref/SoftReference");
266 utf_java_lang_ref_WeakReference =
267 utf_new_char("java/lang/ref/WeakReference");
269 utf_java_lang_ref_PhantomReference =
270 utf_new_char("java/lang/ref/PhantomReference");
272 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
/* Throwable and Error class names */
274 utf_java_lang_Throwable = utf_new_char("java/lang/Throwable");
275 utf_java_lang_Error = utf_new_char("java/lang/Error");
277 utf_java_lang_ClassCircularityError =
278 utf_new_char("java/lang/ClassCircularityError");
280 utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
282 utf_java_lang_ExceptionInInitializerError =
283 utf_new_char("java/lang/ExceptionInInitializerError");
285 utf_java_lang_IncompatibleClassChangeError =
286 utf_new_char("java/lang/IncompatibleClassChangeError");
288 utf_java_lang_InstantiationError =
289 utf_new_char("java/lang/InstantiationError");
291 utf_java_lang_InternalError = utf_new_char("java/lang/InternalError");
292 utf_java_lang_LinkageError = utf_new_char("java/lang/LinkageError");
294 utf_java_lang_NoClassDefFoundError =
295 utf_new_char("java/lang/NoClassDefFoundError");
297 utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
299 utf_java_lang_UnsatisfiedLinkError =
300 utf_new_char("java/lang/UnsatisfiedLinkError");
302 utf_java_lang_UnsupportedClassVersionError =
303 utf_new_char("java/lang/UnsupportedClassVersionError");
305 utf_java_lang_VerifyError = utf_new_char("java/lang/VerifyError");
307 utf_java_lang_VirtualMachineError =
308 utf_new_char("java/lang/VirtualMachineError");
310 #if defined(ENABLE_JAVASE)
311 utf_java_lang_AbstractMethodError =
312 utf_new_char("java/lang/AbstractMethodError");
314 utf_java_lang_NoSuchFieldError =
315 utf_new_char("java/lang/NoSuchFieldError");
317 utf_java_lang_NoSuchMethodError =
318 utf_new_char("java/lang/NoSuchMethodError");
321 #if defined(WITH_CLASSPATH_GNU)
322 utf_java_lang_VMThrowable = utf_new_char("java/lang/VMThrowable");
/* Exception class names */
325 utf_java_lang_Exception = utf_new_char("java/lang/Exception");
327 utf_java_lang_ArithmeticException =
328 utf_new_char("java/lang/ArithmeticException");
330 utf_java_lang_ArrayIndexOutOfBoundsException =
331 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
333 utf_java_lang_ArrayStoreException =
334 utf_new_char("java/lang/ArrayStoreException");
336 utf_java_lang_ClassCastException =
337 utf_new_char("java/lang/ClassCastException");
339 utf_java_lang_ClassNotFoundException =
340 utf_new_char("java/lang/ClassNotFoundException");
342 utf_java_lang_CloneNotSupportedException =
343 utf_new_char("java/lang/CloneNotSupportedException");
345 utf_java_lang_IllegalAccessException =
346 utf_new_char("java/lang/IllegalAccessException");
348 utf_java_lang_IllegalArgumentException =
349 utf_new_char("java/lang/IllegalArgumentException");
351 utf_java_lang_IllegalMonitorStateException =
352 utf_new_char("java/lang/IllegalMonitorStateException");
354 utf_java_lang_InstantiationException =
355 utf_new_char("java/lang/InstantiationException");
357 utf_java_lang_InterruptedException =
358 utf_new_char("java/lang/InterruptedException");
360 utf_java_lang_NegativeArraySizeException =
361 utf_new_char("java/lang/NegativeArraySizeException");
363 utf_java_lang_NullPointerException =
364 utf_new_char("java/lang/NullPointerException");
366 utf_java_lang_StringIndexOutOfBoundsException =
367 utf_new_char("java/lang/StringIndexOutOfBoundsException");
369 utf_java_lang_reflect_InvocationTargetException =
370 utf_new_char("java/lang/reflect/InvocationTargetException");
372 utf_java_security_PrivilegedActionException =
373 utf_new_char("java/security/PrivilegedActionException");
375 #if defined(ENABLE_JAVASE)
376 utf_java_lang_Void = utf_new_char("java/lang/Void");
379 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
380 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
381 utf_java_lang_Character = utf_new_char("java/lang/Character");
382 utf_java_lang_Short = utf_new_char("java/lang/Short");
383 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
384 utf_java_lang_Long = utf_new_char("java/lang/Long");
385 utf_java_lang_Float = utf_new_char("java/lang/Float");
386 utf_java_lang_Double = utf_new_char("java/lang/Double");
388 #if defined(ENABLE_JAVASE)
389 utf_java_lang_StackTraceElement =
390 utf_new_char("java/lang/StackTraceElement");
392 utf_java_lang_reflect_Constructor =
393 utf_new_char("java/lang/reflect/Constructor");
395 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
396 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
397 utf_java_util_Vector = utf_new_char("java/util/Vector");
/* class file attribute names */
400 utf_InnerClasses = utf_new_char("InnerClasses");
401 utf_ConstantValue = utf_new_char("ConstantValue");
402 utf_Code = utf_new_char("Code");
403 utf_Exceptions = utf_new_char("Exceptions");
404 utf_LineNumberTable = utf_new_char("LineNumberTable");
405 utf_SourceFile = utf_new_char("SourceFile");
407 #if defined(ENABLE_JAVASE)
408 utf_EnclosingMethod = utf_new_char("EnclosingMethod");
409 utf_Signature = utf_new_char("Signature");
410 utf_StackMapTable = utf_new_char("StackMapTable");
412 #if defined(ENABLE_ANNOTATIONS)
413 utf_sun_reflect_ConstantPool = utf_new_char("sun/reflect/ConstantPool");
414 #if defined(WITH_CLASSPATH_GNU)
415 utf_sun_reflect_annotation_AnnotationParser = utf_new_char("sun/reflect/annotation/AnnotationParser");
418 utf_RuntimeVisibleAnnotations = utf_new_char("RuntimeVisibleAnnotations");
419 utf_RuntimeInvisibleAnnotations = utf_new_char("RuntimeInvisibleAnnotations");
420 utf_RuntimeVisibleParameterAnnotations = utf_new_char("RuntimeVisibleParameterAnnotations");
421 utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
422 utf_AnnotationDefault = utf_new_char("AnnotationDefault");
/* method and field names */
426 utf_init = utf_new_char("<init>");
427 utf_clinit = utf_new_char("<clinit>");
428 utf_clone = utf_new_char("clone");
429 utf_finalize = utf_new_char("finalize");
430 utf_run = utf_new_char("run");
432 utf_add = utf_new_char("add");
433 utf_remove = utf_new_char("remove");
434 utf_addThread = utf_new_char("addThread");
435 utf_removeThread = utf_new_char("removeThread");
436 utf_put = utf_new_char("put");
437 utf_get = utf_new_char("get");
438 utf_uncaughtException = utf_new_char("uncaughtException");
439 utf_value = utf_new_char("value");
441 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
442 utf_findNative = utf_new_char("findNative");
443 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
444 utf_initCause = utf_new_char("initCause");
445 utf_loadClass = utf_new_char("loadClass");
446 utf_printStackTrace = utf_new_char("printStackTrace");
/* message text */
448 utf_division_by_zero = utf_new_char("/ by zero");
/* single-character type descriptors */
450 utf_Z = utf_new_char("Z");
451 utf_B = utf_new_char("B");
452 utf_C = utf_new_char("C");
453 utf_S = utf_new_char("S");
454 utf_I = utf_new_char("I");
455 utf_J = utf_new_char("J");
456 utf_F = utf_new_char("F");
457 utf_D = utf_new_char("D");
/* method descriptors */
459 utf_void__void = utf_new_char("()V");
460 utf_boolean__void = utf_new_char("(Z)V");
461 utf_byte__void = utf_new_char("(B)V");
462 utf_char__void = utf_new_char("(C)V");
463 utf_short__void = utf_new_char("(S)V");
464 utf_int__void = utf_new_char("(I)V");
465 utf_long__void = utf_new_char("(J)V");
466 utf_float__void = utf_new_char("(F)V");
467 utf_double__void = utf_new_char("(D)V");
468 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
469 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
471 utf_void__java_lang_ClassLoader =
472 utf_new_char("()Ljava/lang/ClassLoader;");
474 utf_java_lang_ClassLoader_java_lang_String__J =
475 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
477 utf_java_lang_Exception__V = utf_new_char("(Ljava/lang/Exception;)V");
479 utf_java_lang_Object__java_lang_Object =
480 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
482 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
484 utf_java_lang_String__java_lang_Class =
485 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
487 utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V");
489 utf_java_lang_Thread_java_lang_Throwable__V =
490 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
492 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
494 utf_java_lang_Throwable__java_lang_Throwable =
495 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
/* special symbols; the two names below start with a tab character */
497 utf_null = utf_new_char("null");
498 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
499 array_packagename = utf_new_char("\t<the array package>");
501 /* everything's ok */
507 /* utf_hashkey *****************************************************************
509 The hashkey is computed from the utf-text by using up to 8
510 characters. For utf-symbols longer than 15 characters 3 characters
511 are taken from the beginning and the end, 2 characters are taken
from the middle.

text.....first byte of the utf text
length...number of bytes in text; selects which bytes are mixed in

The selected bytes are shifted by varying amounts and combined with XOR.
514 *******************************************************************************/
/* Both macros operate on the local variable `text` of utf_hashkey. nbs()
   advances `text` BEFORE reading, so each use consumes the next byte.

   NOTE(review): the expressions below combine several nbs()/fbs() uses with
   ^, which does not sequence its operands, while nbs() modifies `text`. The
   result therefore relies on left-to-right evaluation -- confirm this holds
   for the supported compilers. */
516 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
517 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
519 u4 utf_hashkey(const char *text, u4 length)
521 const char *start_pos = text; /* pointer to utf text */
525 case 0: /* empty string */
/* lengths 1..8: one fbs() plus (length - 1) nbs(), i.e. every byte is used */
528 case 1: return fbs(0);
529 case 2: return fbs(0) ^ nbs(3);
530 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
531 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
532 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
533 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
534 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
535 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
/* longer lengths: a partial result `a` is combined with further bytes */
542 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
551 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
560 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
572 return a ^ nbs(9) ^ nbs(10);
584 return a ^ nbs(9) ^ nbs(10);
595 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
606 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
608 default: /* 3 characters from beginning */
614 /* 2 characters from middle */
615 text = start_pos + (length / 2);
620 /* 3 characters from end */
/* nbs() pre-increments, so the next byte read is start_pos + length - 3 */
621 text = start_pos + length - 4;
626 return a ^ nbs(10) ^ nbs(11);
630 /* utf_full_hashkey ************************************************************
632 This function computes a hash value using all bytes in the string.
634 The algorithm is the "One-at-a-time" algorithm as published
635 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.

text.....first byte of the string
length...number of bytes in text
637 *******************************************************************************/
639 u4 utf_full_hashkey(const char *text, u4 length)
/* the bytes are read as unsigned char, independent of the signedness of char */
641 register const unsigned char *p = (const unsigned char *) text;
/* mixing step of the hash */
649 hash += (hash << 10);
/* presumably part of the final mixing after all bytes were consumed */
653 hash ^= (hash >> 11);
654 hash += (hash << 15);
659 /* unicode_hashkey *************************************************************
661 Compute the hashkey of a unicode string.

The u2 array is handed to utf_hashkey() reinterpreted as a char array.

NOTE(review): len is passed on unchanged as utf_hashkey's length argument,
which is applied to a char pointer there. If len counts u2 units, only the
bytes of the first len/2 characters can contribute -- confirm this is
intended.
663 *******************************************************************************/
665 u4 unicode_hashkey(u2 *text, u2 len)
667 return utf_hashkey((char *) text, len);
671 /* utf_new *********************************************************************
673 Creates a new utf-symbol, the text of the symbol is passed as a
674 u1-array. The function searches the utf-hashtable for a utf-symbol
675 with this text. On success the element is returned, otherwise a new
676 hashtable element is created.
678 If the number of entries in the hashtable exceeds twice the size of
679 the hashtable slots a reorganization of the hashtable is done and
680 the utf symbols are copied to a new hashtable with doubled size.

text.....bytes of the symbol; copied into newly allocated memory, so the
         caller keeps ownership of its buffer
length...number of bytes in text (need not be zero-terminated; the stored
         copy gets a terminating zero appended)

The whole lookup/insert runs inside the monitor of hashtable_utf->header.
682 *******************************************************************************/
684 utf *utf_new(const char *text, u2 length)
686 u4 key; /* hashkey computed from utf-text */
687 u4 slot; /* slot in hashtable */
688 utf *u; /* hashtable element */
691 LOCK_MONITOR_ENTER(hashtable_utf->header);
693 #if defined(ENABLE_STATISTICS)
698 key = utf_hashkey(text, length);
/* masking works as a modulo because the table size is a power of 2 */
699 slot = key & (hashtable_utf->size - 1);
700 u = hashtable_utf->ptr[slot];
702 /* search external hash chain for utf-symbol */
/* cheap length comparison first, bytewise comparison only on equal length */
705 if (u->blength == length) {
706 /* compare text of hashtable elements */
708 for (i = 0; i < length; i++)
709 if (text[i] != u->text[i])
712 #if defined(ENABLE_STATISTICS)
714 count_utf_new_found++;
717 /* symbol found in hashtable */
719 LOCK_MONITOR_EXIT(hashtable_utf->header);
725 u = u->hashlink; /* next element in external chain */
728 /* location in hashtable found, create new utf element */
732 u->blength = length; /* length in bytes of utfstring */
733 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
734 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
736 memcpy(u->text, text, length); /* copy utf-text */
/* terminate the copy so that it can also be used as a C string */
737 u->text[length] = '\0';
739 #if defined(ENABLE_STATISTICS)
741 count_utf_len += sizeof(utf) + length + 1;
/* the new element becomes the head of the chain of its slot */
744 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
745 hashtable_utf->entries++; /* update number of entries */
747 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
749 /* reorganization of hashtable, average length of the external
750 chains is approx. 2 */
752 hashtable *newhash; /* the new hashtable */
758 /* create new hashtable, double the size */
760 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
762 #if defined(ENABLE_STATISTICS)
764 count_utf_len += sizeof(utf*) * hashtable_utf->size;
767 /* transfer elements to new hashtable */
769 for (i = 0; i < hashtable_utf->size; i++) {
770 u = hashtable_utf->ptr[i];
/* the slot must be recomputed because the mask of the new table differs */
774 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
776 u->hashlink = (utf *) newhash->ptr[slot];
777 newhash->ptr[slot] = u;
779 /* follow link in external hash chain */
785 /* dispose old table */
787 hashtable_free(hashtable_utf);
789 hashtable_utf = newhash;
/* NOTE(review): after a resize hashtable_utf refers to newhash, so this
   exit uses a different header than the one entered above -- confirm that
   hashtable_resize hands the monitor over to the new table. */
792 LOCK_MONITOR_EXIT(hashtable_utf->header);
798 /* utf_new_u2 ******************************************************************
800 Make utf symbol from u2 array, if isclassname is true '.' is
converted (see the "convert classname" step below; presumably to '/', as
in utf_new_char_classname -- the replacement itself is not visible in this
excerpt).

unicode_pos......first u2 of the input
unicode_length...number of u2s in the input
isclassname......request the '.' conversion

The characters are encoded into a temporary buffer, which is interned with
utf_new() and released again before returning.
803 *******************************************************************************/
805 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
807 char *buffer; /* memory buffer for unicode characters */
808 char *pos; /* pointer to current position in buffer */
809 u4 left; /* unicode characters left */
810 u4 buflength; /* utf length in bytes of the u2 array */
811 utf *result; /* resulting utf-string */
814 /* determine utf length in bytes and allocate memory */
816 buflength = u2_utflength(unicode_pos, unicode_length);
817 buffer = MNEW(char, buflength);
/* i only counts iterations; the input is walked through unicode_pos */
822 for (i = 0; i++ < unicode_length; unicode_pos++) {
823 /* next unicode character */
/* 1 byte: 0x01..0x7f. Zero fails this test and is therefore encoded by the
   2-byte branch below. */
826 if ((c != 0) && (c < 0x80)) {
/* left is unsigned; the cast detects a wrap-around below zero */
829 if ((int) left < 0) break;
830 /* convert classname */
831 if (isclassname && c == '.')
/* 2 bytes: lead byte 0xC0 | upper bits */
836 } else if (c < 0x800) {
838 unsigned char high = c >> 6;
839 unsigned char low = c & 0x3F;
841 if ((int) left < 0) break;
842 *pos++ = high | 0xC0;
/* 3 bytes: lead byte 0xE0 | upper bits, followed by mid and low */
848 char mid = (c >> 6) & 0x3F;
851 if ((int) left < 0) break;
852 *pos++ = high | 0xE0;
858 /* insert utf-string into symbol-table */
859 result = utf_new(buffer,buflength);
/* utf_new copied the text, so the temporary buffer can be released */
861 MFREE(buffer, char, buflength);
867 /* utf_new_char ****************************************************************
869 Creates a new utf symbol, the text for this symbol is passed as a
870 c-string ( = char* ).

The length is determined with strlen(), so text must be zero-terminated.
The terminating zero is not part of the symbol length passed to utf_new().
872 *******************************************************************************/
874 utf *utf_new_char(const char *text)
876 return utf_new(text, strlen(text));
880 /* utf_new_char_classname ******************************************************
882 Creates a new utf symbol, the text for this symbol is passed as a
883 c-string ( = char* ) "." characters are going to be replaced by
884 "/". Since the above function is used often, this is a separate
885 function, instead of an if.

The input is never modified: if it contains a '.', the replacement is done
on a private copy.
887 *******************************************************************************/
889 utf *utf_new_char_classname(const char *text)
/* only pay for the copy when there is something to replace */
891 if (strchr(text, '.')) {
/* NOTE(review): the release of this strdup() copy is not visible in this
   excerpt -- confirm it is freed after utf_new() has copied the text. */
892 char *txt = strdup(text);
893 char *end = txt + strlen(txt);
897 for (c = txt; c < end; c++)
898 if (*c == '.') *c = '/';
900 tmpRes = utf_new(txt, strlen(txt));
/* no '.' present: intern the caller's string directly */
906 return utf_new(text, strlen(text));
910 /* utf_nextu2 ******************************************************************
912 Read the next unicode character from the utf string and increment
913 the utf-string pointer accordingly.
915 CAUTION: This function is unsafe for input that was not checked
to be well-formed: the continuation bytes utf[1] and utf[2] are read
without any bounds check.

utf_ptr...in/out: address of the current read position; advanced past the
          bytes consumed
918 *******************************************************************************/
920 u2 utf_nextu2(char **utf_ptr)
922 /* uncompressed unicode character */
924 /* current position in utf text */
925 unsigned char *utf = (unsigned char *) (*utf_ptr);
926 /* bytes representing the unicode character */
927 unsigned char ch1, ch2, ch3;
928 /* number of bytes used to represent the unicode character */
/* the upper four bits of the first byte select the encoding length */
931 switch ((ch1 = utf[0]) >> 4) {
932 default: /* 1 byte */
936 case 0xD: /* 2 bytes */
/* a continuation byte has the form 10xxxxxx */
937 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
938 unsigned char high = ch1 & 0x1F;
939 unsigned char low = ch2 & 0x3F;
940 unicode_char = (high << 6) + low;
945 case 0xE: /* 2 or 3 bytes */
946 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
947 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
/* 4 + 6 + 6 payload bits form the 16 bit character */
948 unsigned char low = ch3 & 0x3f;
949 unsigned char mid = ch2 & 0x3f;
950 unsigned char high = ch1 & 0x0f;
951 unicode_char = (((high << 6) + mid) << 6) + low;
959 /* update position in utf-text */
960 *utf_ptr = (char *) (utf + len);
966 /* utf_bytes *******************************************************************
968 Determine number of bytes (aka. octets) in the utf string.
IN:
971 u............utf string
RETURN VALUE:
974 The number of octets of this utf string.
975 There is _no_ terminating zero included in this count.
977 *******************************************************************************/
985 /* utf_get_number_of_u2s_for_buffer ********************************************
987 Determine number of UTF-16 u2s in the given UTF-8 buffer
989 CAUTION: This function is unsafe for input that was not checked
to be well-formed: decoding is delegated to utf_nextu2(), which reads
continuation bytes without bounds checks.
992 CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
993 to an array of u2s (UTF-16) and want to know how many of them you will get.
994 All other uses of this function are probably wrong.
IN:
997 buffer........points to first char in buffer
998 blength.......number of _bytes_ in the buffer
RETURN VALUE:
1001 the number of u2s needed to hold this string in UTF-16 encoding.
1002 There is _no_ terminating zero included in this count.
1004 NOTE: Unlike utf_get_number_of_u2s, this function never throws an
exception; a decode that runs past the end of the buffer is only caught
by an assert.
1007 *******************************************************************************/
1009 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
1011 const char *endpos; /* points behind utf string */
1012 const char *utf_ptr; /* current position in utf text */
1013 u4 len = 0; /* number of unicode characters */
1016 endpos = utf_ptr + blength;
1018 while (utf_ptr < endpos) {
1020 /* next unicode character */
/* utf_nextu2 takes a char **, hence the cast that drops the const */
1021 utf_nextu2((char **)&utf_ptr);
/* the last character must end exactly at the end of the buffer */
1024 assert(utf_ptr == endpos);
1030 /* utf_get_number_of_u2s *******************************************************
1032 Determine number of UTF-16 u2s in the utf string.
1034 CAUTION: This function is unsafe for input that was not checked
to be well-formed: decoding is delegated to utf_nextu2(), which reads
continuation bytes without bounds checks.
1037 CAUTION: Use this function *only* when you want to convert a utf string
1038 to an array of u2s and want to know how many of them you will get.
1039 All other uses of this function are probably wrong.
IN:
1042 u............utf string
RETURN VALUE:
1045 the number of u2s needed to hold this string in UTF-16 encoding.
1046 There is _no_ terminating zero included in this count.
1047 XXX 0 if a NullPointerException has been thrown (see below)
1049 *******************************************************************************/
1051 u4 utf_get_number_of_u2s(utf *u)
1053 char *endpos; /* points behind utf string */
1054 char *utf_ptr; /* current position in utf text */
1055 u4 len = 0; /* number of unicode characters */
1057 /* XXX this is probably not checked by most callers! Review this after */
1058 /* the invalid uses of this function have been eliminated */
/* the guarding condition is not visible in this excerpt; presumably u == NULL */
1060 exceptions_throw_nullpointerexception();
1064 endpos = UTF_END(u);
1067 while (utf_ptr < endpos) {
1069 /* next unicode character */
1070 utf_nextu2(&utf_ptr);
/* a multi-byte sequence that extends past the end leaves utf_ptr beyond endpos */
1073 if (utf_ptr != endpos) {
1074 /* string ended abruptly */
1075 exceptions_throw_internalerror("Illegal utf8 string");
1083 /* utf8_safe_number_of_u2s *****************************************************
1085 Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1086 (For invalid UTF-8 the U+fffd replacement character will be counted.)
1088 This function is safe even for invalid UTF-8 strings.
IN:
1091 text..........zero-terminated(!) UTF-8 string (may be invalid)
1093 nbytes........strlen(text). (This is needed to completely emulate
RETURN VALUE:
1097 the number of u2s needed to hold this string in UTF-16 encoding.
1098 There is _no_ terminating zero included in this count.
1100 *******************************************************************************/
1102 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1103 register const unsigned char *t;
1106 register const unsigned char *tlimit;
1114 assert(nbytes >= 0);
1117 t = (const unsigned char *) text;
1118 tlimit = t + nbytes;
1120 /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1126 /* highest bit set, non-ASCII character */
1128 if ((byte & 0xe0) == 0xc0) {
1129 /* 2-byte: should be 110..... 10...... ? */
/* no limit check is visible before this read; presumably this is why the
   header insists on a zero-terminated string */
1131 if ((*t++ & 0xc0) == 0x80)
1132 ; /* valid 2-byte */
1136 else if ((byte & 0xf0) == 0xe0) {
1137 /* 3-byte: should be 1110.... 10...... 10...... */
/* truncated sequence at the end of the input counts as one more u2 */
1141 return len + 1; /* invalid, stop here */
1143 if ((*t++ & 0xc0) == 0x80) {
1144 if ((*t++ & 0xc0) == 0x80)
1145 ; /* valid 3-byte */
1152 else if ((byte & 0xf8) == 0xf0) {
1153 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1157 return len + 1; /* invalid, stop here */
1159 if (((byte1 = *t++) & 0xc0) == 0x80) {
1160 if (((byte2 = *t++) & 0xc0) == 0x80) {
1161 if (((byte3 = *t++) & 0xc0) == 0x80) {
1162 /* valid 4-byte UTF-8? */
/* 3 + 6 + 6 + 6 payload bits form the code point */
1163 value = ((byte & 0x07) << 18)
1164 | ((byte1 & 0x3f) << 12)
1165 | ((byte2 & 0x3f) << 6)
1166 | ((byte3 & 0x3f) );
/* beyond the Unicode range */
1168 if (value > 0x10FFFF)
/* outside the 16 bit range: takes two u2s, hence one extra */
1170 else if (value > 0xFFFF)
1171 len += 1; /* we need surrogates */
1173 ; /* 16bit suffice */
1184 else if ((byte & 0xfc) == 0xf8) {
1185 /* invalid 5-byte */
1187 return len + 1; /* invalid, stop here */
/* step over at most `skip` continuation bytes of the invalid sequence */
1190 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1193 else if ((byte & 0xfe) == 0xfc) {
1194 /* invalid 6-byte */
1196 return len + 1; /* invalid, stop here */
1199 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1211 /* ASCII character, common case */
1221 /* utf8_safe_convert_to_u2s ****************************************************
1223 Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1224 (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1225 Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1227 This function is safe even for invalid UTF-8 strings.
IN:
1230 text..........zero-terminated(!) UTF-8 string (may be invalid)
1232 nbytes........strlen(text). (This is needed to completely emulate
1234 buffer........a preallocated array of u2s to receive the decoded
1235 string. Use utf8_safe_number_of_u2s to get the
1236 required number of u2s for allocating this.
1238 *******************************************************************************/
/* written to the output for every invalid or truncated sequence */
1240 #define UNICODE_REPLACEMENT 0xfffd
1242 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1243 register const unsigned char *t;
1245 register const unsigned char *tlimit;
1253 assert(nbytes >= 0);
1255 t = (const unsigned char *) text;
1256 tlimit = t + nbytes;
1258 /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1264 /* highest bit set, non-ASCII character */
1266 if ((byte & 0xe0) == 0xc0) {
1267 /* 2-byte: should be 110..... 10...... */
1269 if (((byte1 = *t++) & 0xc0) == 0x80) {
1270 /* valid 2-byte UTF-8 */
1271 *buffer++ = ((byte & 0x1f) << 6)
1272 | ((byte1 & 0x3f) );
1275 *buffer++ = UNICODE_REPLACEMENT;
1279 else if ((byte & 0xf0) == 0xe0) {
1280 /* 3-byte: should be 1110.... 10...... 10...... */
/* both continuation bytes must lie within the input */
1282 if (t + 2 > tlimit) {
1283 *buffer++ = UNICODE_REPLACEMENT;
1287 if (((byte1 = *t++) & 0xc0) == 0x80) {
1288 if (((byte2 = *t++) & 0xc0) == 0x80) {
1289 /* valid 3-byte UTF-8 */
1290 *buffer++ = ((byte & 0x0f) << 12)
1291 | ((byte1 & 0x3f) << 6)
1292 | ((byte2 & 0x3f) );
1295 *buffer++ = UNICODE_REPLACEMENT;
1300 *buffer++ = UNICODE_REPLACEMENT;
1304 else if ((byte & 0xf8) == 0xf0) {
1305 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1307 if (t + 3 > tlimit) {
1308 *buffer++ = UNICODE_REPLACEMENT;
1312 if (((byte1 = *t++) & 0xc0) == 0x80) {
1313 if (((byte2 = *t++) & 0xc0) == 0x80) {
1314 if (((byte3 = *t++) & 0xc0) == 0x80) {
1315 /* valid 4-byte UTF-8? */
1316 value = ((byte & 0x07) << 18)
1317 | ((byte1 & 0x3f) << 12)
1318 | ((byte2 & 0x3f) << 6)
1319 | ((byte3 & 0x3f) );
/* beyond the Unicode range */
1321 if (value > 0x10FFFF) {
1322 *buffer++ = UNICODE_REPLACEMENT;
1324 else if (value > 0xFFFF) {
1325 /* we need surrogates */
/* (value >> 10) - 0x40 equals (value - 0x10000) >> 10, the upper ten bits
   of the offset; the lower ten bits go into the second u2 */
1326 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1327 *buffer++ = 0xdc00 | (value & 0x03ff);
1330 *buffer++ = value; /* 16bit suffice */
1333 *buffer++ = UNICODE_REPLACEMENT;
1338 *buffer++ = UNICODE_REPLACEMENT;
1343 *buffer++ = UNICODE_REPLACEMENT;
/* 5-byte form (111110..): always invalid, yields one replacement */
1347 else if ((byte & 0xfc) == 0xf8) {
1348 if (t + 4 > tlimit) {
1349 *buffer++ = UNICODE_REPLACEMENT;
/* step over at most `skip` continuation bytes of the invalid sequence */
1354 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1356 *buffer++ = UNICODE_REPLACEMENT;
/* 6-byte form (1111110.): always invalid, yields one replacement */
1358 else if ((byte & 0xfe) == 0xfc) {
1359 if (t + 5 > tlimit) {
1360 *buffer++ = UNICODE_REPLACEMENT;
1365 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1367 *buffer++ = UNICODE_REPLACEMENT;
1370 *buffer++ = UNICODE_REPLACEMENT;
1378 /* ASCII character, common case */
1386 /* u2_utflength ****************************************************************
1388 Returns the utf length in bytes of a u2 array.
1390 *******************************************************************************/
1392 u4 u2_utflength(u2 *text, u4 u2_length)
1394 u4 result_len = 0; /* utf length in bytes */
1395 u2 ch; /* current unicode character */
1398 for (len = 0; len < u2_length; len++) {
1399 /* next unicode character */
1402 /* determine bytes required to store unicode character as utf */
1403 if (ch && (ch < 0x80))
1405 else if (ch < 0x800)
1415 /* utf_copy ********************************************************************
1417 Copy the given utf string byte-for-byte to a buffer.
1420 buffer.......the buffer
1421 u............the utf string
1423 *******************************************************************************/
1425 void utf_copy(char *buffer, utf *u)
1427 /* our utf strings are zero-terminated (done by utf_new) */
1428 MCOPY(buffer, u->text, char, u->blength + 1);
1432 /* utf_cat *********************************************************************
1434 Append the given utf string byte-for-byte to a buffer.
1437 buffer.......the buffer
1438 u............the utf string
1440 *******************************************************************************/
1442 void utf_cat(char *buffer, utf *u)
1444 /* our utf strings are zero-terminated (done by utf_new) */
1445 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1449 /* utf_copy_classname **********************************************************
1451 Copy the given utf classname byte-for-byte to a buffer.
1452 '/' is replaced by '.'
1455 buffer.......the buffer
1456 u............the utf string
1458 *******************************************************************************/
1460 void utf_copy_classname(char *buffer, utf *u)
1469 endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1471 while (srcptr != endptr) {
1480 /* utf_cat *********************************************************************
1482 Append the given utf classname byte-for-byte to a buffer.
1483 '/' is replaced by '.'
1486 buffer.......the buffer
1487 u............the utf string
1489 *******************************************************************************/
1491 void utf_cat_classname(char *buffer, utf *u)
1493 utf_copy_classname(buffer + strlen(buffer), u);
1496 /* utf_display_printable_ascii *************************************************
1498 Write utf symbol to stdout (for debugging purposes).
1499 Non-printable and non-ASCII characters are printed as '?'.
1501 *******************************************************************************/
1503 void utf_display_printable_ascii(utf *u)
1505 char *endpos; /* points behind utf string */
1506 char *utf_ptr; /* current position in utf text */
1514 endpos = UTF_END(u);
1517 while (utf_ptr < endpos) {
1518 /* read next unicode character */
1520 u2 c = utf_nextu2(&utf_ptr);
1522 if ((c >= 32) && (c <= 127))
1532 /* utf_display_printable_ascii_classname ***************************************
1534 Write utf symbol to stdout with `/' converted to `.' (for debugging
1536 Non-printable and non-ASCII characters are printed as '?'.
1538 *******************************************************************************/
1540 void utf_display_printable_ascii_classname(utf *u)
1542 char *endpos; /* points behind utf string */
1543 char *utf_ptr; /* current position in utf text */
1551 endpos = UTF_END(u);
1554 while (utf_ptr < endpos) {
1555 /* read next unicode character */
1557 u2 c = utf_nextu2(&utf_ptr);
1562 if ((c >= 32) && (c <= 127))
1572 /* utf_sprint_convert_to_latin1 ************************************************
1574 Write utf symbol into c-string (for debugging purposes).
1575 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1578 *******************************************************************************/
1580 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1582 char *endpos; /* points behind utf string */
1583 char *utf_ptr; /* current position in utf text */
1584 u2 pos = 0; /* position in c-string */
1587 strcpy(buffer, "NULL");
1591 endpos = UTF_END(u);
1594 while (utf_ptr < endpos)
1595 /* copy next unicode character */
1596 buffer[pos++] = utf_nextu2(&utf_ptr);
1598 /* terminate string */
1603 /* utf_sprint_convert_to_latin1_classname **************************************
1605 Write utf symbol into c-string with `/' converted to `.' (for debugging
1607 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1610 *******************************************************************************/
1612 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1614 char *endpos; /* points behind utf string */
1615 char *utf_ptr; /* current position in utf text */
1616 u2 pos = 0; /* position in c-string */
1619 strcpy(buffer, "NULL");
1623 endpos = UTF_END(u);
1626 while (utf_ptr < endpos) {
1627 /* copy next unicode character */
1628 u2 c = utf_nextu2(&utf_ptr);
1629 if (c == '/') c = '.';
1633 /* terminate string */
1638 /* utf_strcat_convert_to_latin1 ************************************************
1640 Like libc strcat, but uses an utf8 string.
1641 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1644 *******************************************************************************/
1646 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1648 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1652 /* utf_strcat_convert_to_latin1_classname **************************************
1654 Like libc strcat, but uses an utf8 string.
1655 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1658 *******************************************************************************/
1660 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1662 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1666 /* utf_fprint_printable_ascii **************************************************
1668 Write utf symbol into file.
1669 Non-printable and non-ASCII characters are printed as '?'.
1671 *******************************************************************************/
1673 void utf_fprint_printable_ascii(FILE *file, utf *u)
1675 char *endpos; /* points behind utf string */
1676 char *utf_ptr; /* current position in utf text */
1681 endpos = UTF_END(u);
1684 while (utf_ptr < endpos) {
1685 /* read next unicode character */
1686 u2 c = utf_nextu2(&utf_ptr);
1688 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1689 else fprintf(file, "?");
1694 /* utf_fprint_printable_ascii_classname ****************************************
1696 Write utf symbol into file with `/' converted to `.'.
1697 Non-printable and non-ASCII characters are printed as '?'.
1699 *******************************************************************************/
1701 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1703 char *endpos; /* points behind utf string */
1704 char *utf_ptr; /* current position in utf text */
1709 endpos = UTF_END(u);
1712 while (utf_ptr < endpos) {
1713 /* read next unicode character */
1714 u2 c = utf_nextu2(&utf_ptr);
1715 if (c == '/') c = '.';
1717 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1718 else fprintf(file, "?");
/* is_valid_utf ****************************************************************

   Check whether a byte range holds a UTF-8 string this VM accepts.

   Accepted are non-zero ASCII bytes, 2-byte sequences and 3-byte
   sequences. Zero bytes, longer sequences, stray or missing continuation
   bytes and a 3-byte encoding of zero are rejected. The 2-byte encoding
   of zero (0xc0 0x80) is accepted.

   Deliberately NOT rejected, as Sun Java seems to allow them in
   classfiles: overlong encodings, surrogates (0xd800..0xdfff) and the
   code points 0xfffe and 0xffff.

   IN:
      utf_ptr...points to first character
      end_pos...points after last character

   RETURN VALUE:
      true......the range is acceptable (an empty range is, too)
      false.....otherwise, or if end_pos lies before utf_ptr

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int           remaining;            /* bytes not yet consumed             */
	int           ncont;                /* number of continuation bytes       */
	int           k;
	char          ch;
	unsigned long cp;                   /* decoded code point                 */

	if (end_pos < utf_ptr)
		return false;

	remaining = end_pos - utf_ptr;

	while (remaining-- != 0) {
		ch = *utf_ptr++;

		if (ch == 0)
			return false;                         /* 0x00 is not allowed */

		if (!(ch & 0x80))
			continue;                             /* ASCII */

		/* Only 2- and 3-byte sequences pass. The 4-, 5- and 6-byte    */
		/* lead bytes (1111 0xxx, 1111 10xx, 1111 110x) fall under the  */
		/* Java limitation, anything else is no lead byte at all.       */

		if ((ch & 0xe0) == 0xc0)
			ncont = 1;                            /* 110x xxxx */
		else if ((ch & 0xf0) == 0xe0)
			ncont = 2;                            /* 1110 xxxx */
		else
			return false;

		/* payload bits of the lead byte */

		cp = (unsigned long) ch & (0x3f >> ncont);

		remaining -= ncont;

		if (remaining < 0)
			return false;                         /* missing bytes */

		for (k = 0; k < ncont; k++) {
			ch = *utf_ptr++;

			if ((ch & 0xc0) != 0x80)              /* 10xx xxxx */
				return false;

			cp = (cp << 6) | (ch & 0x3f);
		}

		/* Java special: zero may only be written as a 2-byte sequence */

		if ((cp == 0) && (ncont != 1))
			return false;
	}

	return true;
}
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the encoded zero character.)

   NOTE: The string is assumed to have passed is_valid_utf!

   IN:
      utf_ptr...points to first character
      end_pos...points after last character

   RETURN VALUE:
      true......the name is acceptable
      false.....the range is empty or contains a byte below 0x20 or the
                byte pair 0xc0 0x80

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters */

		/* 0xc0 0x80 is the 2-byte encoding of zero. The second byte is   */
		/* only looked at if it still belongs to the given range, so that */
		/* nothing at or behind end_pos is ever read.                     */

		if ((c == 0xc0) && (utf_ptr < end_pos)
			&& ((unsigned char) *utf_ptr == 0x80))
			return false;               /* disallow zero */
	}

	return true;
}
1818 bool is_valid_name_utf(utf *u)
1820 return is_valid_name(u->text, UTF_END(u));
1824 /* utf_show ********************************************************************
1826 Writes the utf symbols in the utfhash to stdout and displays the
1827 number of external hash chains grouped according to the chainlength
1828 (for debugging purposes).
1830 *******************************************************************************/
1832 #if !defined(NDEBUG)
1836 #define CHAIN_LIMIT 20 /* limit for seperated enumeration */
1838 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1839 u4 max_chainlength = 0; /* maximum length of the chains */
1840 u4 sum_chainlength = 0; /* sum of the chainlengths */
1841 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1844 printf("UTF-HASH:\n");
1846 /* show element of utf-hashtable */
1848 for (i = 0; i < hashtable_utf->size; i++) {
1849 utf *u = hashtable_utf->ptr[i];
1852 printf("SLOT %d: ", (int) i);
1856 utf_display_printable_ascii(u);
1864 printf("UTF-HASH: %d slots for %d entries\n",
1865 (int) hashtable_utf->size, (int) hashtable_utf->entries );
1867 if (hashtable_utf->entries == 0)
1870 printf("chains:\n chainlength number of chains %% of utfstrings\n");
1872 for (i=0;i<CHAIN_LIMIT;i++)
1875 /* count numbers of hashchains according to their length */
1876 for (i=0; i<hashtable_utf->size; i++) {
1878 utf *u = (utf*) hashtable_utf->ptr[i];
1879 u4 chain_length = 0;
1881 /* determine chainlength */
1887 /* update sum of all chainlengths */
1888 sum_chainlength+=chain_length;
1890 /* determine the maximum length of the chains */
1891 if (chain_length>max_chainlength)
1892 max_chainlength = chain_length;
1894 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1895 if (chain_length>=CHAIN_LIMIT) {
1896 beyond_limit+=chain_length;
1897 chain_length=CHAIN_LIMIT-1;
1900 /* update number of hashchains of current length */
1901 chain_count[chain_length]++;
1904 /* display results */
1905 for (i=1;i<CHAIN_LIMIT-1;i++)
1906 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1908 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1911 printf("max. chainlength:%5d\n",max_chainlength);
1913 /* avg. chainlength = sum of chainlengths / number of chains */
1914 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1916 #endif /* !defined(NDEBUG) */
1920 * These are local overrides for various environment variables in Emacs.
1921 * Please do not remove this and leave it at the end of the file, where
1922 * Emacs will automagically detect them.
1923 * ---------------------------------------------------------------------
1926 * indent-tabs-mode: t
1930 * vim:noexpandtab:sw=4:ts=4: