1 /* src/vmcore/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
4 C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5 E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6 J. Wenninger, Institut f. Computersprachen - TU Wien
8 This file is part of CACAO.
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License as
12 published by the Free Software Foundation; either version 2, or (at
13 your option) any later version.
15 This program is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
35 #include "mm/memory.h"
37 #include "threads/lock-common.h"
39 #include "toolbox/hashtable.h"
41 #include "vm/exceptions.h"
43 #include "vmcore/options.h"
45 #if defined(ENABLE_STATISTICS)
46 # include "vmcore/statistics.h"
49 #include "vmcore/utf8.h"
52 /* global variables ***********************************************************/
54 /* hashsize must be power of 2 */
56 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
58 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
61 /* utf-symbols for pointer comparison of frequently used strings **************/
63 utf *utf_java_lang_Object;
65 utf *utf_java_lang_Class;
66 utf *utf_java_lang_ClassLoader;
67 utf *utf_java_lang_Cloneable;
68 utf *utf_java_lang_SecurityManager;
69 utf *utf_java_lang_String;
70 utf *utf_java_lang_System;
71 utf *utf_java_lang_ThreadGroup;
72 utf *utf_java_lang_ref_SoftReference;
73 utf *utf_java_lang_ref_WeakReference;
74 utf *utf_java_lang_ref_PhantomReference;
75 utf *utf_java_io_Serializable;
77 utf *utf_java_lang_Throwable;
78 utf *utf_java_lang_Error;
80 utf *utf_java_lang_AbstractMethodError;
81 utf *utf_java_lang_ClassCircularityError;
82 utf *utf_java_lang_ClassFormatError;
83 utf *utf_java_lang_ExceptionInInitializerError;
84 utf *utf_java_lang_IncompatibleClassChangeError;
85 utf *utf_java_lang_InstantiationError;
86 utf *utf_java_lang_InternalError;
87 utf *utf_java_lang_LinkageError;
88 utf *utf_java_lang_NoClassDefFoundError;
89 utf *utf_java_lang_NoSuchFieldError;
90 utf *utf_java_lang_NoSuchMethodError;
91 utf *utf_java_lang_OutOfMemoryError;
92 utf *utf_java_lang_UnsatisfiedLinkError;
93 utf *utf_java_lang_UnsupportedClassVersionError;
94 utf *utf_java_lang_VerifyError;
95 utf *utf_java_lang_VirtualMachineError;
97 #if defined(WITH_CLASSPATH_GNU)
98 utf *utf_java_lang_VMThrowable;
101 utf *utf_java_lang_Exception;
103 utf *utf_java_lang_ArithmeticException;
104 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
105 utf *utf_java_lang_ArrayStoreException;
106 utf *utf_java_lang_ClassCastException;
107 utf *utf_java_lang_ClassNotFoundException;
108 utf *utf_java_lang_CloneNotSupportedException;
109 utf *utf_java_lang_IllegalAccessException;
110 utf *utf_java_lang_IllegalArgumentException;
111 utf *utf_java_lang_IllegalMonitorStateException;
112 utf *utf_java_lang_InstantiationException;
113 utf *utf_java_lang_InterruptedException;
114 utf *utf_java_lang_NegativeArraySizeException;
115 utf *utf_java_lang_NullPointerException;
116 utf *utf_java_lang_StringIndexOutOfBoundsException;
118 utf *utf_java_lang_reflect_InvocationTargetException;
120 utf *utf_java_security_PrivilegedActionException;
122 #if defined(ENABLE_JAVASE)
123 utf* utf_java_lang_Void;
126 utf* utf_java_lang_Boolean;
127 utf* utf_java_lang_Byte;
128 utf* utf_java_lang_Character;
129 utf* utf_java_lang_Short;
130 utf* utf_java_lang_Integer;
131 utf* utf_java_lang_Long;
132 utf* utf_java_lang_Float;
133 utf* utf_java_lang_Double;
135 #if defined(ENABLE_JAVASE)
136 utf *utf_java_lang_StackTraceElement;
137 utf *utf_java_lang_reflect_Constructor;
138 utf *utf_java_lang_reflect_Field;
139 utf *utf_java_lang_reflect_Method;
140 utf *utf_java_util_Vector;
143 utf *utf_InnerClasses; /* InnerClasses */
144 utf *utf_ConstantValue; /* ConstantValue */
145 utf *utf_Code; /* Code */
146 utf *utf_Exceptions; /* Exceptions */
147 utf *utf_LineNumberTable; /* LineNumberTable */
148 utf *utf_SourceFile; /* SourceFile */
150 #if defined(ENABLE_JAVASE)
151 utf *utf_EnclosingMethod;
153 utf *utf_StackMapTable;
155 #if defined(ENABLE_ANNOTATIONS)
156 utf *utf_sun_reflect_ConstantPool;
157 #if defined(WITH_CLASSPATH_GNU)
158 utf *utf_sun_reflect_annotation_AnnotationParser;
161 utf *utf_RuntimeVisibleAnnotations;
162 utf *utf_RuntimeInvisibleAnnotations;
163 utf *utf_RuntimeVisibleParameterAnnotations;
164 utf *utf_RuntimeInvisibleParameterAnnotations;
165 utf *utf_AnnotationDefault;
169 utf *utf_init; /* <init> */
170 utf *utf_clinit; /* <clinit> */
171 utf *utf_clone; /* clone */
172 utf *utf_finalize; /* finalize */
173 utf *utf_run; /* run */
178 utf *utf_removeThread;
181 utf *utf_uncaughtException;
184 utf *utf_fillInStackTrace;
186 utf *utf_getSystemClassLoader;
189 utf *utf_printStackTrace;
191 utf *utf_division_by_zero;
202 utf *utf_void__void; /* ()V */
203 utf *utf_boolean__void; /* (Z)V */
204 utf *utf_byte__void; /* (B)V */
205 utf *utf_char__void; /* (C)V */
206 utf *utf_short__void; /* (S)V */
207 utf *utf_int__void; /* (I)V */
208 utf *utf_long__void; /* (J)V */
209 utf *utf_float__void; /* (F)V */
210 utf *utf_double__void; /* (D)V */
212 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
213 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
214 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
215 utf *utf_java_lang_ClassLoader_java_lang_String__J;
216 utf *utf_java_lang_Exception__V; /* (Ljava/lang/Exception;)V */
217 utf *utf_java_lang_Object__java_lang_Object;
218 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
219 utf *utf_java_lang_String__java_lang_Class;
220 utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */
221 utf *utf_java_lang_Thread_java_lang_Throwable__V;
222 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
223 utf *utf_java_lang_Throwable__java_lang_Throwable;
225 utf *utf_not_named_yet; /* special name for unnamed classes */
227 utf *array_packagename;
230 /* utf_init ********************************************************************
232 Initializes the utf8 subsystem.
234 *******************************************************************************/
238 /* create utf8 hashtable */
240 hashtable_utf = NEW(hashtable);
242 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
244 #if defined(ENABLE_STATISTICS)
246 count_utf_len += sizeof(utf*) * hashtable_utf->size;
249 /* create utf-symbols for pointer comparison of frequently used strings */
251 utf_java_lang_Object = utf_new_char("java/lang/Object");
253 utf_java_lang_Class = utf_new_char("java/lang/Class");
254 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
255 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
256 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
257 utf_java_lang_String = utf_new_char("java/lang/String");
258 utf_java_lang_System = utf_new_char("java/lang/System");
259 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
261 utf_java_lang_ref_SoftReference =
262 utf_new_char("java/lang/ref/SoftReference");
264 utf_java_lang_ref_WeakReference =
265 utf_new_char("java/lang/ref/WeakReference");
267 utf_java_lang_ref_PhantomReference =
268 utf_new_char("java/lang/ref/PhantomReference");
270 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
272 utf_java_lang_Throwable = utf_new_char("java/lang/Throwable");
273 utf_java_lang_Error = utf_new_char("java/lang/Error");
275 utf_java_lang_ClassCircularityError =
276 utf_new_char("java/lang/ClassCircularityError");
278 utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
280 utf_java_lang_ExceptionInInitializerError =
281 utf_new_char("java/lang/ExceptionInInitializerError");
283 utf_java_lang_IncompatibleClassChangeError =
284 utf_new_char("java/lang/IncompatibleClassChangeError");
286 utf_java_lang_InstantiationError =
287 utf_new_char("java/lang/InstantiationError");
289 utf_java_lang_InternalError = utf_new_char("java/lang/InternalError");
290 utf_java_lang_LinkageError = utf_new_char("java/lang/LinkageError");
292 utf_java_lang_NoClassDefFoundError =
293 utf_new_char("java/lang/NoClassDefFoundError");
295 utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
297 utf_java_lang_UnsatisfiedLinkError =
298 utf_new_char("java/lang/UnsatisfiedLinkError");
300 utf_java_lang_UnsupportedClassVersionError =
301 utf_new_char("java/lang/UnsupportedClassVersionError");
303 utf_java_lang_VerifyError = utf_new_char("java/lang/VerifyError");
305 utf_java_lang_VirtualMachineError =
306 utf_new_char("java/lang/VirtualMachineError");
308 #if defined(ENABLE_JAVASE)
309 utf_java_lang_AbstractMethodError =
310 utf_new_char("java/lang/AbstractMethodError");
312 utf_java_lang_NoSuchFieldError =
313 utf_new_char("java/lang/NoSuchFieldError");
315 utf_java_lang_NoSuchMethodError =
316 utf_new_char("java/lang/NoSuchMethodError");
319 #if defined(WITH_CLASSPATH_GNU)
320 utf_java_lang_VMThrowable = utf_new_char("java/lang/VMThrowable");
323 utf_java_lang_Exception = utf_new_char("java/lang/Exception");
325 utf_java_lang_ArithmeticException =
326 utf_new_char("java/lang/ArithmeticException");
328 utf_java_lang_ArrayIndexOutOfBoundsException =
329 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
331 utf_java_lang_ArrayStoreException =
332 utf_new_char("java/lang/ArrayStoreException");
334 utf_java_lang_ClassCastException =
335 utf_new_char("java/lang/ClassCastException");
337 utf_java_lang_ClassNotFoundException =
338 utf_new_char("java/lang/ClassNotFoundException");
340 utf_java_lang_CloneNotSupportedException =
341 utf_new_char("java/lang/CloneNotSupportedException");
343 utf_java_lang_IllegalAccessException =
344 utf_new_char("java/lang/IllegalAccessException");
346 utf_java_lang_IllegalArgumentException =
347 utf_new_char("java/lang/IllegalArgumentException");
349 utf_java_lang_IllegalMonitorStateException =
350 utf_new_char("java/lang/IllegalMonitorStateException");
352 utf_java_lang_InstantiationException =
353 utf_new_char("java/lang/InstantiationException");
355 utf_java_lang_InterruptedException =
356 utf_new_char("java/lang/InterruptedException");
358 utf_java_lang_NegativeArraySizeException =
359 utf_new_char("java/lang/NegativeArraySizeException");
361 utf_java_lang_NullPointerException =
362 utf_new_char("java/lang/NullPointerException");
364 utf_java_lang_StringIndexOutOfBoundsException =
365 utf_new_char("java/lang/StringIndexOutOfBoundsException");
367 utf_java_lang_reflect_InvocationTargetException =
368 utf_new_char("java/lang/reflect/InvocationTargetException");
370 utf_java_security_PrivilegedActionException =
371 utf_new_char("java/security/PrivilegedActionException");
373 #if defined(ENABLE_JAVASE)
374 utf_java_lang_Void = utf_new_char("java/lang/Void");
377 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
378 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
379 utf_java_lang_Character = utf_new_char("java/lang/Character");
380 utf_java_lang_Short = utf_new_char("java/lang/Short");
381 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
382 utf_java_lang_Long = utf_new_char("java/lang/Long");
383 utf_java_lang_Float = utf_new_char("java/lang/Float");
384 utf_java_lang_Double = utf_new_char("java/lang/Double");
386 #if defined(ENABLE_JAVASE)
387 utf_java_lang_StackTraceElement =
388 utf_new_char("java/lang/StackTraceElement");
390 utf_java_lang_reflect_Constructor =
391 utf_new_char("java/lang/reflect/Constructor");
393 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
394 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
395 utf_java_util_Vector = utf_new_char("java/util/Vector");
398 utf_InnerClasses = utf_new_char("InnerClasses");
399 utf_ConstantValue = utf_new_char("ConstantValue");
400 utf_Code = utf_new_char("Code");
401 utf_Exceptions = utf_new_char("Exceptions");
402 utf_LineNumberTable = utf_new_char("LineNumberTable");
403 utf_SourceFile = utf_new_char("SourceFile");
405 #if defined(ENABLE_JAVASE)
406 utf_EnclosingMethod = utf_new_char("EnclosingMethod");
407 utf_Signature = utf_new_char("Signature");
408 utf_StackMapTable = utf_new_char("StackMapTable");
410 #if defined(ENABLE_ANNOTATIONS)
411 utf_sun_reflect_ConstantPool = utf_new_char("sun/reflect/ConstantPool");
412 #if defined(WITH_CLASSPATH_GNU)
413 utf_sun_reflect_annotation_AnnotationParser = utf_new_char("sun/reflect/annotation/AnnotationParser");
416 utf_RuntimeVisibleAnnotations = utf_new_char("RuntimeVisibleAnnotations");
417 utf_RuntimeInvisibleAnnotations = utf_new_char("RuntimeInvisibleAnnotations");
418 utf_RuntimeVisibleParameterAnnotations = utf_new_char("RuntimeVisibleParameterAnnotations");
419 utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
420 utf_AnnotationDefault = utf_new_char("AnnotationDefault");
424 utf_init = utf_new_char("<init>");
425 utf_clinit = utf_new_char("<clinit>");
426 utf_clone = utf_new_char("clone");
427 utf_finalize = utf_new_char("finalize");
428 utf_run = utf_new_char("run");
430 utf_add = utf_new_char("add");
431 utf_remove = utf_new_char("remove");
432 utf_addThread = utf_new_char("addThread");
433 utf_removeThread = utf_new_char("removeThread");
434 utf_put = utf_new_char("put");
435 utf_get = utf_new_char("get");
436 utf_uncaughtException = utf_new_char("uncaughtException");
437 utf_value = utf_new_char("value");
439 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
440 utf_findNative = utf_new_char("findNative");
441 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
442 utf_initCause = utf_new_char("initCause");
443 utf_loadClass = utf_new_char("loadClass");
444 utf_printStackTrace = utf_new_char("printStackTrace");
446 utf_division_by_zero = utf_new_char("/ by zero");
448 utf_Z = utf_new_char("Z");
449 utf_B = utf_new_char("B");
450 utf_C = utf_new_char("C");
451 utf_S = utf_new_char("S");
452 utf_I = utf_new_char("I");
453 utf_J = utf_new_char("J");
454 utf_F = utf_new_char("F");
455 utf_D = utf_new_char("D");
457 utf_void__void = utf_new_char("()V");
458 utf_boolean__void = utf_new_char("(Z)V");
459 utf_byte__void = utf_new_char("(B)V");
460 utf_char__void = utf_new_char("(C)V");
461 utf_short__void = utf_new_char("(S)V");
462 utf_int__void = utf_new_char("(I)V");
463 utf_long__void = utf_new_char("(J)V");
464 utf_float__void = utf_new_char("(F)V");
465 utf_double__void = utf_new_char("(D)V");
466 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
467 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
469 utf_void__java_lang_ClassLoader =
470 utf_new_char("()Ljava/lang/ClassLoader;");
472 utf_java_lang_ClassLoader_java_lang_String__J =
473 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
475 utf_java_lang_Exception__V = utf_new_char("(Ljava/lang/Exception;)V");
477 utf_java_lang_Object__java_lang_Object =
478 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
480 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
482 utf_java_lang_String__java_lang_Class =
483 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
485 utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V");
487 utf_java_lang_Thread_java_lang_Throwable__V =
488 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
490 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
492 utf_java_lang_Throwable__java_lang_Throwable =
493 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
495 utf_null = utf_new_char("null");
496 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
497 array_packagename = utf_new_char("\t<the array package>");
499 /* everything's ok */
505 /* utf_hashkey *****************************************************************
507 The hashkey is computed from the utf-text by using up to 8
508 characters. For utf-symbols longer than 15 characters 3 characters
509 are taken from the beginning and the end, 2 characters are taken from the middle.
512 *******************************************************************************/
514 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
515 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
517 u4 utf_hashkey(const char *text, u4 length)
519 const char *start_pos = text; /* pointer to utf text */
523 case 0: /* empty string */
526 case 1: return fbs(0);
527 case 2: return fbs(0) ^ nbs(3);
528 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
529 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
530 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
531 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
532 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
533 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
540 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
549 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
558 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
570 return a ^ nbs(9) ^ nbs(10);
582 return a ^ nbs(9) ^ nbs(10);
593 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
604 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
606 default: /* 3 characters from beginning */
612 /* 2 characters from middle */
613 text = start_pos + (length / 2);
618 /* 3 characters from end */
619 text = start_pos + length - 4;
624 return a ^ nbs(10) ^ nbs(11);
628 /* utf_full_hashkey ************************************************************
630 This function computes a hash value using all bytes in the string.
632 The algorithm is the "One-at-a-time" algorithm as published
633 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
635 *******************************************************************************/
637 u4 utf_full_hashkey(const char *text, u4 length)
639 register const unsigned char *p = (const unsigned char *) text;
647 hash += (hash << 10);
651 hash ^= (hash >> 11);
652 hash += (hash << 15);
657 /* unicode_hashkey *************************************************************
659 Compute the hashkey of a unicode string.
661 *******************************************************************************/
663 u4 unicode_hashkey(u2 *text, u2 len)
665 return utf_hashkey((char *) text, len);
669 /* utf_new *********************************************************************
671 Creates a new utf-symbol, the text of the symbol is passed as a
672 u1-array. The function searches the utf-hashtable for a utf-symbol
673 with this text. On success the element returned, otherwise a new
674 hashtable element is created.
676 If the number of entries in the hashtable exceeds twice the size of
677 the hashtable slots a reorganization of the hashtable is done and
678 the utf symbols are copied to a new hashtable with doubled size.
680 *******************************************************************************/
682 utf *utf_new(const char *text, u2 length)
684 u4 key; /* hashkey computed from utf-text */
685 u4 slot; /* slot in hashtable */
686 utf *u; /* hashtable element */
689 LOCK_MONITOR_ENTER(hashtable_utf->header);
691 #if defined(ENABLE_STATISTICS)
696 key = utf_hashkey(text, length);
697 slot = key & (hashtable_utf->size - 1);
698 u = hashtable_utf->ptr[slot];
700 /* search external hash chain for utf-symbol */
703 if (u->blength == length) {
704 /* compare text of hashtable elements */
706 for (i = 0; i < length; i++)
707 if (text[i] != u->text[i])
710 #if defined(ENABLE_STATISTICS)
712 count_utf_new_found++;
715 /* symbol found in hashtable */
717 LOCK_MONITOR_EXIT(hashtable_utf->header);
723 u = u->hashlink; /* next element in external chain */
726 /* location in hashtable found, create new utf element */
730 u->blength = length; /* length in bytes of utfstring */
731 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
732 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
734 memcpy(u->text, text, length); /* copy utf-text */
735 u->text[length] = '\0';
737 #if defined(ENABLE_STATISTICS)
739 count_utf_len += sizeof(utf) + length + 1;
742 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
743 hashtable_utf->entries++; /* update number of entries */
745 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
747 /* reorganization of hashtable, average length of the external
748 chains is approx. 2 */
750 hashtable *newhash; /* the new hashtable */
756 /* create new hashtable, double the size */
758 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
760 #if defined(ENABLE_STATISTICS)
762 count_utf_len += sizeof(utf*) * hashtable_utf->size;
765 /* transfer elements to new hashtable */
767 for (i = 0; i < hashtable_utf->size; i++) {
768 u = hashtable_utf->ptr[i];
772 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
774 u->hashlink = (utf *) newhash->ptr[slot];
775 newhash->ptr[slot] = u;
777 /* follow link in external hash chain */
783 /* dispose old table */
785 hashtable_free(hashtable_utf);
787 hashtable_utf = newhash;
790 LOCK_MONITOR_EXIT(hashtable_utf->header);
796 /* utf_new_u2 ******************************************************************
798 Make utf symbol from u2 array, if isclassname is true '.' is replaced by '/'.
801 *******************************************************************************/
803 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
805 char *buffer; /* memory buffer for unicode characters */
806 char *pos; /* pointer to current position in buffer */
807 u4 left; /* unicode characters left */
808 u4 buflength; /* utf length in bytes of the u2 array */
809 utf *result; /* resulting utf-string */
812 /* determine utf length in bytes and allocate memory */
814 buflength = u2_utflength(unicode_pos, unicode_length);
815 buffer = MNEW(char, buflength);
820 for (i = 0; i++ < unicode_length; unicode_pos++) {
821 /* next unicode character */
824 if ((c != 0) && (c < 0x80)) {
827 if ((int) left < 0) break;
828 /* convert classname */
829 if (isclassname && c == '.')
834 } else if (c < 0x800) {
836 unsigned char high = c >> 6;
837 unsigned char low = c & 0x3F;
839 if ((int) left < 0) break;
840 *pos++ = high | 0xC0;
846 char mid = (c >> 6) & 0x3F;
849 if ((int) left < 0) break;
850 *pos++ = high | 0xE0;
856 /* insert utf-string into symbol-table */
857 result = utf_new(buffer,buflength);
859 MFREE(buffer, char, buflength);
865 /* utf_new_char ****************************************************************
867 Creates a new utf symbol, the text for this symbol is passed as a
868 c-string ( = char* ).
870 *******************************************************************************/
872 utf *utf_new_char(const char *text)
874 return utf_new(text, strlen(text));
878 /* utf_new_char_classname ******************************************************
880 Creates a new utf symbol, the text for this symbol is passed as a
881 c-string ( = char* ) "." characters are going to be replaced by
882 "/". Since the above function is used often, this is a separte
883 function, instead of an if.
885 *******************************************************************************/
887 utf *utf_new_char_classname(const char *text)
889 if (strchr(text, '.')) {
890 char *txt = strdup(text);
891 char *end = txt + strlen(txt);
895 for (c = txt; c < end; c++)
896 if (*c == '.') *c = '/';
898 tmpRes = utf_new(txt, strlen(txt));
904 return utf_new(text, strlen(text));
908 /* utf_nextu2 ******************************************************************
910 Read the next unicode character from the utf string and increment
911 the utf-string pointer accordingly.
913 CAUTION: This function is unsafe for input that was not checked
916 *******************************************************************************/
918 u2 utf_nextu2(char **utf_ptr)
920 /* uncompressed unicode character */
922 /* current position in utf text */
923 unsigned char *utf = (unsigned char *) (*utf_ptr);
924 /* bytes representing the unicode character */
925 unsigned char ch1, ch2, ch3;
926 /* number of bytes used to represent the unicode character */
929 switch ((ch1 = utf[0]) >> 4) {
930 default: /* 1 byte */
934 case 0xD: /* 2 bytes */
935 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
936 unsigned char high = ch1 & 0x1F;
937 unsigned char low = ch2 & 0x3F;
938 unicode_char = (high << 6) + low;
943 case 0xE: /* 2 or 3 bytes */
944 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
945 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
946 unsigned char low = ch3 & 0x3f;
947 unsigned char mid = ch2 & 0x3f;
948 unsigned char high = ch1 & 0x0f;
949 unicode_char = (((high << 6) + mid) << 6) + low;
957 /* update position in utf-text */
958 *utf_ptr = (char *) (utf + len);
964 /* utf_bytes *******************************************************************
966 Determine number of bytes (aka. octets) in the utf string.
969 u............utf string
972 The number of octets of this utf string.
973 There is _no_ terminating zero included in this count.
975 *******************************************************************************/
983 /* utf_get_number_of_u2s_for_buffer ********************************************
985 Determine number of UTF-16 u2s in the given UTF-8 buffer
987 CAUTION: This function is unsafe for input that was not checked
990 CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
991 to an array of u2s (UTF-16) and want to know how many of them you will get.
992 All other uses of this function are probably wrong.
995 buffer........points to first char in buffer
996 blength.......number of _bytes_ in the buffer
999 the number of u2s needed to hold this string in UTF-16 encoding.
1000 There is _no_ terminating zero included in this count.
1002 NOTE: Unlike utf_get_number_of_u2s, this function never throws an
1005 *******************************************************************************/
1007 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
1009 const char *endpos; /* points behind utf string */
1010 const char *utf_ptr; /* current position in utf text */
1011 u4 len = 0; /* number of unicode characters */
1014 endpos = utf_ptr + blength;
1016 while (utf_ptr < endpos) {
1018 /* next unicode character */
1019 utf_nextu2((char **)&utf_ptr);
1022 assert(utf_ptr == endpos);
1028 /* utf_get_number_of_u2s *******************************************************
1030 Determine number of UTF-16 u2s in the utf string.
1032 CAUTION: This function is unsafe for input that was not checked
1035 CAUTION: Use this function *only* when you want to convert a utf string
1036 to an array of u2s and want to know how many of them you will get.
1037 All other uses of this function are probably wrong.
1040 u............utf string
1043 the number of u2s needed to hold this string in UTF-16 encoding.
1044 There is _no_ terminating zero included in this count.
1045 XXX 0 if a NullPointerException has been thrown (see below)
1047 *******************************************************************************/
1049 u4 utf_get_number_of_u2s(utf *u)
1051 char *endpos; /* points behind utf string */
1052 char *utf_ptr; /* current position in utf text */
1053 u4 len = 0; /* number of unicode characters */
1055 /* XXX this is probably not checked by most callers! Review this after */
1056 /* the invalid uses of this function have been eliminated */
1058 exceptions_throw_nullpointerexception();
1062 endpos = UTF_END(u);
1065 while (utf_ptr < endpos) {
1067 /* next unicode character */
1068 utf_nextu2(&utf_ptr);
1071 if (utf_ptr != endpos) {
1072 /* string ended abruptly */
1073 exceptions_throw_internalerror("Illegal utf8 string");
1081 /* utf8_safe_number_of_u2s *****************************************************
1083 Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1084 (For invalid UTF-8 the U+fffd replacement character will be counted.)
1086 This function is safe even for invalid UTF-8 strings.
1089 text..........zero-terminated(!) UTF-8 string (may be invalid)
1091 nbytes........strlen(text). (This is needed to completely emulate
1095 the number of u2s needed to hold this string in UTF-16 encoding.
1096 There is _no_ terminating zero included in this count.
1098 *******************************************************************************/
1100 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1101 register const unsigned char *t;
1104 register const unsigned char *tlimit;
1112 assert(nbytes >= 0);
1115 t = (const unsigned char *) text;
1116 tlimit = t + nbytes;
1118 /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1124 /* highest bit set, non-ASCII character */
1126 if ((byte & 0xe0) == 0xc0) {
1127 /* 2-byte: should be 110..... 10...... ? */
1129 if ((*t++ & 0xc0) == 0x80)
1130 ; /* valid 2-byte */
1134 else if ((byte & 0xf0) == 0xe0) {
1135 /* 3-byte: should be 1110.... 10...... 10...... */
1139 return len + 1; /* invalid, stop here */
1141 if ((*t++ & 0xc0) == 0x80) {
1142 if ((*t++ & 0xc0) == 0x80)
1143 ; /* valid 3-byte */
1150 else if ((byte & 0xf8) == 0xf0) {
1151 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1155 return len + 1; /* invalid, stop here */
1157 if (((byte1 = *t++) & 0xc0) == 0x80) {
1158 if (((byte2 = *t++) & 0xc0) == 0x80) {
1159 if (((byte3 = *t++) & 0xc0) == 0x80) {
1160 /* valid 4-byte UTF-8? */
1161 value = ((byte & 0x07) << 18)
1162 | ((byte1 & 0x3f) << 12)
1163 | ((byte2 & 0x3f) << 6)
1164 | ((byte3 & 0x3f) );
1166 if (value > 0x10FFFF)
1168 else if (value > 0xFFFF)
1169 len += 1; /* we need surrogates */
1171 ; /* 16bit suffice */
1182 else if ((byte & 0xfc) == 0xf8) {
1183 /* invalid 5-byte */
1185 return len + 1; /* invalid, stop here */
1188 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1191 else if ((byte & 0xfe) == 0xfc) {
1192 /* invalid 6-byte */
1194 return len + 1; /* invalid, stop here */
1197 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1209 /* ASCII character, common case */
1219 /* utf8_safe_convert_to_u2s ****************************************************
1221 Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1222 (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1223 Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1225 This function is safe even for invalid UTF-8 strings.
Decoded 4-byte sequences above 0xFFFF are stored as a surrogate pair
(two u2s); decoded values above 0x10FFFF are stored as one replacement
character.
1228 text..........zero-terminated(!) UTF-8 string (may be invalid)
1230 nbytes........strlen(text). (This is needed to completely emulate
1232 buffer........a preallocated array of u2s to receive the decoded
1233 string. Use utf8_safe_number_of_u2s to get the
1234 required number of u2s for allocating this.
1236 *******************************************************************************/
1238 #define UNICODE_REPLACEMENT 0xfffd
1240 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1241 register const unsigned char *t;
1243 register const unsigned char *tlimit;
1251 assert(nbytes >= 0);
1253 t = (const unsigned char *) text;
/* tlimit points one past the last input byte */
1254 tlimit = t + nbytes;
1256 /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1262 /* highest bit set, non-ASCII character */
1264 if ((byte & 0xe0) == 0xc0) {
1265 /* 2-byte: should be 110..... 10...... */
/* NOTE(review): no comparison against tlimit is visible before this
   read, unlike the 3- and 4-byte cases; presumably it relies on text
   being zero-terminated (0x00 is not a continuation byte) -- confirm. */
1267 if (((byte1 = *t++) & 0xc0) == 0x80) {
1268 /* valid 2-byte UTF-8 */
1269 *buffer++ = ((byte & 0x1f) << 6)
1270 | ((byte1 & 0x3f) );
1273 *buffer++ = UNICODE_REPLACEMENT;
1277 else if ((byte & 0xf0) == 0xe0) {
1278 /* 3-byte: should be 1110.... 10...... 10...... */
/* fewer than two bytes remain before tlimit: truncated sequence */
1280 if (t + 2 > tlimit) {
1281 *buffer++ = UNICODE_REPLACEMENT;
1285 if (((byte1 = *t++) & 0xc0) == 0x80) {
1286 if (((byte2 = *t++) & 0xc0) == 0x80) {
1287 /* valid 3-byte UTF-8 */
1288 *buffer++ = ((byte & 0x0f) << 12)
1289 | ((byte1 & 0x3f) << 6)
1290 | ((byte2 & 0x3f) );
1293 *buffer++ = UNICODE_REPLACEMENT;
1298 *buffer++ = UNICODE_REPLACEMENT;
1302 else if ((byte & 0xf8) == 0xf0) {
1303 /* 4-byte: should be 11110... 10...... 10...... 10...... */
/* fewer than three bytes remain before tlimit: truncated sequence */
1305 if (t + 3 > tlimit) {
1306 *buffer++ = UNICODE_REPLACEMENT;
1310 if (((byte1 = *t++) & 0xc0) == 0x80) {
1311 if (((byte2 = *t++) & 0xc0) == 0x80) {
1312 if (((byte3 = *t++) & 0xc0) == 0x80) {
1313 /* valid 4-byte UTF-8? */
/* 3 + 6 + 6 + 6 payload bits give a value of up to 21 bits */
1314 value = ((byte & 0x07) << 18)
1315 | ((byte1 & 0x3f) << 12)
1316 | ((byte2 & 0x3f) << 6)
1317 | ((byte3 & 0x3f) );
/* values above 0x10FFFF are outside the Unicode code point range */
1319 if (value > 0x10FFFF) {
1320 *buffer++ = UNICODE_REPLACEMENT;
1322 else if (value > 0xFFFF) {
1323 /* we need surrogates */
/* (value >> 10) - 0x40 equals (value - 0x10000) >> 10, i.e. the upper
   ten bits of the offset above 0x10000; the second u2 carries the
   lower ten bits of value */
1324 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1325 *buffer++ = 0xdc00 | (value & 0x03ff);
/* presumably the remaining case value <= 0xFFFF (an overlong 4-byte
   encoding), stored as a single u2 */
1328 *buffer++ = value; /* 16bit suffice */
1331 *buffer++ = UNICODE_REPLACEMENT;
1336 *buffer++ = UNICODE_REPLACEMENT;
1341 *buffer++ = UNICODE_REPLACEMENT;
/* lead byte 111110..: start of a 5-byte form, treated as invalid */
1345 else if ((byte & 0xfc) == 0xf8) {
1346 if (t + 4 > tlimit) {
1347 *buffer++ = UNICODE_REPLACEMENT;
/* loop runs while skip is non-zero and *t is a continuation byte
   (10......) */
1352 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1354 *buffer++ = UNICODE_REPLACEMENT;
/* lead byte 1111110.: start of a 6-byte form, treated as invalid */
1356 else if ((byte & 0xfe) == 0xfc) {
1357 if (t + 5 > tlimit) {
1358 *buffer++ = UNICODE_REPLACEMENT;
/* same continuation-byte skipping as in the 5-byte case */
1363 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1365 *buffer++ = UNICODE_REPLACEMENT;
/* presumably the catch-all for bytes with the high bit set that match
   none of the lead-byte patterns above -- confirm */
1368 *buffer++ = UNICODE_REPLACEMENT;
1376 /* ASCII character, common case */
1384 /* u2_utflength ****************************************************************
1386 Returns the utf length in bytes of a u2 array.
text.........the u2 array to measure
u2_length....number of u2 elements examined (the loop bound below)
1388 *******************************************************************************/
1390 u4 u2_utflength(u2 *text, u4 u2_length)
1392 u4 result_len = 0; /* utf length in bytes */
1393 u2 ch; /* current unicode character */
1396 for (len = 0; len < u2_length; len++) {
1397 /* next unicode character */
1400 /* determine bytes required to store unicode character as utf */
/* A zero ch fails the first test and is classified by the < 0x800
   test together with 0x80..0x7ff; presumably because zero is stored
   as the two bytes 0xc0 0x80 (cf. is_valid_name) -- confirm. */
1401 if (ch && (ch < 0x80))
1403 else if (ch < 0x800)
1413 /* utf_copy ********************************************************************
1415 Copy the given utf string byte-for-byte to a buffer.
The copy covers u->blength + 1 chars, i.e. the text plus its
terminating zero, so the buffer needs at least that much room.
1418 buffer.......the buffer
1419 u............the utf string
1421 *******************************************************************************/
1423 void utf_copy(char *buffer, utf *u)
1425 /* our utf strings are zero-terminated (done by utf_new) */
/* the "+ 1" includes that terminating zero in the copy */
1426 MCOPY(buffer, u->text, char, u->blength + 1);
1430 /* utf_cat *********************************************************************
1432 Append the given utf string byte-for-byte to a buffer.
The buffer must already hold a zero-terminated string (its end is
located with strlen) and have room for u->blength + 1 more chars.
1435 buffer.......the buffer
1436 u............the utf string
1438 *******************************************************************************/
1440 void utf_cat(char *buffer, utf *u)
1442 /* our utf strings are zero-terminated (done by utf_new) */
/* write starting at the buffer's current terminator; the "+ 1" copies
   the utf string's terminating zero as well */
1443 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1447 /* utf_copy_classname **********************************************************
1449 Copy the given utf classname byte-for-byte to a buffer.
1450 '/' is replaced by '.'
1453 buffer.......the buffer
1454 u............the utf string
1456 *******************************************************************************/
1458 void utf_copy_classname(char *buffer, utf *u)
/* endptr is one past the terminating zero, so the loop below also
   visits the terminator */
1467 endptr = UTF_END(u) + 1; /* utfs are zero-terminated by utf_new */
1469 while (srcptr != endptr) {
1478 /* utf_cat_classname ***********************************************************
1480 Append the given utf classname byte-for-byte to a buffer.
1481 '/' is replaced by '.'
The buffer must already hold a zero-terminated string; its end is
located with strlen.
1484 buffer.......the buffer
1485 u............the utf string
1487 *******************************************************************************/
1489 void utf_cat_classname(char *buffer, utf *u)
/* delegate to utf_copy_classname, writing at the buffer's terminator */
1491 utf_copy_classname(buffer + strlen(buffer), u);
1494 /* utf_display_printable_ascii *************************************************
1496 Write utf symbol to stdout (for debugging purposes).
1497 Non-printable and non-ASCII characters are printed as '?'.
u............the utf string to display
1499 *******************************************************************************/
1501 void utf_display_printable_ascii(utf *u)
1503 char *endpos; /* points behind utf string */
1504 char *utf_ptr; /* current position in utf text */
1512 endpos = UTF_END(u);
1515 while (utf_ptr < endpos) {
1516 /* read next unicode character */
/* utf_nextu2 is handed &utf_ptr; presumably it advances utf_ptr past
   the character it decoded -- confirm */
1518 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the upper bound 127 admits DEL, which is a control
   character rather than a printable one */
1520 if ((c >= 32) && (c <= 127))
1530 /* utf_display_printable_ascii_classname ***************************************
1532 Write utf symbol to stdout with `/' converted to `.' (for debugging
purposes).
1534 Non-printable and non-ASCII characters are printed as '?'.
u............the utf string to display
1536 *******************************************************************************/
1538 void utf_display_printable_ascii_classname(utf *u)
1540 char *endpos; /* points behind utf string */
1541 char *utf_ptr; /* current position in utf text */
1549 endpos = UTF_END(u);
1552 while (utf_ptr < endpos) {
1553 /* read next unicode character */
1555 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the upper bound 127 admits DEL, which is a control
   character rather than a printable one */
1560 if ((c >= 32) && (c <= 127))
1570 /* utf_sprint_convert_to_latin1 ************************************************
1572 Write utf symbol into c-string (for debugging purposes).
1573 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
invalid results (each decoded u2 is narrowed to a char when stored).
buffer.......receives the converted string; no length is passed, so
             the caller must supply a large enough buffer
u............the utf string
1576 *******************************************************************************/
1578 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1580 char *endpos; /* points behind utf string */
1581 char *utf_ptr; /* current position in utf text */
/* NOTE(review): pos is a u2; assuming u2 is a 16-bit unsigned type,
   the index wraps for strings of more than 65535 characters */
1582 u2 pos = 0; /* position in c-string */
/* presumably the result for a NULL u -- the guarding condition should
   be confirmed */
1585 strcpy(buffer, "NULL");
1589 endpos = UTF_END(u);
1592 while (utf_ptr < endpos)
1593 /* copy next unicode character */
1594 buffer[pos++] = utf_nextu2(&utf_ptr);
1596 /* terminate string */
1601 /* utf_sprint_convert_to_latin1_classname **************************************
1603 Write utf symbol into c-string with `/' converted to `.' (for debugging
purposes).
1605 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
invalid results (presumably as in utf_sprint_convert_to_latin1 -- TODO
confirm).
buffer.......receives the converted string; no length is passed, so
             the caller must supply a large enough buffer
u............the utf string
1608 *******************************************************************************/
1610 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1612 char *endpos; /* points behind utf string */
1613 char *utf_ptr; /* current position in utf text */
/* NOTE(review): pos is a u2; assuming u2 is a 16-bit unsigned type,
   the index wraps for strings of more than 65535 characters */
1614 u2 pos = 0; /* position in c-string */
/* presumably the result for a NULL u -- the guarding condition should
   be confirmed */
1617 strcpy(buffer, "NULL");
1621 endpos = UTF_END(u);
1624 while (utf_ptr < endpos) {
1625 /* copy next unicode character */
1626 u2 c = utf_nextu2(&utf_ptr);
/* package separator becomes a dot, as in Java source notation */
1627 if (c == '/') c = '.';
1631 /* terminate string */
1636 /* utf_strcat_convert_to_latin1 ************************************************
1638 Like libc strcat, but uses an utf8 string.
1639 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
invalid results (the conversion is done by utf_sprint_convert_to_latin1).
buffer.......zero-terminated c-string to append to (its end is located
             with strlen)
u............the utf string to append
1642 *******************************************************************************/
1644 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
/* write the converted string starting at the buffer's terminator */
1646 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1650 /* utf_strcat_convert_to_latin1_classname **************************************
1652 Like libc strcat, but uses an utf8 string.
1653 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
invalid results (the conversion is done by
utf_sprint_convert_to_latin1_classname, which also turns `/' into `.').
buffer.......zero-terminated c-string to append to (its end is located
             with strlen)
u............the utf string to append
1656 *******************************************************************************/
1658 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
/* write the converted string starting at the buffer's terminator */
1660 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1664 /* utf_fprint_printable_ascii **************************************************
1666 Write utf symbol into file.
1667 Non-printable and non-ASCII characters are printed as '?'.
file.........the stream written to with fprintf
u............the utf string
1669 *******************************************************************************/
1671 void utf_fprint_printable_ascii(FILE *file, utf *u)
1673 char *endpos; /* points behind utf string */
1674 char *utf_ptr; /* current position in utf text */
1679 endpos = UTF_END(u);
1682 while (utf_ptr < endpos) {
1683 /* read next unicode character */
1684 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the upper bound 127 admits DEL, which is a control
   character rather than a printable one */
1686 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1687 else fprintf(file, "?");
1692 /* utf_fprint_printable_ascii_classname ****************************************
1694 Write utf symbol into file with `/' converted to `.'.
1695 Non-printable and non-ASCII characters are printed as '?'.
file.........the stream written to with fprintf
u............the utf string
1697 *******************************************************************************/
1699 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1701 char *endpos; /* points behind utf string */
1702 char *utf_ptr; /* current position in utf text */
1707 endpos = UTF_END(u);
1710 while (utf_ptr < endpos) {
1711 /* read next unicode character */
1712 u2 c = utf_nextu2(&utf_ptr);
/* the separator is replaced before the printable-range test */
1713 if (c == '/') c = '.';
/* NOTE(review): the upper bound 127 admits DEL, which is a control
   character rather than a printable one */
1715 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1716 else fprintf(file, "?");
1721 /* is_valid_utf ****************************************************************
1723 Return true if the given string is a valid UTF-8 string.
The checks are Java-specific: a zero byte is rejected, sequences with
more than two continuation bytes are rejected, while overlong
encodings and surrogates are tolerated (see the notes in the body).
1725 utf_ptr...points to first character
1726 end_pos...points after last character
1728 *******************************************************************************/
1730 /* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */
1732 bool is_valid_utf(char *utf_ptr, char *end_pos)
/* an end before the start is treated as invalid */
1739 if (end_pos < utf_ptr) return false;
1740 bytes = end_pos - utf_ptr;
1744 if (!c) return false; /* 0x00 is not allowed */
1745 if ((c & 0x80) == 0) continue; /* ASCII */
/* len is the number of continuation bytes that must follow the lead
   byte c */
1747 if ((c & 0xe0) == 0xc0) len = 1; /* 110x xxxx */
1748 else if ((c & 0xf0) == 0xe0) len = 2; /* 1110 xxxx */
1749 else if ((c & 0xf8) == 0xf0) len = 3; /* 1111 0xxx */
1750 else if ((c & 0xfc) == 0xf8) len = 4; /* 1111 10xx */
1751 else if ((c & 0xfe) == 0xfc) len = 5; /* 1111 110x */
1752 else return false; /* invalid leading byte */
/* only 2- and 3-byte sequences get past this point */
1754 if (len > 2) return false; /* Java limitation */
/* 0x3f >> len keeps the payload bits of the lead byte: 0x1f for
   len == 1, 0x0f for len == 2 */
1756 v = (unsigned long)c & (0x3f >> len);
1758 if ((bytes -= len) < 0) return false; /* missing bytes */
/* runs len times, once per expected continuation byte */
1760 for (i = len; i--; ) {
1762 if ((c & 0xc0) != 0x80) /* 10xx xxxx */
/* append the six payload bits of the continuation byte */
1764 v = (v << 6) | (c & 0x3f);
/* presumably applies when the decoded value v is zero, which may only
   appear in the two-byte form 0xc0 0x80 -- TODO confirm the guarding
   condition */
1768 if (len != 1) return false; /* Java special */
1771 /* Sun Java seems to allow overlong UTF-8 encodings */
1773 /* if (v < min_codepoint[len]) */
1774 /* XXX throw exception? */
1777 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
1778 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
1780 /* even these seem to be allowed */
1781 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
1788 /* is_valid_name ***************************************************************
1790 Return true if the given string may be used as a class/field/method
1791 name. (Currently this only disallows empty strings and control
characters, including an encoded zero character.)
1794 NOTE: The string is assumed to have passed is_valid_utf!
1796 utf_ptr...points to first character
1797 end_pos...points after last character
1799 *******************************************************************************/
1801 bool is_valid_name(char *utf_ptr, char *end_pos)
1803 if (end_pos <= utf_ptr) return false; /* disallow empty names */
1805 while (utf_ptr < end_pos) {
1806 unsigned char c = *utf_ptr++;
1808 if (c < 0x20) return false; /* disallow control characters */
/* 0xc0 0x80 is the two-byte form of the zero character. The look-ahead
   read of *utf_ptr is not checked against end_pos; presumably this is
   safe because is_valid_utf has already rejected a lead byte with
   missing continuation bytes -- confirm. */
1809 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80) /* disallow zero */
/* is_valid_name_utf: apply is_valid_name to the bytes of the utf string u,
   from u->text up to UTF_END(u), and return its result. */
1816 bool is_valid_name_utf(utf *u)
1818 return is_valid_name(u->text, UTF_END(u));
1822 /* utf_show ********************************************************************
1824 Writes the utf symbols in the utfhash to stdout and displays the
1825 number of external hash chains grouped according to the chainlength
1826 (for debugging purposes).
Only compiled when NDEBUG is not defined. Reads the global
hashtable_utf (its size, entries and ptr fields).
1828 *******************************************************************************/
1830 #if !defined(NDEBUG)
1834 #define CHAIN_LIMIT 20 /* limit for separated enumeration */
1836 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1837 u4 max_chainlength = 0; /* maximum length of the chains */
1838 u4 sum_chainlength = 0; /* sum of the chainlengths */
1839 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1842 printf("UTF-HASH:\n");
1844 /* show element of utf-hashtable */
1846 for (i = 0; i < hashtable_utf->size; i++) {
1847 utf *u = hashtable_utf->ptr[i];
1850 printf("SLOT %d: ", (int) i);
1854 utf_display_printable_ascii(u);
1862 printf("UTF-HASH: %d slots for %d entries\n",
1863 (int) hashtable_utf->size, (int) hashtable_utf->entries );
/* entries is used as a divisor in the percentage output below;
   presumably the statistics are skipped for an empty table -- confirm */
1865 if (hashtable_utf->entries == 0)
1868 printf("chains:\n chainlength number of chains %% of utfstrings\n");
1870 for (i=0;i<CHAIN_LIMIT;i++)
1873 /* count numbers of hashchains according to their length */
1874 for (i=0; i<hashtable_utf->size; i++) {
1876 utf *u = (utf*) hashtable_utf->ptr[i];
1877 u4 chain_length = 0;
1879 /* determine chainlength */
1885 /* update sum of all chainlengths */
1886 sum_chainlength+=chain_length;
1888 /* determine the maximum length of the chains */
1889 if (chain_length>max_chainlength)
1890 max_chainlength = chain_length;
1892 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
/* NOTE(review): the test is >= CHAIN_LIMIT, while the comment above and
   the ">=" output row use CHAIN_LIMIT-1. A chain of exactly
   CHAIN_LIMIT-1 entries is counted in chain_count[CHAIN_LIMIT-1]
   without adding to beyond_limit, so that row's percentage omits it. */
1893 if (chain_length>=CHAIN_LIMIT) {
1894 beyond_limit+=chain_length;
/* clamp so that all long chains share the last counter */
1895 chain_length=CHAIN_LIMIT-1;
1898 /* update number of hashchains of current length */
1899 chain_count[chain_length]++;
1902 /* display results */
/* one row per chain length 1 .. CHAIN_LIMIT-2; the percentage is the
   share of all entries that sit in chains of that length */
1903 for (i=1;i<CHAIN_LIMIT-1;i++)
1904 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
/* final row: everything collected in the last counter */
1906 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1909 printf("max. chainlength:%5d\n",max_chainlength);
1911 /* avg. chainlength = sum of chainlengths / number of chains */
/* chain_count[0] counts slots whose chain length is zero, so the
   divisor is the number of non-empty slots */
1912 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1914 #endif /* !defined(NDEBUG) */
1918 * These are local overrides for various environment variables in Emacs.
1919 * Please do not remove this and leave it at the end of the file, where
1920 * Emacs will automagically detect them.
1921 * ---------------------------------------------------------------------
1924 * indent-tabs-mode: t
1928 * vim:noexpandtab:sw=4:ts=4: