1 /* src/vmcore/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
4 C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5 E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6 J. Wenninger, Institut f. Computersprachen - TU Wien
8 This file is part of CACAO.
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License as
12 published by the Free Software Foundation; either version 2, or (at
13 your option) any later version.
15 This program is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
35 #include "mm/memory.h"
37 #include "threads/lock-common.h"
39 #include "toolbox/hashtable.h"
41 #include "vm/exceptions.h"
43 #include "vmcore/options.h"
45 #if defined(ENABLE_STATISTICS)
46 # include "vmcore/statistics.h"
49 #include "vmcore/utf8.h"
52 /* global variables ***********************************************************/
54 /* hashsize must be a power of 2 */
56 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
58 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
61 /* utf-symbols for pointer comparison of frequently used strings **************/
63 utf *utf_java_lang_Object;
65 utf *utf_java_lang_Class;
66 utf *utf_java_lang_ClassLoader;
67 utf *utf_java_lang_Cloneable;
68 utf *utf_java_lang_SecurityManager;
69 utf *utf_java_lang_String;
70 utf *utf_java_lang_System;
71 utf *utf_java_lang_ThreadGroup;
72 utf *utf_java_lang_ref_SoftReference;
73 utf *utf_java_lang_ref_WeakReference;
74 utf *utf_java_lang_ref_PhantomReference;
75 utf *utf_java_io_Serializable;
77 utf *utf_java_lang_Throwable;
78 utf *utf_java_lang_Error;
80 utf *utf_java_lang_AbstractMethodError;
81 utf *utf_java_lang_ClassCircularityError;
82 utf *utf_java_lang_ClassFormatError;
83 utf *utf_java_lang_ExceptionInInitializerError;
84 utf *utf_java_lang_IncompatibleClassChangeError;
85 utf *utf_java_lang_InstantiationError;
86 utf *utf_java_lang_InternalError;
87 utf *utf_java_lang_LinkageError;
88 utf *utf_java_lang_NoClassDefFoundError;
89 utf *utf_java_lang_NoSuchFieldError;
90 utf *utf_java_lang_NoSuchMethodError;
91 utf *utf_java_lang_OutOfMemoryError;
92 utf *utf_java_lang_UnsatisfiedLinkError;
93 utf *utf_java_lang_UnsupportedClassVersionError;
94 utf *utf_java_lang_VerifyError;
95 utf *utf_java_lang_VirtualMachineError;
97 utf *utf_java_lang_Exception;
99 utf *utf_java_lang_ArithmeticException;
100 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
101 utf *utf_java_lang_ArrayStoreException;
102 utf *utf_java_lang_ClassCastException;
103 utf *utf_java_lang_ClassNotFoundException;
104 utf *utf_java_lang_CloneNotSupportedException;
105 utf *utf_java_lang_IllegalAccessException;
106 utf *utf_java_lang_IllegalArgumentException;
107 utf *utf_java_lang_IllegalMonitorStateException;
108 utf *utf_java_lang_InstantiationException;
109 utf *utf_java_lang_InterruptedException;
110 utf *utf_java_lang_NegativeArraySizeException;
111 utf *utf_java_lang_NullPointerException;
112 utf *utf_java_lang_StringIndexOutOfBoundsException;
114 utf *utf_java_lang_reflect_InvocationTargetException;
116 utf *utf_java_security_PrivilegedActionException;
118 #if defined(ENABLE_JAVASE)
119 utf* utf_java_lang_Void;
122 utf* utf_java_lang_Boolean;
123 utf* utf_java_lang_Byte;
124 utf* utf_java_lang_Character;
125 utf* utf_java_lang_Short;
126 utf* utf_java_lang_Integer;
127 utf* utf_java_lang_Long;
128 utf* utf_java_lang_Float;
129 utf* utf_java_lang_Double;
131 #if defined(ENABLE_JAVASE)
132 utf *utf_java_lang_StackTraceElement;
133 utf *utf_java_lang_reflect_Constructor;
134 utf *utf_java_lang_reflect_Field;
135 utf *utf_java_lang_reflect_Method;
136 utf *utf_java_util_Vector;
139 utf *utf_InnerClasses; /* InnerClasses */
140 utf *utf_ConstantValue; /* ConstantValue */
141 utf *utf_Code; /* Code */
142 utf *utf_Exceptions; /* Exceptions */
143 utf *utf_LineNumberTable; /* LineNumberTable */
144 utf *utf_SourceFile; /* SourceFile */
146 #if defined(ENABLE_JAVASE)
147 utf *utf_EnclosingMethod;
149 utf *utf_StackMapTable;
151 #if defined(ENABLE_ANNOTATIONS)
152 utf *utf_RuntimeVisibleAnnotations; /* RuntimeVisibleAnnotations */
153 utf *utf_RuntimeInvisibleAnnotations; /* RuntimeInvisibleAnnotations */
154 utf *utf_RuntimeVisibleParameterAnnotations; /* RuntimeVisibleParameterAnnotations */
155 utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
156 utf *utf_AnnotationDefault; /* AnnotationDefault */
160 utf *utf_init; /* <init> */
161 utf *utf_clinit; /* <clinit> */
162 utf *utf_clone; /* clone */
163 utf *utf_finalize; /* finalize */
164 utf *utf_run; /* run */
169 utf *utf_removeThread;
172 utf *utf_uncaughtException;
175 utf *utf_fillInStackTrace;
177 utf *utf_getSystemClassLoader;
180 utf *utf_loadClassInternal;
181 utf *utf_printStackTrace;
183 utf *utf_division_by_zero;
194 utf *utf_void__void; /* ()V */
195 utf *utf_boolean__void; /* (Z)V */
196 utf *utf_byte__void; /* (B)V */
197 utf *utf_char__void; /* (C)V */
198 utf *utf_short__void; /* (S)V */
199 utf *utf_int__void; /* (I)V */
200 utf *utf_long__void; /* (J)V */
201 utf *utf_float__void; /* (F)V */
202 utf *utf_double__void; /* (D)V */
204 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
205 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
206 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
207 utf *utf_java_lang_ClassLoader_java_lang_String__J;
208 utf *utf_java_lang_Exception__V; /* (Ljava/lang/Exception;)V */
209 utf *utf_java_lang_Object__java_lang_Object;
210 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
211 utf *utf_java_lang_String__java_lang_Class;
212 utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */
213 utf *utf_java_lang_Thread_java_lang_Throwable__V;
214 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
215 utf *utf_java_lang_Throwable__java_lang_Throwable;
217 utf *utf_not_named_yet; /* special name for unnamed classes */
219 utf *array_packagename;
222 /* utf_init ********************************************************************
224 Initializes the utf8 subsystem.
226 *******************************************************************************/
230 /* create utf8 hashtable */
232 hashtable_utf = NEW(hashtable);
234 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
236 #if defined(ENABLE_STATISTICS)
238 count_utf_len += sizeof(utf*) * hashtable_utf->size;
241 /* create utf-symbols for pointer comparison of frequently used strings */
243 utf_java_lang_Object = utf_new_char("java/lang/Object");
245 utf_java_lang_Class = utf_new_char("java/lang/Class");
246 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
247 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
248 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
249 utf_java_lang_String = utf_new_char("java/lang/String");
250 utf_java_lang_System = utf_new_char("java/lang/System");
251 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
253 utf_java_lang_ref_SoftReference =
254 utf_new_char("java/lang/ref/SoftReference");
256 utf_java_lang_ref_WeakReference =
257 utf_new_char("java/lang/ref/WeakReference");
259 utf_java_lang_ref_PhantomReference =
260 utf_new_char("java/lang/ref/PhantomReference");
262 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
264 utf_java_lang_Throwable = utf_new_char("java/lang/Throwable");
265 utf_java_lang_Error = utf_new_char("java/lang/Error");
267 utf_java_lang_ClassCircularityError =
268 utf_new_char("java/lang/ClassCircularityError");
270 utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
272 utf_java_lang_ExceptionInInitializerError =
273 utf_new_char("java/lang/ExceptionInInitializerError");
275 utf_java_lang_IncompatibleClassChangeError =
276 utf_new_char("java/lang/IncompatibleClassChangeError");
278 utf_java_lang_InstantiationError =
279 utf_new_char("java/lang/InstantiationError");
281 utf_java_lang_InternalError = utf_new_char("java/lang/InternalError");
282 utf_java_lang_LinkageError = utf_new_char("java/lang/LinkageError");
284 utf_java_lang_NoClassDefFoundError =
285 utf_new_char("java/lang/NoClassDefFoundError");
287 utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
289 utf_java_lang_UnsatisfiedLinkError =
290 utf_new_char("java/lang/UnsatisfiedLinkError");
292 utf_java_lang_UnsupportedClassVersionError =
293 utf_new_char("java/lang/UnsupportedClassVersionError");
295 utf_java_lang_VerifyError = utf_new_char("java/lang/VerifyError");
297 utf_java_lang_VirtualMachineError =
298 utf_new_char("java/lang/VirtualMachineError");
300 #if defined(ENABLE_JAVASE)
301 utf_java_lang_AbstractMethodError =
302 utf_new_char("java/lang/AbstractMethodError");
304 utf_java_lang_NoSuchFieldError =
305 utf_new_char("java/lang/NoSuchFieldError");
307 utf_java_lang_NoSuchMethodError =
308 utf_new_char("java/lang/NoSuchMethodError");
311 utf_java_lang_Exception = utf_new_char("java/lang/Exception");
313 utf_java_lang_ArithmeticException =
314 utf_new_char("java/lang/ArithmeticException");
316 utf_java_lang_ArrayIndexOutOfBoundsException =
317 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
319 utf_java_lang_ArrayStoreException =
320 utf_new_char("java/lang/ArrayStoreException");
322 utf_java_lang_ClassCastException =
323 utf_new_char("java/lang/ClassCastException");
325 utf_java_lang_ClassNotFoundException =
326 utf_new_char("java/lang/ClassNotFoundException");
328 utf_java_lang_CloneNotSupportedException =
329 utf_new_char("java/lang/CloneNotSupportedException");
331 utf_java_lang_IllegalAccessException =
332 utf_new_char("java/lang/IllegalAccessException");
334 utf_java_lang_IllegalArgumentException =
335 utf_new_char("java/lang/IllegalArgumentException");
337 utf_java_lang_IllegalMonitorStateException =
338 utf_new_char("java/lang/IllegalMonitorStateException");
340 utf_java_lang_InstantiationException =
341 utf_new_char("java/lang/InstantiationException");
343 utf_java_lang_InterruptedException =
344 utf_new_char("java/lang/InterruptedException");
346 utf_java_lang_NegativeArraySizeException =
347 utf_new_char("java/lang/NegativeArraySizeException");
349 utf_java_lang_NullPointerException =
350 utf_new_char("java/lang/NullPointerException");
352 utf_java_lang_StringIndexOutOfBoundsException =
353 utf_new_char("java/lang/StringIndexOutOfBoundsException");
355 utf_java_lang_reflect_InvocationTargetException =
356 utf_new_char("java/lang/reflect/InvocationTargetException");
358 utf_java_security_PrivilegedActionException =
359 utf_new_char("java/security/PrivilegedActionException");
361 #if defined(ENABLE_JAVASE)
362 utf_java_lang_Void = utf_new_char("java/lang/Void");
365 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
366 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
367 utf_java_lang_Character = utf_new_char("java/lang/Character");
368 utf_java_lang_Short = utf_new_char("java/lang/Short");
369 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
370 utf_java_lang_Long = utf_new_char("java/lang/Long");
371 utf_java_lang_Float = utf_new_char("java/lang/Float");
372 utf_java_lang_Double = utf_new_char("java/lang/Double");
374 #if defined(ENABLE_JAVASE)
375 utf_java_lang_StackTraceElement =
376 utf_new_char("java/lang/StackTraceElement");
378 utf_java_lang_reflect_Constructor =
379 utf_new_char("java/lang/reflect/Constructor");
381 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
382 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
383 utf_java_util_Vector = utf_new_char("java/util/Vector");
386 utf_InnerClasses = utf_new_char("InnerClasses");
387 utf_ConstantValue = utf_new_char("ConstantValue");
388 utf_Code = utf_new_char("Code");
389 utf_Exceptions = utf_new_char("Exceptions");
390 utf_LineNumberTable = utf_new_char("LineNumberTable");
391 utf_SourceFile = utf_new_char("SourceFile");
393 #if defined(ENABLE_JAVASE)
394 utf_EnclosingMethod = utf_new_char("EnclosingMethod");
395 utf_Signature = utf_new_char("Signature");
396 utf_StackMapTable = utf_new_char("StackMapTable");
398 #if defined(ENABLE_ANNOTATIONS)
399 utf_RuntimeVisibleAnnotations = utf_new_char("RuntimeVisibleAnnotations");
400 utf_RuntimeInvisibleAnnotations = utf_new_char("RuntimeInvisibleAnnotations");
401 utf_RuntimeVisibleParameterAnnotations = utf_new_char("RuntimeVisibleParameterAnnotations");
402 utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
403 utf_AnnotationDefault = utf_new_char("AnnotationDefault");
407 utf_init = utf_new_char("<init>");
408 utf_clinit = utf_new_char("<clinit>");
409 utf_clone = utf_new_char("clone");
410 utf_finalize = utf_new_char("finalize");
411 utf_run = utf_new_char("run");
413 utf_add = utf_new_char("add");
414 utf_remove = utf_new_char("remove");
415 utf_addThread = utf_new_char("addThread");
416 utf_removeThread = utf_new_char("removeThread");
417 utf_put = utf_new_char("put");
418 utf_get = utf_new_char("get");
419 utf_uncaughtException = utf_new_char("uncaughtException");
420 utf_value = utf_new_char("value");
422 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
423 utf_findNative = utf_new_char("findNative");
424 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
425 utf_initCause = utf_new_char("initCause");
426 utf_loadClass = utf_new_char("loadClass");
427 utf_loadClassInternal = utf_new_char("loadClassInternal");
428 utf_printStackTrace = utf_new_char("printStackTrace");
430 utf_division_by_zero = utf_new_char("/ by zero");
432 utf_Z = utf_new_char("Z");
433 utf_B = utf_new_char("B");
434 utf_C = utf_new_char("C");
435 utf_S = utf_new_char("S");
436 utf_I = utf_new_char("I");
437 utf_J = utf_new_char("J");
438 utf_F = utf_new_char("F");
439 utf_D = utf_new_char("D");
441 utf_void__void = utf_new_char("()V");
442 utf_boolean__void = utf_new_char("(Z)V");
443 utf_byte__void = utf_new_char("(B)V");
444 utf_char__void = utf_new_char("(C)V");
445 utf_short__void = utf_new_char("(S)V");
446 utf_int__void = utf_new_char("(I)V");
447 utf_long__void = utf_new_char("(J)V");
448 utf_float__void = utf_new_char("(F)V");
449 utf_double__void = utf_new_char("(D)V");
450 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
451 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
453 utf_void__java_lang_ClassLoader =
454 utf_new_char("()Ljava/lang/ClassLoader;");
456 utf_java_lang_ClassLoader_java_lang_String__J =
457 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
459 utf_java_lang_Exception__V = utf_new_char("(Ljava/lang/Exception;)V");
461 utf_java_lang_Object__java_lang_Object =
462 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
464 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
466 utf_java_lang_String__java_lang_Class =
467 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
469 utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V");
471 utf_java_lang_Thread_java_lang_Throwable__V =
472 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
474 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
476 utf_java_lang_Throwable__java_lang_Throwable =
477 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
479 utf_null = utf_new_char("null");
480 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
481 array_packagename = utf_new_char("\t<the array package>");
483 /* everything's ok */
489 /* utf_hashkey *****************************************************************
491 The hashkey is computed from the utf-text by using up to 8
492 characters. For utf-symbols longer than 15 characters 3 characters
493 are taken from the beginning and the end, 2 characters are taken from the middle.
496 *******************************************************************************/
498 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
499 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
501 u4 utf_hashkey(const char *text, u4 length)
503 const char *start_pos = text; /* pointer to utf text */
507 case 0: /* empty string */
510 case 1: return fbs(0);
511 case 2: return fbs(0) ^ nbs(3);
512 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
513 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
514 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
515 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
516 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
517 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
524 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
533 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
542 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
554 return a ^ nbs(9) ^ nbs(10);
566 return a ^ nbs(9) ^ nbs(10);
577 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
588 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
590 default: /* 3 characters from beginning */
596 /* 2 characters from middle */
597 text = start_pos + (length / 2);
602 /* 3 characters from end */
603 text = start_pos + length - 4;
608 return a ^ nbs(10) ^ nbs(11);
612 /* utf_full_hashkey ************************************************************
614 This function computes a hash value using all bytes in the string.
616 The algorithm is the "One-at-a-time" algorithm as published
617 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
619 *******************************************************************************/
621 u4 utf_full_hashkey(const char *text, u4 length)
623 register const unsigned char *p = (const unsigned char *) text;
631 hash += (hash << 10);
635 hash ^= (hash >> 11);
636 hash += (hash << 15);
641 /* unicode_hashkey *************************************************************
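643 Compute the hashkey of a unicode string. The u2 array is passed to utf_hashkey as raw bytes with len used as the byte count, so only the first len bytes of the array take part in the hash.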
645 *******************************************************************************/
647 u4 unicode_hashkey(u2 *text, u2 len)
649 return utf_hashkey((char *) text, len);
653 /* utf_new *********************************************************************
655 Creates a new utf-symbol, the text of the symbol is passed as a
656 u1-array. The function searches the utf-hashtable for a utf-symbol
657 with this text. On success the element is returned, otherwise a new
658 hashtable element is created.
660 If the number of entries in the hashtable exceeds twice the size of
661 the hashtable slots a reorganization of the hashtable is done and
662 the utf symbols are copied to a new hashtable with doubled size.
664 *******************************************************************************/
666 utf *utf_new(const char *text, u2 length)
668 u4 key; /* hashkey computed from utf-text */
669 u4 slot; /* slot in hashtable */
670 utf *u; /* hashtable element */
673 LOCK_MONITOR_ENTER(hashtable_utf->header);
675 #if defined(ENABLE_STATISTICS)
680 key = utf_hashkey(text, length);
681 slot = key & (hashtable_utf->size - 1);
682 u = hashtable_utf->ptr[slot];
684 /* search external hash chain for utf-symbol */
687 if (u->blength == length) {
688 /* compare text of hashtable elements */
690 for (i = 0; i < length; i++)
691 if (text[i] != u->text[i])
694 #if defined(ENABLE_STATISTICS)
696 count_utf_new_found++;
699 /* symbol found in hashtable */
701 LOCK_MONITOR_EXIT(hashtable_utf->header);
707 u = u->hashlink; /* next element in external chain */
710 /* location in hashtable found, create new utf element */
714 u->blength = length; /* length in bytes of utfstring */
715 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
716 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
718 memcpy(u->text, text, length); /* copy utf-text */
719 u->text[length] = '\0';
721 #if defined(ENABLE_STATISTICS)
723 count_utf_len += sizeof(utf) + length + 1;
726 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
727 hashtable_utf->entries++; /* update number of entries */
729 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
731 /* reorganization of hashtable, average length of the external
732 chains is approx. 2 */
734 hashtable *newhash; /* the new hashtable */
740 /* create new hashtable, double the size */
742 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
744 #if defined(ENABLE_STATISTICS)
746 count_utf_len += sizeof(utf*) * hashtable_utf->size;
749 /* transfer elements to new hashtable */
751 for (i = 0; i < hashtable_utf->size; i++) {
752 u = hashtable_utf->ptr[i];
756 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
758 u->hashlink = (utf *) newhash->ptr[slot];
759 newhash->ptr[slot] = u;
761 /* follow link in external hash chain */
767 /* dispose old table */
769 hashtable_free(hashtable_utf);
771 hashtable_utf = newhash;
774 LOCK_MONITOR_EXIT(hashtable_utf->header);
780 /* utf_new_u2 ******************************************************************
782 Make utf symbol from u2 array, if isclassname is true '.' is replaced by '/'.
785 *******************************************************************************/
787 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
789 char *buffer; /* memory buffer for unicode characters */
790 char *pos; /* pointer to current position in buffer */
791 u4 left; /* unicode characters left */
792 u4 buflength; /* utf length in bytes of the u2 array */
793 utf *result; /* resulting utf-string */
796 /* determine utf length in bytes and allocate memory */
798 buflength = u2_utflength(unicode_pos, unicode_length);
799 buffer = MNEW(char, buflength);
804 for (i = 0; i++ < unicode_length; unicode_pos++) {
805 /* next unicode character */
808 if ((c != 0) && (c < 0x80)) {
811 if ((int) left < 0) break;
812 /* convert classname */
813 if (isclassname && c == '.')
818 } else if (c < 0x800) {
820 unsigned char high = c >> 6;
821 unsigned char low = c & 0x3F;
823 if ((int) left < 0) break;
824 *pos++ = high | 0xC0;
830 char mid = (c >> 6) & 0x3F;
833 if ((int) left < 0) break;
834 *pos++ = high | 0xE0;
840 /* insert utf-string into symbol-table */
841 result = utf_new(buffer,buflength);
843 MFREE(buffer, char, buflength);
849 /* utf_new_char ****************************************************************
851 Creates a new utf symbol, the text for this symbol is passed as a
852 c-string ( = char* ).
854 *******************************************************************************/
856 utf *utf_new_char(const char *text)
858 return utf_new(text, strlen(text));
862 /* utf_new_char_classname ******************************************************
864 Creates a new utf symbol, the text for this symbol is passed as a
865 c-string ( = char* ) "." characters are going to be replaced by
866 "/". Since the above function is used often, this is a separate
867 function, instead of an if.
869 *******************************************************************************/
871 utf *utf_new_char_classname(const char *text)
873 if (strchr(text, '.')) {
874 char *txt = strdup(text);
875 char *end = txt + strlen(txt);
879 for (c = txt; c < end; c++)
880 if (*c == '.') *c = '/';
882 tmpRes = utf_new(txt, strlen(txt));
888 return utf_new(text, strlen(text));
892 /* utf_nextu2 ******************************************************************
894 Read the next unicode character from the utf string and increment
895 the utf-string pointer accordingly.
897 CAUTION: This function is unsafe for input that was not checked for valid UTF-8.
900 *******************************************************************************/
902 u2 utf_nextu2(char **utf_ptr)
904 /* uncompressed unicode character */
906 /* current position in utf text */
907 unsigned char *utf = (unsigned char *) (*utf_ptr);
908 /* bytes representing the unicode character */
909 unsigned char ch1, ch2, ch3;
910 /* number of bytes used to represent the unicode character */
913 switch ((ch1 = utf[0]) >> 4) {
914 default: /* 1 byte */
918 case 0xD: /* 2 bytes */
919 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
920 unsigned char high = ch1 & 0x1F;
921 unsigned char low = ch2 & 0x3F;
922 unicode_char = (high << 6) + low;
927 case 0xE: /* 2 or 3 bytes */
928 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
929 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
930 unsigned char low = ch3 & 0x3f;
931 unsigned char mid = ch2 & 0x3f;
932 unsigned char high = ch1 & 0x0f;
933 unicode_char = (((high << 6) + mid) << 6) + low;
941 /* update position in utf-text */
942 *utf_ptr = (char *) (utf + len);
948 /* utf_bytes *******************************************************************
950 Determine number of bytes (aka. octets) in the utf string.
953 u............utf string
956 The number of octets of this utf string.
957 There is _no_ terminating zero included in this count.
959 *******************************************************************************/
967 /* utf_get_number_of_u2s_for_buffer ********************************************
969 Determine number of UTF-16 u2s in the given UTF-8 buffer
971 CAUTION: This function is unsafe for input that was not checked for valid UTF-8.
974 CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
975 to an array of u2s (UTF-16) and want to know how many of them you will get.
976 All other uses of this function are probably wrong.
979 buffer........points to first char in buffer
980 blength.......number of _bytes_ in the buffer
983 the number of u2s needed to hold this string in UTF-16 encoding.
984 There is _no_ terminating zero included in this count.
986 NOTE: Unlike utf_get_number_of_u2s, this function never throws an exception.
989 *******************************************************************************/
991 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
993 const char *endpos; /* points behind utf string */
994 const char *utf_ptr; /* current position in utf text */
995 u4 len = 0; /* number of unicode characters */
998 endpos = utf_ptr + blength;
1000 while (utf_ptr < endpos) {
1002 /* next unicode character */
1003 utf_nextu2((char **)&utf_ptr);
1006 assert(utf_ptr == endpos);
1012 /* utf_get_number_of_u2s *******************************************************
1014 Determine number of UTF-16 u2s in the utf string.
1016 CAUTION: This function is unsafe for input that was not checked for valid UTF-8.
1019 CAUTION: Use this function *only* when you want to convert a utf string
1020 to an array of u2s and want to know how many of them you will get.
1021 All other uses of this function are probably wrong.
1024 u............utf string
1027 the number of u2s needed to hold this string in UTF-16 encoding.
1028 There is _no_ terminating zero included in this count.
1029 XXX 0 if a NullPointerException has been thrown (see below)
1031 *******************************************************************************/
1033 u4 utf_get_number_of_u2s(utf *u)
1035 char *endpos; /* points behind utf string */
1036 char *utf_ptr; /* current position in utf text */
1037 u4 len = 0; /* number of unicode characters */
1039 /* XXX this is probably not checked by most callers! Review this after */
1040 /* the invalid uses of this function have been eliminated */
1042 exceptions_throw_nullpointerexception();
1046 endpos = UTF_END(u);
1049 while (utf_ptr < endpos) {
1051 /* next unicode character */
1052 utf_nextu2(&utf_ptr);
1055 if (utf_ptr != endpos) {
1056 /* string ended abruptly */
1057 exceptions_throw_internalerror("Illegal utf8 string");
1065 /* utf8_safe_number_of_u2s *****************************************************
1067 Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1068 (For invalid UTF-8 the U+fffd replacement character will be counted.)
1070 This function is safe even for invalid UTF-8 strings.
1073 text..........zero-terminated(!) UTF-8 string (may be invalid)
1075 nbytes........strlen(text). (This is needed to completely emulate
1079 the number of u2s needed to hold this string in UTF-16 encoding.
1080 There is _no_ terminating zero included in this count.
1082 *******************************************************************************/
1084 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1085 register const unsigned char *t;
1088 register const unsigned char *tlimit;
1096 assert(nbytes >= 0);
1099 t = (const unsigned char *) text;
1100 tlimit = t + nbytes;
1102 /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1108 /* highest bit set, non-ASCII character */
1110 if ((byte & 0xe0) == 0xc0) {
1111 /* 2-byte: should be 110..... 10...... ? */
1113 if ((*t++ & 0xc0) == 0x80)
1114 ; /* valid 2-byte */
1118 else if ((byte & 0xf0) == 0xe0) {
1119 /* 3-byte: should be 1110.... 10...... 10...... */
1123 return len + 1; /* invalid, stop here */
1125 if ((*t++ & 0xc0) == 0x80) {
1126 if ((*t++ & 0xc0) == 0x80)
1127 ; /* valid 3-byte */
1134 else if ((byte & 0xf8) == 0xf0) {
1135 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1139 return len + 1; /* invalid, stop here */
1141 if (((byte1 = *t++) & 0xc0) == 0x80) {
1142 if (((byte2 = *t++) & 0xc0) == 0x80) {
1143 if (((byte3 = *t++) & 0xc0) == 0x80) {
1144 /* valid 4-byte UTF-8? */
1145 value = ((byte & 0x07) << 18)
1146 | ((byte1 & 0x3f) << 12)
1147 | ((byte2 & 0x3f) << 6)
1148 | ((byte3 & 0x3f) );
1150 if (value > 0x10FFFF)
1152 else if (value > 0xFFFF)
1153 len += 1; /* we need surrogates */
1155 ; /* 16bit suffice */
1166 else if ((byte & 0xfc) == 0xf8) {
1167 /* invalid 5-byte */
1169 return len + 1; /* invalid, stop here */
1172 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1175 else if ((byte & 0xfe) == 0xfc) {
1176 /* invalid 6-byte */
1178 return len + 1; /* invalid, stop here */
1181 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1193 /* ASCII character, common case */
1203 /* utf8_safe_convert_to_u2s ****************************************************
1205 Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1206 (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1207 Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1209 This function is safe even for invalid UTF-8 strings.

Decoding rules, as far as the branches below show:
  - 2- and 3-byte sequences with well-formed continuation bytes yield
    one u2;
  - a 4-byte sequence yields U+fffd if its value exceeds 0x10FFFF, a
    surrogate pair if it exceeds 0xFFFF, and a single u2 otherwise;
  - 5- and 6-byte lead bytes always yield U+fffd;
  - a 3- to 6-byte sequence that would extend past text + nbytes
    yields U+fffd.

The function returns nothing; all output is written through buffer.

1212 text..........zero-terminated(!) UTF-8 string (may be invalid)
1214 nbytes........strlen(text). (This is needed to completely emulate
the reference behaviour -- wording presumed, TODO confirm.)
Must be >= 0 (asserted below).
1216 buffer........a preallocated array of u2s to receive the decoded
1217 string. Use utf8_safe_number_of_u2s to get the
1218 required number of u2s for allocating this.
1220 *******************************************************************************/
/* U+fffd, written for every sequence the decoder rejects */
1222 #define UNICODE_REPLACEMENT 0xfffd
1224 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1225 register const unsigned char *t;
1227 register const unsigned char *tlimit;
1235 assert(nbytes >= 0);
/* work on unsigned bytes so that the bit tests below are well defined */
1237 t = (const unsigned char *) text;
/* one past the last input byte; used by the truncation checks below */
1238 tlimit = t + nbytes;
1240 /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1246 /* highest bit set, non-ASCII character */
1248 if ((byte & 0xe0) == 0xc0) {
1249 /* 2-byte: should be 110..... 10...... */
1251 if (((byte1 = *t++) & 0xc0) == 0x80) {
1252 /* valid 2-byte UTF-8 */
1253 *buffer++ = ((byte & 0x1f) << 6)
1254 | ((byte1 & 0x3f) );
1257 *buffer++ = UNICODE_REPLACEMENT;
1261 else if ((byte & 0xf0) == 0xe0) {
1262 /* 3-byte: should be 1110.... 10...... 10...... */
/* fewer than two bytes left for the continuation bytes */
1264 if (t + 2 > tlimit) {
1265 *buffer++ = UNICODE_REPLACEMENT;
1269 if (((byte1 = *t++) & 0xc0) == 0x80) {
1270 if (((byte2 = *t++) & 0xc0) == 0x80) {
1271 /* valid 3-byte UTF-8 */
1272 *buffer++ = ((byte & 0x0f) << 12)
1273 | ((byte1 & 0x3f) << 6)
1274 | ((byte2 & 0x3f) );
1277 *buffer++ = UNICODE_REPLACEMENT;
1282 *buffer++ = UNICODE_REPLACEMENT;
1286 else if ((byte & 0xf8) == 0xf0) {
1287 /* 4-byte: should be 11110... 10...... 10...... 10...... */
/* fewer than three bytes left for the continuation bytes */
1289 if (t + 3 > tlimit) {
1290 *buffer++ = UNICODE_REPLACEMENT;
1294 if (((byte1 = *t++) & 0xc0) == 0x80) {
1295 if (((byte2 = *t++) & 0xc0) == 0x80) {
1296 if (((byte3 = *t++) & 0xc0) == 0x80) {
1297 /* valid 4-byte UTF-8? */
/* assemble the value from the 3 + 6 + 6 + 6 payload bits */
1298 value = ((byte & 0x07) << 18)
1299 | ((byte1 & 0x3f) << 12)
1300 | ((byte2 & 0x3f) << 6)
1301 | ((byte3 & 0x3f) );
/* beyond the last Unicode code point: not representable */
1303 if (value > 0x10FFFF) {
1304 *buffer++ = UNICODE_REPLACEMENT;
1306 else if (value > 0xFFFF) {
1307 /* we need surrogates */
/* (value >> 10) - 0x40 equals (value - 0x10000) >> 10, i.e. the */
/* upper ten bits of the offset above 0xFFFF; the low surrogate  */
/* carries the lower ten bits                                    */
1308 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1309 *buffer++ = 0xdc00 | (value & 0x03ff);
/* value <= 0xFFFF although encoded in four bytes: stored as-is */
1312 *buffer++ = value; /* 16bit suffice */
1315 *buffer++ = UNICODE_REPLACEMENT;
1320 *buffer++ = UNICODE_REPLACEMENT;
1325 *buffer++ = UNICODE_REPLACEMENT;
/* 5-byte lead byte (111110..): every visible path writes U+fffd */
1329 else if ((byte & 0xfc) == 0xf8) {
1330 if (t + 4 > tlimit) {
1331 *buffer++ = UNICODE_REPLACEMENT;
/* runs while continuation bytes (10......) follow, at most `skip' times; */
/* presumably steps t over them -- loop body not visible, TODO confirm   */
1336 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1338 *buffer++ = UNICODE_REPLACEMENT;
/* 6-byte lead byte (1111110.): handled like the 5-byte case */
1340 else if ((byte & 0xfe) == 0xfc) {
1341 if (t + 5 > tlimit) {
1342 *buffer++ = UNICODE_REPLACEMENT;
1347 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1349 *buffer++ = UNICODE_REPLACEMENT;
1352 *buffer++ = UNICODE_REPLACEMENT;
1360 /* ASCII character, common case */
1368 /* u2_utflength ****************************************************************
1370 Returns the utf length in bytes of a u2 array.

text.........array of u2 code units to measure
u2_length....number of u2s in text

Each code unit is classified by value (see the tests in the loop). A
zero code unit fails the first test because of the `ch &&' guard and
therefore falls into the `ch < 0x800' class rather than the ASCII
class.
1372 *******************************************************************************/
1374 u4 u2_utflength(u2 *text, u4 u2_length)
1376 u4 result_len = 0; /* utf length in bytes */
1377 u2 ch; /* current unicode character */
1380 for (len = 0; len < u2_length; len++) {
1381 /* next unicode character */
1384 /* determine bytes required to store unicode character as utf */
/* zero is excluded from the ASCII class by the `ch &&' test */
1385 if (ch && (ch < 0x80))
1387 else if (ch < 0x800)
1397 /* utf_copy ********************************************************************
1399 Copy the given utf string byte-for-byte to a buffer.
The terminating zero byte is copied as well (u->blength + 1 bytes in
total). No bounds check is made, so the caller must provide a buffer
of at least that size.
1402 buffer.......the buffer
1403 u............the utf string
1405 *******************************************************************************/
1407 void utf_copy(char *buffer, utf *u)
1409 /* our utf strings are zero-terminated (done by utf_new) */
/* the `+ 1' includes that terminator in the copy */
1410 MCOPY(buffer, u->text, char, u->blength + 1);
1414 /* utf_cat *********************************************************************
1416 Append the given utf string byte-for-byte to a buffer.
The buffer must already hold a zero-terminated string, because the
append position is found with strlen(buffer). The terminating zero of
the utf string is copied too. No bounds check is made.
1419 buffer.......the buffer
1420 u............the utf string
1422 *******************************************************************************/
1424 void utf_cat(char *buffer, utf *u)
1426 /* our utf strings are zero-terminated (done by utf_new) */
/* copy starts at the current terminator of buffer and includes the new one */
1427 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1431 /* utf_copy_classname **********************************************************
1433 Copy the given utf classname byte-for-byte to a buffer.
1434 '/' is replaced by '.'
The loop below runs up to and including the terminating zero byte of
the utf string (see the `+ 1' on endptr).
1437 buffer.......the buffer
1438 u............the utf string
1440 *******************************************************************************/
1442 void utf_copy_classname(char *buffer, utf *u)
1451 endptr = UTF_END(u) + 1; /* utfs are zero-terminated by utf_new */
/* the `+ 1' makes the loop visit the terminating zero as well */
1453 while (srcptr != endptr) {
1462 /* utf_cat_classname ***********************************************************
1464 Append the given utf classname byte-for-byte to a buffer.
1465 '/' is replaced by '.'
The buffer must already hold a zero-terminated string, because the
append position is found with strlen(buffer).
1468 buffer.......the buffer
1469 u............the utf string
1471 *******************************************************************************/
1473 void utf_cat_classname(char *buffer, utf *u)
/* delegate to utf_copy_classname, writing at the current end of buffer */
1475 utf_copy_classname(buffer + strlen(buffer), u);
1478 /* utf_display_printable_ascii *************************************************
1480 Write utf symbol to stdout (for debugging purposes).
1481 Non-printable and non-ASCII characters are printed as '?'.
u............the utf string to print
1483 *******************************************************************************/
1485 void utf_display_printable_ascii(utf *u)
1487 char *endpos; /* points behind utf string */
1488 char *utf_ptr; /* current position in utf text */
1496 endpos = UTF_END(u);
1499 while (utf_ptr < endpos) {
1500 /* read next unicode character */
1502 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the upper bound admits 127 (DEL), which is not a */
/* printable character -- confirm whether `< 127' was intended    */
1504 if ((c >= 32) && (c <= 127))
1514 /* utf_display_printable_ascii_classname ***************************************
1516 Write utf symbol to stdout with `/' converted to `.' (for debugging
purposes).
1518 Non-printable and non-ASCII characters are printed as '?'.
u............the utf string to print
1520 *******************************************************************************/
1522 void utf_display_printable_ascii_classname(utf *u)
1524 char *endpos; /* points behind utf string */
1525 char *utf_ptr; /* current position in utf text */
1533 endpos = UTF_END(u);
1536 while (utf_ptr < endpos) {
1537 /* read next unicode character */
1539 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the upper bound admits 127 (DEL), which is not a */
/* printable character -- confirm whether `< 127' was intended    */
1544 if ((c >= 32) && (c <= 127))
1554 /* utf_sprint_convert_to_latin1 ************************************************
1556 Write utf symbol into c-string (for debugging purposes).
1557 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
invalid results (each decoded u2 is simply stored into a char).
No bounds check is made on buffer; the caller must size it.
buffer.......receives the converted, zero-terminated string
u............the utf string
1560 *******************************************************************************/
1562 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1564 char *endpos; /* points behind utf string */
1565 char *utf_ptr; /* current position in utf text */
/* NOTE(review): pos is a u2 -- presumably 16 bits, so the index would */
/* wrap for very long strings; confirm the expected maximum length    */
1566 u2 pos = 0; /* position in c-string */
/* the guard for this line is not visible here; presumably taken when */
/* u is NULL -- TODO confirm                                          */
1569 strcpy(buffer, "NULL");
1573 endpos = UTF_END(u);
1576 while (utf_ptr < endpos)
1577 /* copy next unicode character */
1578 buffer[pos++] = utf_nextu2(&utf_ptr);
1580 /* terminate string */
1585 /* utf_sprint_convert_to_latin1_classname **************************************
1587 Write utf symbol into c-string with `/' converted to `.' (for debugging
purposes).
1589 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
invalid results.
No bounds check on buffer is visible here; the caller must size it.
buffer.......receives the converted, zero-terminated string
u............the utf string
1592 *******************************************************************************/
1594 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1596 char *endpos; /* points behind utf string */
1597 char *utf_ptr; /* current position in utf text */
/* NOTE(review): pos is a u2 -- presumably 16 bits, so the index would */
/* wrap for very long strings; confirm the expected maximum length    */
1598 u2 pos = 0; /* position in c-string */
/* the guard for this line is not visible here; presumably taken when */
/* u is NULL -- TODO confirm                                          */
1601 strcpy(buffer, "NULL");
1605 endpos = UTF_END(u);
1608 while (utf_ptr < endpos) {
1609 /* copy next unicode character */
1610 u2 c = utf_nextu2(&utf_ptr);
/* package separator of the internal name becomes a dot */
1611 if (c == '/') c = '.';
1615 /* terminate string */
1620 /* utf_strcat_convert_to_latin1 ************************************************
1622 Like libc strcat, but uses an utf8 string.
1623 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
invalid results.
buffer.......zero-terminated c-string to append to (its end is found
             with strlen)
u............the utf string to append
1626 *******************************************************************************/
1628 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
/* write the converted string starting at the current terminator of buffer */
1630 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1634 /* utf_strcat_convert_to_latin1_classname **************************************
1636 Like libc strcat, but uses an utf8 string.
1637 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
invalid results.
The conversion is delegated to utf_sprint_convert_to_latin1_classname.
buffer.......zero-terminated c-string to append to (its end is found
             with strlen)
u............the utf string to append
1640 *******************************************************************************/
1642 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
/* write the converted string starting at the current terminator of buffer */
1644 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1648 /* utf_fprint_printable_ascii **************************************************
1650 Write utf symbol into file.
1651 Non-printable and non-ASCII characters are printed as '?'.
file.........the stream to write to
u............the utf string to print
1653 *******************************************************************************/
1655 void utf_fprint_printable_ascii(FILE *file, utf *u)
1657 char *endpos; /* points behind utf string */
1658 char *utf_ptr; /* current position in utf text */
1663 endpos = UTF_END(u);
1666 while (utf_ptr < endpos) {
1667 /* read next unicode character */
1668 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the upper bound admits 127 (DEL), which is not a */
/* printable character -- confirm whether `< 127' was intended    */
1670 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1671 else fprintf(file, "?");
1676 /* utf_fprint_printable_ascii_classname ****************************************
1678 Write utf symbol into file with `/' converted to `.'.
1679 Non-printable and non-ASCII characters are printed as '?'.
file.........the stream to write to
u............the utf string to print
1681 *******************************************************************************/
1683 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1685 char *endpos; /* points behind utf string */
1686 char *utf_ptr; /* current position in utf text */
1691 endpos = UTF_END(u);
1694 while (utf_ptr < endpos) {
1695 /* read next unicode character */
1696 u2 c = utf_nextu2(&utf_ptr);
/* package separator of the internal name becomes a dot */
1697 if (c == '/') c = '.';
/* NOTE(review): the upper bound admits 127 (DEL), which is not a */
/* printable character -- confirm whether `< 127' was intended    */
1699 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1700 else fprintf(file, "?");
1705 /* is_valid_utf ****************************************************************
1707 Return true if the given string is a valid UTF-8 string.

"Valid" means what the checks below accept:
  - end_pos must not lie before utf_ptr;
  - a 0x00 byte is rejected;
  - lead bytes announcing more than two continuation bytes are
    rejected ("Java limitation"), as are bytes that match no lead
    pattern;
  - the string must not end inside a multi-byte sequence;
  - every continuation byte must match 10xxxxxx.
Overlong encodings, surrogates and 0xfffe/0xffff are NOT rejected; the
corresponding checks are commented out.

1709 utf_ptr...points to first character
1710 end_pos...points after last character
1712 *******************************************************************************/
/* table for the disabled overlong-encoding check further down */
1714 /* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */
1716 bool is_valid_utf(char *utf_ptr, char *end_pos)
1723 if (end_pos < utf_ptr) return false;
/* number of bytes still to be checked */
1724 bytes = end_pos - utf_ptr;
1728 if (!c) return false; /* 0x00 is not allowed */
1729 if ((c & 0x80) == 0) continue; /* ASCII */
/* len = number of continuation bytes announced by the lead byte */
1731 if ((c & 0xe0) == 0xc0) len = 1; /* 110x xxxx */
1732 else if ((c & 0xf0) == 0xe0) len = 2; /* 1110 xxxx */
1733 else if ((c & 0xf8) == 0xf0) len = 3; /* 1111 0xxx */
1734 else if ((c & 0xfc) == 0xf8) len = 4; /* 1111 10xx */
1735 else if ((c & 0xfe) == 0xfc) len = 5; /* 1111 110x */
1736 else return false; /* invalid leading byte */
1738 if (len > 2) return false; /* Java limitation */
/* payload bits of the lead byte: 0x3f >> len is 0x1f for len 1, 0x0f for len 2 */
1740 v = (unsigned long)c & (0x3f >> len);
1742 if ((bytes -= len) < 0) return false; /* missing bytes */
/* runs len times, once per continuation byte */
1744 for (i = len; i--; ) {
1746 if ((c & 0xc0) != 0x80) /* 10xx xxxx */
/* append the six payload bits of the continuation byte */
1748 v = (v << 6) | (c & 0x3f);
1752 if (len != 1) return false; /* Java special */
1755 /* Sun Java seems to allow overlong UTF-8 encodings */
1757 /* if (v < min_codepoint[len]) */
1758 /* XXX throw exception? */
1761 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
1762 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
1764 /* even these seem to be allowed */
1765 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
1772 /* is_valid_name ***************************************************************
1774 Return true if the given string may be used as a class/field/method
1775 name. (Currently this only disallows empty strings and control
characters, including the two-byte form 0xc0 0x80 of zero.)
1778 NOTE: The string is assumed to have passed is_valid_utf!
1780 utf_ptr...points to first character
1781 end_pos...points after last character
1783 *******************************************************************************/
1785 bool is_valid_name(char *utf_ptr, char *end_pos)
1787 if (end_pos <= utf_ptr) return false; /* disallow empty names */
1789 while (utf_ptr < end_pos) {
1790 unsigned char c = *utf_ptr++;
1792 if (c < 0x20) return false; /* disallow control characters */
/* NOTE(review): utf_ptr was already advanced, so this reads the byte  */
/* after c; if 0xc0 were the last byte this would read at end_pos --   */
/* presumably safe because the input passed is_valid_utf, TODO confirm */
1793 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80) /* disallow zero */
/* is_valid_name_utf ***********************************************************

   Convenience wrapper: applies is_valid_name to the bytes of the utf
   string u, from u->text up to UTF_END(u), and returns its result.

   u............the utf string to check

*******************************************************************************/
1800 bool is_valid_name_utf(utf *u)
1802 return is_valid_name(u->text, UTF_END(u));
1806 /* utf_show ********************************************************************
1808 Writes the utf symbols in the utfhash to stdout and displays the
1809 number of external hash chains grouped according to the chainlength
1810 (for debugging purposes).
Only compiled when NDEBUG is not defined.
1812 *******************************************************************************/
1814 #if !defined(NDEBUG)
1818 #define CHAIN_LIMIT 20 /* limit for separate enumeration */
/* chain_count[n] = number of slots whose chain has length n; index 0 */
/* counts empty slots, the last index collects all longer chains      */
1820 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1821 u4 max_chainlength = 0; /* maximum length of the chains */
1822 u4 sum_chainlength = 0; /* sum of the chainlengths */
/* NOTE(review): the code below only adds chains with length >= CHAIN_LIMIT, */
/* not >= CHAIN_LIMIT-1 as this comment says -- confirm which is intended    */
1823 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1826 printf("UTF-HASH:\n");
1828 /* show element of utf-hashtable */
1830 for (i = 0; i < hashtable_utf->size; i++) {
1831 utf *u = hashtable_utf->ptr[i];
1834 printf("SLOT %d: ", (int) i);
1838 utf_display_printable_ascii(u);
1846 printf("UTF-HASH: %d slots for %d entries\n",
1847 (int) hashtable_utf->size, (int) hashtable_utf->entries );
/* the percentages below divide by the number of entries */
1849 if (hashtable_utf->entries == 0)
1852 printf("chains:\n chainlength number of chains %% of utfstrings\n");
1854 for (i=0;i<CHAIN_LIMIT;i++)
1857 /* count numbers of hashchains according to their length */
1858 for (i=0; i<hashtable_utf->size; i++) {
1860 utf *u = (utf*) hashtable_utf->ptr[i];
1861 u4 chain_length = 0;
1863 /* determine chainlength */
1869 /* update sum of all chainlengths */
1870 sum_chainlength+=chain_length;
1872 /* determine the maximum length of the chains */
1873 if (chain_length>max_chainlength)
1874 max_chainlength = chain_length;
1876 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
/* NOTE(review): the test is >= CHAIN_LIMIT, so a chain of exactly       */
/* CHAIN_LIMIT-1 entries is counted in chain_count[CHAIN_LIMIT-1] but    */
/* not in beyond_limit; the percentage of the ">=" row then omits it --  */
/* confirm                                                               */
1877 if (chain_length>=CHAIN_LIMIT) {
1878 beyond_limit+=chain_length;
/* clamp so that the index below stays inside chain_count */
1879 chain_length=CHAIN_LIMIT-1;
1882 /* update number of hashchains of current length */
1883 chain_count[chain_length]++;
1886 /* display results */
/* NOTE(review): u4 values are printed with %d here and below; u4 is */
/* presumably unsigned -- confirm the conversion specifiers          */
1887 for (i=1;i<CHAIN_LIMIT-1;i++)
1888 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1890 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1893 printf("max. chainlength:%5d\n",max_chainlength);
1895 /* avg. chainlength = sum of chainlengths / number of chains */
/* number of chains = slots minus the empty ones counted in chain_count[0] */
1896 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1898 #endif /* !defined(NDEBUG) */
1902 * These are local overrides for various environment variables in Emacs.
1903 * Please do not remove this and leave it at the end of the file, where
1904 * Emacs will automagically detect them.
1905 * ---------------------------------------------------------------------
1908 * indent-tabs-mode: t
1912 * vim:noexpandtab:sw=4:ts=4: