1 /* src/vmcore/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
4 C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5 E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6 J. Wenninger, Institut f. Computersprachen - TU Wien
8 This file is part of CACAO.
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License as
12 published by the Free Software Foundation; either version 2, or (at
13 your option) any later version.
15 This program is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25 $Id: utf8.c 8299 2007-08-13 08:41:18Z michi $
37 #include "mm/memory.h"
39 #include "threads/lock-common.h"
41 #include "toolbox/hashtable.h"
43 #include "vm/exceptions.h"
45 #include "vmcore/options.h"
47 #if defined(ENABLE_STATISTICS)
48 # include "vmcore/statistics.h"
51 #include "vmcore/utf8.h"
54 /* global variables ***********************************************************/
56 /* hashsize must be a power of 2 (slots are selected with key & (size - 1)) */
58 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
60 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
63 /* utf-symbols for pointer comparison of frequently used strings **************/
65 utf *utf_java_lang_Object;
67 utf *utf_java_lang_Class;
68 utf *utf_java_lang_ClassLoader;
69 utf *utf_java_lang_Cloneable;
70 utf *utf_java_lang_SecurityManager;
71 utf *utf_java_lang_String;
72 utf *utf_java_lang_System;
73 utf *utf_java_lang_ThreadGroup;
74 utf *utf_java_lang_ref_SoftReference;
75 utf *utf_java_lang_ref_WeakReference;
76 utf *utf_java_lang_ref_PhantomReference;
77 utf *utf_java_io_Serializable;
79 utf *utf_java_lang_Throwable;
80 utf *utf_java_lang_Error;
82 utf *utf_java_lang_AbstractMethodError;
83 utf *utf_java_lang_ClassCircularityError;
84 utf *utf_java_lang_ClassFormatError;
85 utf *utf_java_lang_ExceptionInInitializerError;
86 utf *utf_java_lang_IncompatibleClassChangeError;
87 utf *utf_java_lang_InstantiationError;
88 utf *utf_java_lang_InternalError;
89 utf *utf_java_lang_LinkageError;
90 utf *utf_java_lang_NoClassDefFoundError;
91 utf *utf_java_lang_NoSuchFieldError;
92 utf *utf_java_lang_NoSuchMethodError;
93 utf *utf_java_lang_OutOfMemoryError;
94 utf *utf_java_lang_UnsatisfiedLinkError;
95 utf *utf_java_lang_UnsupportedClassVersionError;
96 utf *utf_java_lang_VerifyError;
97 utf *utf_java_lang_VirtualMachineError;
99 #if defined(WITH_CLASSPATH_GNU)
100 utf *utf_java_lang_VMThrowable;
103 utf *utf_java_lang_Exception;
105 utf *utf_java_lang_ArithmeticException;
106 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
107 utf *utf_java_lang_ArrayStoreException;
108 utf *utf_java_lang_ClassCastException;
109 utf *utf_java_lang_ClassNotFoundException;
110 utf *utf_java_lang_CloneNotSupportedException;
111 utf *utf_java_lang_IllegalAccessException;
112 utf *utf_java_lang_IllegalArgumentException;
113 utf *utf_java_lang_IllegalMonitorStateException;
114 utf *utf_java_lang_InstantiationException;
115 utf *utf_java_lang_InterruptedException;
116 utf *utf_java_lang_NegativeArraySizeException;
117 utf *utf_java_lang_NullPointerException;
118 utf *utf_java_lang_StringIndexOutOfBoundsException;
120 utf *utf_java_lang_reflect_InvocationTargetException;
122 utf *utf_java_security_PrivilegedActionException;
124 #if defined(ENABLE_JAVASE)
125 utf* utf_java_lang_Void;
128 utf* utf_java_lang_Boolean;
129 utf* utf_java_lang_Byte;
130 utf* utf_java_lang_Character;
131 utf* utf_java_lang_Short;
132 utf* utf_java_lang_Integer;
133 utf* utf_java_lang_Long;
134 utf* utf_java_lang_Float;
135 utf* utf_java_lang_Double;
137 #if defined(ENABLE_JAVASE)
138 utf *utf_java_lang_StackTraceElement;
139 utf *utf_java_lang_reflect_Constructor;
140 utf *utf_java_lang_reflect_Field;
141 utf *utf_java_lang_reflect_Method;
142 utf *utf_java_util_Vector;
145 utf *utf_InnerClasses; /* InnerClasses */
146 utf *utf_ConstantValue; /* ConstantValue */
147 utf *utf_Code; /* Code */
148 utf *utf_Exceptions; /* Exceptions */
149 utf *utf_LineNumberTable; /* LineNumberTable */
150 utf *utf_SourceFile; /* SourceFile */
152 #if defined(ENABLE_JAVASE)
153 utf *utf_EnclosingMethod;
155 utf *utf_StackMapTable;
157 #if defined(ENABLE_ANNOTATIONS)
158 utf *utf_sun_reflect_ConstantPool;
159 #if defined(WITH_CLASSPATH_GNU)
160 utf *utf_sun_reflect_annotation_AnnotationParser;
163 utf *utf_RuntimeVisibleAnnotations;
164 utf *utf_RuntimeInvisibleAnnotations;
165 utf *utf_RuntimeVisibleParameterAnnotations;
166 utf *utf_RuntimeInvisibleParameterAnnotations;
167 utf *utf_AnnotationDefault;
171 utf *utf_init; /* <init> */
172 utf *utf_clinit; /* <clinit> */
173 utf *utf_clone; /* clone */
174 utf *utf_finalize; /* finalize */
175 utf *utf_run; /* run */
180 utf *utf_removeThread;
185 utf *utf_fillInStackTrace;
187 utf *utf_getSystemClassLoader;
190 utf *utf_printStackTrace;
192 utf *utf_division_by_zero;
203 utf *utf_void__void; /* ()V */
204 utf *utf_boolean__void; /* (Z)V */
205 utf *utf_byte__void; /* (B)V */
206 utf *utf_char__void; /* (C)V */
207 utf *utf_short__void; /* (S)V */
208 utf *utf_int__void; /* (I)V */
209 utf *utf_long__void; /* (J)V */
210 utf *utf_float__void; /* (F)V */
211 utf *utf_double__void; /* (D)V */
213 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
214 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
215 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
216 utf *utf_java_lang_ClassLoader_java_lang_String__J;
217 utf *utf_java_lang_Exception__V; /* (Ljava/lang/Exception;)V */
218 utf *utf_java_lang_Object__java_lang_Object;
219 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
220 utf *utf_java_lang_String__java_lang_Class;
221 utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */
222 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
223 utf *utf_java_lang_Throwable__java_lang_Throwable;
225 utf *utf_not_named_yet; /* special name for unnamed classes */
227 utf *array_packagename;
230 /* utf_init ********************************************************************
232 Initializes the utf8 subsystem.
234 *******************************************************************************/
238 /* create utf8 hashtable */
240 hashtable_utf = NEW(hashtable);
242 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
244 #if defined(ENABLE_STATISTICS)
246 count_utf_len += sizeof(utf*) * hashtable_utf->size;
249 /* create utf-symbols for pointer comparison of frequently used strings */
251 utf_java_lang_Object = utf_new_char("java/lang/Object");
253 utf_java_lang_Class = utf_new_char("java/lang/Class");
254 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
255 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
256 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
257 utf_java_lang_String = utf_new_char("java/lang/String");
258 utf_java_lang_System = utf_new_char("java/lang/System");
259 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
261 utf_java_lang_ref_SoftReference =
262 utf_new_char("java/lang/ref/SoftReference");
264 utf_java_lang_ref_WeakReference =
265 utf_new_char("java/lang/ref/WeakReference");
267 utf_java_lang_ref_PhantomReference =
268 utf_new_char("java/lang/ref/PhantomReference");
270 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
272 utf_java_lang_Throwable = utf_new_char("java/lang/Throwable");
273 utf_java_lang_Error = utf_new_char("java/lang/Error");
275 utf_java_lang_ClassCircularityError =
276 utf_new_char("java/lang/ClassCircularityError");
278 utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
280 utf_java_lang_ExceptionInInitializerError =
281 utf_new_char("java/lang/ExceptionInInitializerError");
283 utf_java_lang_IncompatibleClassChangeError =
284 utf_new_char("java/lang/IncompatibleClassChangeError");
286 utf_java_lang_InstantiationError =
287 utf_new_char("java/lang/InstantiationError");
289 utf_java_lang_InternalError = utf_new_char("java/lang/InternalError");
290 utf_java_lang_LinkageError = utf_new_char("java/lang/LinkageError");
292 utf_java_lang_NoClassDefFoundError =
293 utf_new_char("java/lang/NoClassDefFoundError");
295 utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
297 utf_java_lang_UnsatisfiedLinkError =
298 utf_new_char("java/lang/UnsatisfiedLinkError");
300 utf_java_lang_UnsupportedClassVersionError =
301 utf_new_char("java/lang/UnsupportedClassVersionError");
303 utf_java_lang_VerifyError = utf_new_char("java/lang/VerifyError");
305 utf_java_lang_VirtualMachineError =
306 utf_new_char("java/lang/VirtualMachineError");
308 #if defined(ENABLE_JAVASE)
309 utf_java_lang_AbstractMethodError =
310 utf_new_char("java/lang/AbstractMethodError");
312 utf_java_lang_NoSuchFieldError =
313 utf_new_char("java/lang/NoSuchFieldError");
315 utf_java_lang_NoSuchMethodError =
316 utf_new_char("java/lang/NoSuchMethodError");
319 #if defined(WITH_CLASSPATH_GNU)
320 utf_java_lang_VMThrowable = utf_new_char("java/lang/VMThrowable");
323 utf_java_lang_Exception = utf_new_char("java/lang/Exception");
325 utf_java_lang_ArithmeticException =
326 utf_new_char("java/lang/ArithmeticException");
328 utf_java_lang_ArrayIndexOutOfBoundsException =
329 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
331 utf_java_lang_ArrayStoreException =
332 utf_new_char("java/lang/ArrayStoreException");
334 utf_java_lang_ClassCastException =
335 utf_new_char("java/lang/ClassCastException");
337 utf_java_lang_ClassNotFoundException =
338 utf_new_char("java/lang/ClassNotFoundException");
340 utf_java_lang_CloneNotSupportedException =
341 utf_new_char("java/lang/CloneNotSupportedException");
343 utf_java_lang_IllegalAccessException =
344 utf_new_char("java/lang/IllegalAccessException");
346 utf_java_lang_IllegalArgumentException =
347 utf_new_char("java/lang/IllegalArgumentException");
349 utf_java_lang_IllegalMonitorStateException =
350 utf_new_char("java/lang/IllegalMonitorStateException");
352 utf_java_lang_InstantiationException =
353 utf_new_char("java/lang/InstantiationException");
355 utf_java_lang_InterruptedException =
356 utf_new_char("java/lang/InterruptedException");
358 utf_java_lang_NegativeArraySizeException =
359 utf_new_char("java/lang/NegativeArraySizeException");
361 utf_java_lang_NullPointerException =
362 utf_new_char("java/lang/NullPointerException");
364 utf_java_lang_StringIndexOutOfBoundsException =
365 utf_new_char("java/lang/StringIndexOutOfBoundsException");
367 utf_java_lang_reflect_InvocationTargetException =
368 utf_new_char("java/lang/reflect/InvocationTargetException");
370 utf_java_security_PrivilegedActionException =
371 utf_new_char("java/security/PrivilegedActionException");
373 #if defined(ENABLE_JAVASE)
374 utf_java_lang_Void = utf_new_char("java/lang/Void");
377 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
378 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
379 utf_java_lang_Character = utf_new_char("java/lang/Character");
380 utf_java_lang_Short = utf_new_char("java/lang/Short");
381 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
382 utf_java_lang_Long = utf_new_char("java/lang/Long");
383 utf_java_lang_Float = utf_new_char("java/lang/Float");
384 utf_java_lang_Double = utf_new_char("java/lang/Double");
386 #if defined(ENABLE_JAVASE)
387 utf_java_lang_StackTraceElement =
388 utf_new_char("java/lang/StackTraceElement");
390 utf_java_lang_reflect_Constructor =
391 utf_new_char("java/lang/reflect/Constructor");
393 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
394 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
395 utf_java_util_Vector = utf_new_char("java/util/Vector");
398 utf_InnerClasses = utf_new_char("InnerClasses");
399 utf_ConstantValue = utf_new_char("ConstantValue");
400 utf_Code = utf_new_char("Code");
401 utf_Exceptions = utf_new_char("Exceptions");
402 utf_LineNumberTable = utf_new_char("LineNumberTable");
403 utf_SourceFile = utf_new_char("SourceFile");
405 #if defined(ENABLE_JAVASE)
406 utf_EnclosingMethod = utf_new_char("EnclosingMethod");
407 utf_Signature = utf_new_char("Signature");
408 utf_StackMapTable = utf_new_char("StackMapTable");
410 #if defined(ENABLE_ANNOTATIONS)
411 utf_sun_reflect_ConstantPool = utf_new_char("sun/reflect/ConstantPool");
412 #if defined(WITH_CLASSPATH_GNU)
413 utf_sun_reflect_annotation_AnnotationParser = utf_new_char("sun/reflect/annotation/AnnotationParser");
416 utf_RuntimeVisibleAnnotations = utf_new_char("RuntimeVisibleAnnotations");
417 utf_RuntimeInvisibleAnnotations = utf_new_char("RuntimeInvisibleAnnotations");
418 utf_RuntimeVisibleParameterAnnotations = utf_new_char("RuntimeVisibleParameterAnnotations");
419 utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
420 utf_AnnotationDefault = utf_new_char("AnnotationDefault");
424 utf_init = utf_new_char("<init>");
425 utf_clinit = utf_new_char("<clinit>");
426 utf_clone = utf_new_char("clone");
427 utf_finalize = utf_new_char("finalize");
428 utf_run = utf_new_char("run");
430 utf_add = utf_new_char("add");
431 utf_remove = utf_new_char("remove");
432 utf_addThread = utf_new_char("addThread");
433 utf_removeThread = utf_new_char("removeThread");
434 utf_put = utf_new_char("put");
435 utf_get = utf_new_char("get");
436 utf_value = utf_new_char("value");
438 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
439 utf_findNative = utf_new_char("findNative");
440 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
441 utf_initCause = utf_new_char("initCause");
442 utf_loadClass = utf_new_char("loadClass");
443 utf_printStackTrace = utf_new_char("printStackTrace");
445 utf_division_by_zero = utf_new_char("/ by zero");
447 utf_Z = utf_new_char("Z");
448 utf_B = utf_new_char("B");
449 utf_C = utf_new_char("C");
450 utf_S = utf_new_char("S");
451 utf_I = utf_new_char("I");
452 utf_J = utf_new_char("J");
453 utf_F = utf_new_char("F");
454 utf_D = utf_new_char("D");
456 utf_void__void = utf_new_char("()V");
457 utf_boolean__void = utf_new_char("(Z)V");
458 utf_byte__void = utf_new_char("(B)V");
459 utf_char__void = utf_new_char("(C)V");
460 utf_short__void = utf_new_char("(S)V");
461 utf_int__void = utf_new_char("(I)V");
462 utf_long__void = utf_new_char("(J)V");
463 utf_float__void = utf_new_char("(F)V");
464 utf_double__void = utf_new_char("(D)V");
465 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
466 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
468 utf_void__java_lang_ClassLoader =
469 utf_new_char("()Ljava/lang/ClassLoader;");
471 utf_java_lang_ClassLoader_java_lang_String__J =
472 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
474 utf_java_lang_Exception__V = utf_new_char("(Ljava/lang/Exception;)V");
476 utf_java_lang_Object__java_lang_Object =
477 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
479 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
481 utf_java_lang_String__java_lang_Class =
482 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
484 utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V");
485 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
487 utf_java_lang_Throwable__java_lang_Throwable =
488 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
490 utf_null = utf_new_char("null");
491 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
492 array_packagename = utf_new_char("\t<the array package>");
494 /* everything's ok */
500 /* utf_hashkey *****************************************************************
502 The hashkey is computed from the utf-text by using up to 8
503 characters. For utf-symbols longer than 15 characters, 3 characters
504 are taken from the beginning and the end, and 2 from the middle.
507 *******************************************************************************/
/* Helper macros for utf_hashkey. Both expand to an expression that reads
   one byte through a pointer variable named `text`, which must be in scope
   at the point of use, converts it to u4 and shifts it left by `val`.

   nbs(val) pre-increments `text` and reads the byte it then points to, so
            every use has a side effect on `text`.
   fbs(val) reads the byte `text` currently points to and leaves `text`
            unchanged.

   `val` is parenthesized so that an argument such as `a | b` is used as
   the complete shift count instead of being split by operator precedence.

   NOTE(review): with a plain `char` that is signed, bytes >= 0x80 are
   sign-extended by the conversion to u4 -- confirm this is intended.
   NOTE(review): several nbs() uses combined with `^` in one expression
   modify `text` without an intervening sequence point; the result relies
   on left-to-right evaluation, which C does not guarantee. */

#define nbs(val) ((u4) *(++text) << (val)) /* get next byte, left shift by val */
#define fbs(val) ((u4) *( text) << (val)) /* get first byte, left shift by val */
512 u4 utf_hashkey(const char *text, u4 length)
514 const char *start_pos = text; /* pointer to utf text */
518 case 0: /* empty string */
521 case 1: return fbs(0);
522 case 2: return fbs(0) ^ nbs(3);
523 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
524 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
525 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
526 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
527 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
528 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
535 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
544 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
553 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
565 return a ^ nbs(9) ^ nbs(10);
577 return a ^ nbs(9) ^ nbs(10);
588 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
599 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
601 default: /* 3 characters from beginning */
607 /* 2 characters from middle */
608 text = start_pos + (length / 2);
613 /* 3 characters from end */
614 text = start_pos + length - 4;
619 return a ^ nbs(10) ^ nbs(11);
623 /* utf_full_hashkey ************************************************************
625 This function computes a hash value using all bytes in the string.
627 The algorithm is the "One-at-a-time" algorithm as published
628 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
630 *******************************************************************************/
632 u4 utf_full_hashkey(const char *text, u4 length)
634 register const unsigned char *p = (const unsigned char *) text;
642 hash += (hash << 10);
646 hash ^= (hash >> 11);
647 hash += (hash << 15);
652 /* unicode_hashkey *************************************************************
654 Compute the hashkey of a unicode string.
656 *******************************************************************************/
658 u4 unicode_hashkey(u2 *text, u2 len)
660 return utf_hashkey((char *) text, len);
664 /* utf_new *********************************************************************
666 Creates a new utf-symbol, the text of the symbol is passed as a
667 u1-array. The function searches the utf-hashtable for a utf-symbol
668 with this text. On success the element is returned; otherwise a new
669 hashtable element is created.
671 If the number of entries in the hashtable exceeds twice the number
672 of hashtable slots, the hashtable is reorganized: the utf symbols
673 are relinked into a new hashtable of doubled size.
675 *******************************************************************************/
677 utf *utf_new(const char *text, u2 length)
679 u4 key; /* hashkey computed from utf-text */
680 u4 slot; /* slot in hashtable */
681 utf *u; /* hashtable element */
684 LOCK_MONITOR_ENTER(hashtable_utf->header);
686 #if defined(ENABLE_STATISTICS)
691 key = utf_hashkey(text, length);
692 slot = key & (hashtable_utf->size - 1);
693 u = hashtable_utf->ptr[slot];
695 /* search external hash chain for utf-symbol */
698 if (u->blength == length) {
699 /* compare text of hashtable elements */
701 for (i = 0; i < length; i++)
702 if (text[i] != u->text[i])
705 #if defined(ENABLE_STATISTICS)
707 count_utf_new_found++;
710 /* symbol found in hashtable */
712 LOCK_MONITOR_EXIT(hashtable_utf->header);
718 u = u->hashlink; /* next element in external chain */
721 /* location in hashtable found, create new utf element */
725 u->blength = length; /* length in bytes of utfstring */
726 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
727 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
729 memcpy(u->text, text, length); /* copy utf-text */
730 u->text[length] = '\0';
732 #if defined(ENABLE_STATISTICS)
734 count_utf_len += sizeof(utf) + length + 1;
737 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
738 hashtable_utf->entries++; /* update number of entries */
740 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
742 /* reorganization of hashtable, average length of the external
743 chains is approx. 2 */
745 hashtable *newhash; /* the new hashtable */
751 /* create new hashtable, double the size */
753 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
755 #if defined(ENABLE_STATISTICS)
757 count_utf_len += sizeof(utf*) * hashtable_utf->size;
760 /* transfer elements to new hashtable */
762 for (i = 0; i < hashtable_utf->size; i++) {
763 u = hashtable_utf->ptr[i];
767 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
769 u->hashlink = (utf *) newhash->ptr[slot];
770 newhash->ptr[slot] = u;
772 /* follow link in external hash chain */
778 /* dispose old table */
780 hashtable_free(hashtable_utf);
782 hashtable_utf = newhash;
785 LOCK_MONITOR_EXIT(hashtable_utf->header);
791 /* utf_new_u2 ******************************************************************
793 Make utf symbol from u2 array; if isclassname is true, '.' is replaced by '/'.
796 *******************************************************************************/
798 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
800 char *buffer; /* memory buffer for unicode characters */
801 char *pos; /* pointer to current position in buffer */
802 u4 left; /* unicode characters left */
803 u4 buflength; /* utf length in bytes of the u2 array */
804 utf *result; /* resulting utf-string */
807 /* determine utf length in bytes and allocate memory */
809 buflength = u2_utflength(unicode_pos, unicode_length);
810 buffer = MNEW(char, buflength);
815 for (i = 0; i++ < unicode_length; unicode_pos++) {
816 /* next unicode character */
819 if ((c != 0) && (c < 0x80)) {
822 if ((int) left < 0) break;
823 /* convert classname */
824 if (isclassname && c == '.')
829 } else if (c < 0x800) {
831 unsigned char high = c >> 6;
832 unsigned char low = c & 0x3F;
834 if ((int) left < 0) break;
835 *pos++ = high | 0xC0;
841 char mid = (c >> 6) & 0x3F;
844 if ((int) left < 0) break;
845 *pos++ = high | 0xE0;
851 /* insert utf-string into symbol-table */
852 result = utf_new(buffer,buflength);
854 MFREE(buffer, char, buflength);
860 /* utf_new_char ****************************************************************
862 Creates a new utf symbol, the text for this symbol is passed as a
863 c-string ( = char* ).
865 *******************************************************************************/
867 utf *utf_new_char(const char *text)
869 return utf_new(text, strlen(text));
873 /* utf_new_char_classname ******************************************************
875 Creates a new utf symbol, the text for this symbol is passed as a
876 c-string ( = char* ). "." characters are replaced by "/". Since
877 utf_new_char is used often, this is a separate function instead
878 of an if inside utf_new_char.
880 *******************************************************************************/
882 utf *utf_new_char_classname(const char *text)
884 if (strchr(text, '.')) {
885 char *txt = strdup(text);
886 char *end = txt + strlen(txt);
890 for (c = txt; c < end; c++)
891 if (*c == '.') *c = '/';
893 tmpRes = utf_new(txt, strlen(txt));
899 return utf_new(text, strlen(text));
903 /* utf_nextu2 ******************************************************************
905 Read the next unicode character from the utf string and increment
906 the utf-string pointer accordingly.
908 CAUTION: This function is unsafe for input that was not checked
911 *******************************************************************************/
913 u2 utf_nextu2(char **utf_ptr)
915 /* uncompressed unicode character */
917 /* current position in utf text */
918 unsigned char *utf = (unsigned char *) (*utf_ptr);
919 /* bytes representing the unicode character */
920 unsigned char ch1, ch2, ch3;
921 /* number of bytes used to represent the unicode character */
924 switch ((ch1 = utf[0]) >> 4) {
925 default: /* 1 byte */
929 case 0xD: /* 2 bytes */
930 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
931 unsigned char high = ch1 & 0x1F;
932 unsigned char low = ch2 & 0x3F;
933 unicode_char = (high << 6) + low;
938 case 0xE: /* 2 or 3 bytes */
939 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
940 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
941 unsigned char low = ch3 & 0x3f;
942 unsigned char mid = ch2 & 0x3f;
943 unsigned char high = ch1 & 0x0f;
944 unicode_char = (((high << 6) + mid) << 6) + low;
952 /* update position in utf-text */
953 *utf_ptr = (char *) (utf + len);
959 /* utf_bytes *******************************************************************
961 Determine number of bytes (aka. octets) in the utf string.
964 u............utf string
967 The number of octets of this utf string.
968 There is _no_ terminating zero included in this count.
970 *******************************************************************************/
978 /* utf_get_number_of_u2s_for_buffer ********************************************
980 Determine number of UTF-16 u2s in the given UTF-8 buffer
982 CAUTION: This function is unsafe for input that was not checked
985 CAUTION: Use this function *only* when you want to convert a UTF-8 buffer
986 to an array of u2s (UTF-16) and want to know how many of them you will get.
987 All other uses of this function are probably wrong.
990 buffer........points to first char in buffer
991 blength.......number of _bytes_ in the buffer
994 the number of u2s needed to hold this string in UTF-16 encoding.
995 There is _no_ terminating zero included in this count.
997 NOTE: Unlike utf_get_number_of_u2s, this function never throws an exception.
1000 *******************************************************************************/
1002 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
1004 const char *endpos; /* points behind utf string */
1005 const char *utf_ptr; /* current position in utf text */
1006 u4 len = 0; /* number of unicode characters */
1009 endpos = utf_ptr + blength;
1011 while (utf_ptr < endpos) {
1013 /* next unicode character */
1014 utf_nextu2((char **)&utf_ptr);
1017 assert(utf_ptr == endpos);
1023 /* utf_get_number_of_u2s *******************************************************
1025 Determine number of UTF-16 u2s in the utf string.
1027 CAUTION: This function is unsafe for input that was not checked
1030 CAUTION: Use this function *only* when you want to convert a utf string
1031 to an array of u2s and want to know how many of them you will get.
1032 All other uses of this function are probably wrong.
1035 u............utf string
1038 the number of u2s needed to hold this string in UTF-16 encoding.
1039 There is _no_ terminating zero included in this count.
1040 XXX 0 if a NullPointerException has been thrown (see below)
1042 *******************************************************************************/
1044 u4 utf_get_number_of_u2s(utf *u)
1046 char *endpos; /* points behind utf string */
1047 char *utf_ptr; /* current position in utf text */
1048 u4 len = 0; /* number of unicode characters */
1050 /* XXX this is probably not checked by most callers! Review this after */
1051 /* the invalid uses of this function have been eliminated */
1053 exceptions_throw_nullpointerexception();
1057 endpos = UTF_END(u);
1060 while (utf_ptr < endpos) {
1062 /* next unicode character */
1063 utf_nextu2(&utf_ptr);
1066 if (utf_ptr != endpos) {
1067 /* string ended abruptly */
1068 exceptions_throw_internalerror("Illegal utf8 string");
1076 /* utf8_safe_number_of_u2s *****************************************************
1078 Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1079 (For invalid UTF-8 the U+fffd replacement character will be counted.)
1081 This function is safe even for invalid UTF-8 strings.
1084 text..........zero-terminated(!) UTF-8 string (may be invalid)
1086 nbytes........strlen(text). (This is needed to completely emulate
1090 the number of u2s needed to hold this string in UTF-16 encoding.
1091 There is _no_ terminating zero included in this count.
1093 *******************************************************************************/
1095 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1096 register const unsigned char *t;
1099 register const unsigned char *tlimit;
1107 assert(nbytes >= 0);
1110 t = (const unsigned char *) text;
1111 tlimit = t + nbytes;
1113 /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1119 /* highest bit set, non-ASCII character */
1121 if ((byte & 0xe0) == 0xc0) {
1122 /* 2-byte: should be 110..... 10...... ? */
1124 if ((*t++ & 0xc0) == 0x80)
1125 ; /* valid 2-byte */
1129 else if ((byte & 0xf0) == 0xe0) {
1130 /* 3-byte: should be 1110.... 10...... 10...... */
1134 return len + 1; /* invalid, stop here */
1136 if ((*t++ & 0xc0) == 0x80) {
1137 if ((*t++ & 0xc0) == 0x80)
1138 ; /* valid 3-byte */
1145 else if ((byte & 0xf8) == 0xf0) {
1146 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1150 return len + 1; /* invalid, stop here */
1152 if (((byte1 = *t++) & 0xc0) == 0x80) {
1153 if (((byte2 = *t++) & 0xc0) == 0x80) {
1154 if (((byte3 = *t++) & 0xc0) == 0x80) {
1155 /* valid 4-byte UTF-8? */
1156 value = ((byte & 0x07) << 18)
1157 | ((byte1 & 0x3f) << 12)
1158 | ((byte2 & 0x3f) << 6)
1159 | ((byte3 & 0x3f) );
1161 if (value > 0x10FFFF)
1163 else if (value > 0xFFFF)
1164 len += 1; /* we need surrogates */
1166 ; /* 16bit suffice */
1177 else if ((byte & 0xfc) == 0xf8) {
1178 /* invalid 5-byte */
1180 return len + 1; /* invalid, stop here */
1183 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1186 else if ((byte & 0xfe) == 0xfc) {
1187 /* invalid 6-byte */
1189 return len + 1; /* invalid, stop here */
1192 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1204 /* ASCII character, common case */
1214 /* utf8_safe_convert_to_u2s ****************************************************
1216 Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1217 (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1218 Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1220 This function is safe even for invalid UTF-8 strings.
1223 text..........zero-terminated(!) UTF-8 string (may be invalid)
1225 nbytes........strlen(text). (This is needed to completely emulate
1227 buffer........a preallocated array of u2s to receive the decoded
1228 string. Use utf8_safe_number_of_u2s to get the
1229 required number of u2s for allocating this.
1231 *******************************************************************************/
1233 #define UNICODE_REPLACEMENT 0xfffd
1235 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1236 register const unsigned char *t;
1238 register const unsigned char *tlimit;
1246 assert(nbytes >= 0);
1248 t = (const unsigned char *) text;
1249 tlimit = t + nbytes;
1251 /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1257 /* highest bit set, non-ASCII character */
1259 if ((byte & 0xe0) == 0xc0) {
1260 /* 2-byte: should be 110..... 10...... */
1262 if (((byte1 = *t++) & 0xc0) == 0x80) {
1263 /* valid 2-byte UTF-8 */
1264 *buffer++ = ((byte & 0x1f) << 6)
1265 | ((byte1 & 0x3f) );
1268 *buffer++ = UNICODE_REPLACEMENT;
1272 else if ((byte & 0xf0) == 0xe0) {
1273 /* 3-byte: should be 1110.... 10...... 10...... */
1275 if (t + 2 > tlimit) {
1276 *buffer++ = UNICODE_REPLACEMENT;
1280 if (((byte1 = *t++) & 0xc0) == 0x80) {
1281 if (((byte2 = *t++) & 0xc0) == 0x80) {
1282 /* valid 3-byte UTF-8 */
1283 *buffer++ = ((byte & 0x0f) << 12)
1284 | ((byte1 & 0x3f) << 6)
1285 | ((byte2 & 0x3f) );
1288 *buffer++ = UNICODE_REPLACEMENT;
1293 *buffer++ = UNICODE_REPLACEMENT;
1297 else if ((byte & 0xf8) == 0xf0) {
1298 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1300 if (t + 3 > tlimit) {
1301 *buffer++ = UNICODE_REPLACEMENT;
1305 if (((byte1 = *t++) & 0xc0) == 0x80) {
1306 if (((byte2 = *t++) & 0xc0) == 0x80) {
1307 if (((byte3 = *t++) & 0xc0) == 0x80) {
1308 /* valid 4-byte UTF-8? */
1309 value = ((byte & 0x07) << 18)
1310 | ((byte1 & 0x3f) << 12)
1311 | ((byte2 & 0x3f) << 6)
1312 | ((byte3 & 0x3f) );
1314 if (value > 0x10FFFF) {
1315 *buffer++ = UNICODE_REPLACEMENT;
1317 else if (value > 0xFFFF) {
1318 /* we need surrogates */
1319 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1320 *buffer++ = 0xdc00 | (value & 0x03ff);
1323 *buffer++ = value; /* 16bit suffice */
1326 *buffer++ = UNICODE_REPLACEMENT;
1331 *buffer++ = UNICODE_REPLACEMENT;
1336 *buffer++ = UNICODE_REPLACEMENT;
1340 else if ((byte & 0xfc) == 0xf8) {
1341 if (t + 4 > tlimit) {
1342 *buffer++ = UNICODE_REPLACEMENT;
1347 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1349 *buffer++ = UNICODE_REPLACEMENT;
1351 else if ((byte & 0xfe) == 0xfc) {
1352 if (t + 5 > tlimit) {
1353 *buffer++ = UNICODE_REPLACEMENT;
1358 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1360 *buffer++ = UNICODE_REPLACEMENT;
1363 *buffer++ = UNICODE_REPLACEMENT;
1371 /* ASCII character, common case */
1379 /* u2_utflength ****************************************************************
1381 Returns the utf length in bytes of a u2 array.
1383 *******************************************************************************/
1385 u4 u2_utflength(u2 *text, u4 u2_length)
1387 u4 result_len = 0; /* utf length in bytes */
1388 u2 ch; /* current unicode character */
1391 for (len = 0; len < u2_length; len++) {
1392 /* next unicode character */
1395 /* determine bytes required to store unicode character as utf */
1396 if (ch && (ch < 0x80))
1398 else if (ch < 0x800)
1408 /* utf_copy ********************************************************************
1410 Copy the given utf string byte-for-byte to a buffer.
1413 buffer.......the buffer
1414 u............the utf string
1416 *******************************************************************************/
1418 void utf_copy(char *buffer, utf *u)
1420 /* our utf strings are zero-terminated (done by utf_new) */
1421 MCOPY(buffer, u->text, char, u->blength + 1);
1425 /* utf_cat *********************************************************************
1427 Append the given utf string byte-for-byte to a buffer.
1430 buffer.......the buffer
1431 u............the utf string
1433 *******************************************************************************/
1435 void utf_cat(char *buffer, utf *u)
1437 /* our utf strings are zero-terminated (done by utf_new) */
1438 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1442 /* utf_copy_classname **********************************************************
1444 Copy the given utf classname byte-for-byte to a buffer.
1445 '/' is replaced by '.'
1448 buffer.......the buffer
1449 u............the utf string
1451 *******************************************************************************/
1453 void utf_copy_classname(char *buffer, utf *u)
1462 endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1464 while (srcptr != endptr) {
1473 /* utf_cat *********************************************************************
1475 Append the given utf classname byte-for-byte to a buffer.
1476 '/' is replaced by '.'
1479 buffer.......the buffer
1480 u............the utf string
1482 *******************************************************************************/
1484 void utf_cat_classname(char *buffer, utf *u)
1486 utf_copy_classname(buffer + strlen(buffer), u);
1489 /* utf_display_printable_ascii *************************************************
1491 Write utf symbol to stdout (for debugging purposes).
1492 Non-printable and non-ASCII characters are printed as '?'.
1494 *******************************************************************************/
1496 void utf_display_printable_ascii(utf *u)
1498 char *endpos; /* points behind utf string */
1499 char *utf_ptr; /* current position in utf text */
1507 endpos = UTF_END(u);
1510 while (utf_ptr < endpos) {
1511 /* read next unicode character */
1513 u2 c = utf_nextu2(&utf_ptr);
1515 if ((c >= 32) && (c <= 127))
1525 /* utf_display_printable_ascii_classname ***************************************
1527 Write utf symbol to stdout with `/' converted to `.' (for debugging
1529 Non-printable and non-ASCII characters are printed as '?'.
1531 *******************************************************************************/
1533 void utf_display_printable_ascii_classname(utf *u)
1535 char *endpos; /* points behind utf string */
1536 char *utf_ptr; /* current position in utf text */
1544 endpos = UTF_END(u);
1547 while (utf_ptr < endpos) {
1548 /* read next unicode character */
1550 u2 c = utf_nextu2(&utf_ptr);
1555 if ((c >= 32) && (c <= 127))
1565 /* utf_sprint_convert_to_latin1 ************************************************
1567 Write utf symbol into c-string (for debugging purposes).
1568 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1571 *******************************************************************************/
1573 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1575 char *endpos; /* points behind utf string */
1576 char *utf_ptr; /* current position in utf text */
1577 u2 pos = 0; /* position in c-string */
1580 strcpy(buffer, "NULL");
1584 endpos = UTF_END(u);
1587 while (utf_ptr < endpos)
1588 /* copy next unicode character */
1589 buffer[pos++] = utf_nextu2(&utf_ptr);
1591 /* terminate string */
1596 /* utf_sprint_convert_to_latin1_classname **************************************
1598 Write utf symbol into c-string with `/' converted to `.' (for debugging
1600 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1603 *******************************************************************************/
1605 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1607 char *endpos; /* points behind utf string */
1608 char *utf_ptr; /* current position in utf text */
1609 u2 pos = 0; /* position in c-string */
1612 strcpy(buffer, "NULL");
1616 endpos = UTF_END(u);
1619 while (utf_ptr < endpos) {
1620 /* copy next unicode character */
1621 u2 c = utf_nextu2(&utf_ptr);
1622 if (c == '/') c = '.';
1626 /* terminate string */
1631 /* utf_strcat_convert_to_latin1 ************************************************
1633 Like libc strcat, but uses an utf8 string.
1634 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1637 *******************************************************************************/
1639 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1641 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1645 /* utf_strcat_convert_to_latin1_classname **************************************
1647 Like libc strcat, but uses an utf8 string.
1648 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1651 *******************************************************************************/
1653 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1655 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1659 /* utf_fprint_printable_ascii **************************************************
1661 Write utf symbol into file.
1662 Non-printable and non-ASCII characters are printed as '?'.
1664 *******************************************************************************/
1666 void utf_fprint_printable_ascii(FILE *file, utf *u)
1668 char *endpos; /* points behind utf string */
1669 char *utf_ptr; /* current position in utf text */
1674 endpos = UTF_END(u);
1677 while (utf_ptr < endpos) {
1678 /* read next unicode character */
1679 u2 c = utf_nextu2(&utf_ptr);
1681 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1682 else fprintf(file, "?");
1687 /* utf_fprint_printable_ascii_classname ****************************************
1689 Write utf symbol into file with `/' converted to `.'.
1690 Non-printable and non-ASCII characters are printed as '?'.
1692 *******************************************************************************/
1694 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1696 char *endpos; /* points behind utf string */
1697 char *utf_ptr; /* current position in utf text */
1702 endpos = UTF_END(u);
1705 while (utf_ptr < endpos) {
1706 /* read next unicode character */
1707 u2 c = utf_nextu2(&utf_ptr);
1708 if (c == '/') c = '.';
1710 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1711 else fprintf(file, "?");
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string in the sense
   of Java classfiles:

      - a plain 0x00 byte is not allowed
      - only 1-, 2- and 3-byte sequences are accepted (Java limitation)
      - the code point zero must be written as the 2-byte sequence
        0xc0 0x80; the 3-byte form of zero is rejected (Java special)
      - other overlong encodings, surrogates and the code points
        0xfffe/0xffff are accepted, as Sun Java seems to allow them

   An empty string is valid.

   IN:
      utf_ptr...points to first character
      end_pos...points after last character

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	const unsigned char *p;             /* next byte to examine               */
	const unsigned char *limit;         /* points after last byte             */

	if (end_pos < utf_ptr)
		return false;

	p     = (const unsigned char *) utf_ptr;
	limit = (const unsigned char *) end_pos;

	while (p < limit) {
		unsigned int  lead = *p++;
		unsigned long cp;               /* decoded code point                 */
		int           trailing;         /* number of continuation bytes       */
		int           k;

		if (lead == 0)
			return false;               /* 0x00 is not allowed */

		if (lead < 0x80)
			continue;                   /* ASCII */

		if ((lead & 0xe0) == 0xc0)
			trailing = 1;               /* 110x xxxx */
		else if ((lead & 0xf0) == 0xe0)
			trailing = 2;               /* 1110 xxxx */
		else
			return false;               /* longer forms (Java limitation) */
			                            /* or invalid leading byte        */

		if (limit - p < trailing)
			return false;               /* missing bytes */

		cp = lead & (0x3f >> trailing);

		for (k = 0; k < trailing; k++) {
			unsigned int cont = *p++;

			if ((cont & 0xc0) != 0x80)  /* 10xx xxxx */
				return false;

			cp = (cp << 6) | (cont & 0x3f);
		}

		if (cp == 0 && trailing != 1)
			return false;               /* Java special */
	}

	return true;
}
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings and control
   characters, including an encoded zero character.)

   NOTE: The string is assumed to have passed is_valid_utf!

   IN:
      utf_ptr...points to first character
      end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters */

		/* 0xc0 0x80 is the 2-byte encoding of the zero character. The */
		/* second byte is only inspected if it still belongs to the    */
		/* string, so nothing at or behind end_pos is ever read.       */

		if (c == 0xc0 && utf_ptr < end_pos
			&& (unsigned char) *utf_ptr == 0x80)
			return false;               /* disallow zero */
	}

	return true;
}
1811 bool is_valid_name_utf(utf *u)
1813 return is_valid_name(u->text, UTF_END(u));
1817 /* utf_show ********************************************************************
1819 Writes the utf symbols in the utfhash to stdout and displays the
1820 number of external hash chains grouped according to the chainlength
1821 (for debugging purposes).
1823 *******************************************************************************/
1825 #if !defined(NDEBUG)
1829 #define CHAIN_LIMIT 20 /* limit for seperated enumeration */
1831 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1832 u4 max_chainlength = 0; /* maximum length of the chains */
1833 u4 sum_chainlength = 0; /* sum of the chainlengths */
1834 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1837 printf("UTF-HASH:\n");
1839 /* show element of utf-hashtable */
1841 for (i = 0; i < hashtable_utf->size; i++) {
1842 utf *u = hashtable_utf->ptr[i];
1845 printf("SLOT %d: ", (int) i);
1849 utf_display_printable_ascii(u);
1857 printf("UTF-HASH: %d slots for %d entries\n",
1858 (int) hashtable_utf->size, (int) hashtable_utf->entries );
1860 if (hashtable_utf->entries == 0)
1863 printf("chains:\n chainlength number of chains %% of utfstrings\n");
1865 for (i=0;i<CHAIN_LIMIT;i++)
1868 /* count numbers of hashchains according to their length */
1869 for (i=0; i<hashtable_utf->size; i++) {
1871 utf *u = (utf*) hashtable_utf->ptr[i];
1872 u4 chain_length = 0;
1874 /* determine chainlength */
1880 /* update sum of all chainlengths */
1881 sum_chainlength+=chain_length;
1883 /* determine the maximum length of the chains */
1884 if (chain_length>max_chainlength)
1885 max_chainlength = chain_length;
1887 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1888 if (chain_length>=CHAIN_LIMIT) {
1889 beyond_limit+=chain_length;
1890 chain_length=CHAIN_LIMIT-1;
1893 /* update number of hashchains of current length */
1894 chain_count[chain_length]++;
1897 /* display results */
1898 for (i=1;i<CHAIN_LIMIT-1;i++)
1899 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1901 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1904 printf("max. chainlength:%5d\n",max_chainlength);
1906 /* avg. chainlength = sum of chainlengths / number of chains */
1907 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1909 #endif /* !defined(NDEBUG) */
1913 * These are local overrides for various environment variables in Emacs.
1914 * Please do not remove this and leave it at the end of the file, where
1915 * Emacs will automagically detect them.
1916 * ---------------------------------------------------------------------
1919 * indent-tabs-mode: t
1923 * vim:noexpandtab:sw=4:ts=4: