1 /* src/vmcore/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006, 2007, 2008
4 CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
6 This file is part of CACAO.
8 This program is free software; you can redistribute it and/or
9 modify it under the terms of the GNU General Public License as
10 published by the Free Software Foundation; either version 2, or (at
11 your option) any later version.
13 This program is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include "mm/memory.h"
35 #include "threads/lock-common.h"
37 #include "toolbox/hashtable.h"
39 #include "vm/exceptions.h"
41 #include "vmcore/options.h"
43 #if defined(ENABLE_STATISTICS)
44 # include "vmcore/statistics.h"
47 #include "vmcore/utf8.h"
50 /* global variables ***********************************************************/
52 /* hashsize must be a power of 2 */
54 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
56 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
59 /* utf-symbols for pointer comparison of frequently used strings **************/
61 utf *utf_java_lang_Object;
63 utf *utf_java_lang_Class;
64 utf *utf_java_lang_ClassLoader;
65 utf *utf_java_lang_Cloneable;
66 utf *utf_java_lang_SecurityManager;
67 utf *utf_java_lang_String;
68 utf *utf_java_lang_ThreadGroup;
69 utf *utf_java_lang_ref_SoftReference;
70 utf *utf_java_lang_ref_WeakReference;
71 utf *utf_java_lang_ref_PhantomReference;
72 utf *utf_java_io_Serializable;
74 utf *utf_java_lang_Throwable;
75 utf *utf_java_lang_Error;
77 utf *utf_java_lang_AbstractMethodError;
78 utf *utf_java_lang_ClassCircularityError;
79 utf *utf_java_lang_ClassFormatError;
80 utf *utf_java_lang_ExceptionInInitializerError;
81 utf *utf_java_lang_IncompatibleClassChangeError;
82 utf *utf_java_lang_InstantiationError;
83 utf *utf_java_lang_InternalError;
84 utf *utf_java_lang_LinkageError;
85 utf *utf_java_lang_NoClassDefFoundError;
86 utf *utf_java_lang_NoSuchFieldError;
87 utf *utf_java_lang_NoSuchMethodError;
88 utf *utf_java_lang_OutOfMemoryError;
89 utf *utf_java_lang_UnsatisfiedLinkError;
90 utf *utf_java_lang_UnsupportedClassVersionError;
91 utf *utf_java_lang_VerifyError;
92 utf *utf_java_lang_VirtualMachineError;
94 utf *utf_java_lang_Exception;
96 utf *utf_java_lang_ArithmeticException;
97 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
98 utf *utf_java_lang_ArrayStoreException;
99 utf *utf_java_lang_ClassCastException;
100 utf *utf_java_lang_ClassNotFoundException;
101 utf *utf_java_lang_CloneNotSupportedException;
102 utf *utf_java_lang_IllegalAccessException;
103 utf *utf_java_lang_IllegalArgumentException;
104 utf *utf_java_lang_IllegalMonitorStateException;
105 utf *utf_java_lang_InstantiationException;
106 utf *utf_java_lang_InterruptedException;
107 utf *utf_java_lang_NegativeArraySizeException;
108 utf *utf_java_lang_NullPointerException;
109 utf *utf_java_lang_RuntimeException;
110 utf *utf_java_lang_StringIndexOutOfBoundsException;
112 utf *utf_java_lang_reflect_InvocationTargetException;
114 utf *utf_java_security_PrivilegedActionException;
116 #if defined(ENABLE_JAVASE)
117 utf* utf_java_lang_Void;
120 utf* utf_java_lang_Boolean;
121 utf* utf_java_lang_Byte;
122 utf* utf_java_lang_Character;
123 utf* utf_java_lang_Short;
124 utf* utf_java_lang_Integer;
125 utf* utf_java_lang_Long;
126 utf* utf_java_lang_Float;
127 utf* utf_java_lang_Double;
129 #if defined(ENABLE_JAVASE)
130 utf *utf_java_lang_StackTraceElement;
131 utf *utf_java_lang_reflect_Constructor;
132 utf *utf_java_lang_reflect_Field;
133 utf *utf_java_lang_reflect_Method;
134 utf *utf_java_util_Vector;
137 utf *utf_InnerClasses; /* InnerClasses */
138 utf *utf_ConstantValue; /* ConstantValue */
139 utf *utf_Code; /* Code */
140 utf *utf_Exceptions; /* Exceptions */
141 utf *utf_LineNumberTable; /* LineNumberTable */
142 utf *utf_SourceFile; /* SourceFile */
144 #if defined(ENABLE_JAVASE)
145 utf *utf_EnclosingMethod;
147 utf *utf_StackMapTable;
149 #if defined(ENABLE_ANNOTATIONS)
150 utf *utf_RuntimeVisibleAnnotations; /* RuntimeVisibleAnnotations */
151 utf *utf_RuntimeInvisibleAnnotations; /* RuntimeInvisibleAnnotations */
152 utf *utf_RuntimeVisibleParameterAnnotations; /* RuntimeVisibleParameterAnnotations */
153 utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
154 utf *utf_AnnotationDefault; /* AnnotationDefault */
158 utf *utf_init; /* <init> */
159 utf *utf_clinit; /* <clinit> */
160 utf *utf_clone; /* clone */
161 utf *utf_finalize; /* finalize */
164 utf *utf_run; /* run */
169 utf *utf_removeThread;
172 utf *utf_uncaughtException;
175 utf *utf_fillInStackTrace;
177 utf *utf_getSystemClassLoader;
180 utf *utf_loadClassInternal;
181 utf *utf_printStackTrace;
183 utf *utf_division_by_zero;
194 utf *utf_void__void; /* ()V */
195 utf *utf_boolean__void; /* (Z)V */
196 utf *utf_byte__void; /* (B)V */
197 utf *utf_char__void; /* (C)V */
198 utf *utf_short__void; /* (S)V */
199 utf *utf_int__void; /* (I)V */
200 utf *utf_long__void; /* (J)V */
201 utf *utf_float__void; /* (F)V */
202 utf *utf_double__void; /* (D)V */
204 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
205 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
206 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
207 utf *utf_java_lang_ClassLoader_java_lang_String__J;
208 utf *utf_java_lang_Exception__V; /* (Ljava/lang/Exception;)V */
209 utf *utf_java_lang_Object__java_lang_Object;
210 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
211 utf *utf_java_lang_String__java_lang_Class;
212 utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */
213 utf *utf_java_lang_Thread_java_lang_Throwable__V;
214 utf *utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V;
215 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
216 utf *utf_java_lang_Throwable__java_lang_Throwable;
218 utf *utf_not_named_yet; /* special name for unnamed classes */
220 utf *array_packagename;
223 /* utf8_init *******************************************************************
225 Initializes the utf8 subsystem.
227 *******************************************************************************/
231 TRACESUBSYSTEMINITIALIZATION("utf8_init");
233 /* create utf8 hashtable */
235 hashtable_utf = NEW(hashtable);
237 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
239 #if defined(ENABLE_STATISTICS)
241 count_utf_len += sizeof(utf*) * hashtable_utf->size;
244 /* create utf-symbols for pointer comparison of frequently used strings */
246 utf_java_lang_Object = utf_new_char("java/lang/Object");
248 utf_java_lang_Class = utf_new_char("java/lang/Class");
249 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
250 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
251 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
252 utf_java_lang_String = utf_new_char("java/lang/String");
253 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
255 utf_java_lang_ref_SoftReference =
256 utf_new_char("java/lang/ref/SoftReference");
258 utf_java_lang_ref_WeakReference =
259 utf_new_char("java/lang/ref/WeakReference");
261 utf_java_lang_ref_PhantomReference =
262 utf_new_char("java/lang/ref/PhantomReference");
264 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
266 utf_java_lang_Throwable = utf_new_char("java/lang/Throwable");
267 utf_java_lang_Error = utf_new_char("java/lang/Error");
269 utf_java_lang_ClassCircularityError =
270 utf_new_char("java/lang/ClassCircularityError");
272 utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
274 utf_java_lang_ExceptionInInitializerError =
275 utf_new_char("java/lang/ExceptionInInitializerError");
277 utf_java_lang_IncompatibleClassChangeError =
278 utf_new_char("java/lang/IncompatibleClassChangeError");
280 utf_java_lang_InstantiationError =
281 utf_new_char("java/lang/InstantiationError");
283 utf_java_lang_InternalError = utf_new_char("java/lang/InternalError");
284 utf_java_lang_LinkageError = utf_new_char("java/lang/LinkageError");
286 utf_java_lang_NoClassDefFoundError =
287 utf_new_char("java/lang/NoClassDefFoundError");
289 utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
291 utf_java_lang_UnsatisfiedLinkError =
292 utf_new_char("java/lang/UnsatisfiedLinkError");
294 utf_java_lang_UnsupportedClassVersionError =
295 utf_new_char("java/lang/UnsupportedClassVersionError");
297 utf_java_lang_VerifyError = utf_new_char("java/lang/VerifyError");
299 utf_java_lang_VirtualMachineError =
300 utf_new_char("java/lang/VirtualMachineError");
302 #if defined(ENABLE_JAVASE)
303 utf_java_lang_AbstractMethodError =
304 utf_new_char("java/lang/AbstractMethodError");
306 utf_java_lang_NoSuchFieldError =
307 utf_new_char("java/lang/NoSuchFieldError");
309 utf_java_lang_NoSuchMethodError =
310 utf_new_char("java/lang/NoSuchMethodError");
313 utf_java_lang_Exception = utf_new_char("java/lang/Exception");
315 utf_java_lang_ArithmeticException =
316 utf_new_char("java/lang/ArithmeticException");
318 utf_java_lang_ArrayIndexOutOfBoundsException =
319 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
321 utf_java_lang_ArrayStoreException =
322 utf_new_char("java/lang/ArrayStoreException");
324 utf_java_lang_ClassCastException =
325 utf_new_char("java/lang/ClassCastException");
327 utf_java_lang_ClassNotFoundException =
328 utf_new_char("java/lang/ClassNotFoundException");
330 utf_java_lang_CloneNotSupportedException =
331 utf_new_char("java/lang/CloneNotSupportedException");
333 utf_java_lang_IllegalAccessException =
334 utf_new_char("java/lang/IllegalAccessException");
336 utf_java_lang_IllegalArgumentException =
337 utf_new_char("java/lang/IllegalArgumentException");
339 utf_java_lang_IllegalMonitorStateException =
340 utf_new_char("java/lang/IllegalMonitorStateException");
342 utf_java_lang_InstantiationException =
343 utf_new_char("java/lang/InstantiationException");
345 utf_java_lang_InterruptedException =
346 utf_new_char("java/lang/InterruptedException");
348 utf_java_lang_NegativeArraySizeException =
349 utf_new_char("java/lang/NegativeArraySizeException");
351 utf_java_lang_NullPointerException =
352 utf_new_char("java/lang/NullPointerException");
354 utf_java_lang_RuntimeException =
355 utf_new_char("java/lang/RuntimeException");
357 utf_java_lang_StringIndexOutOfBoundsException =
358 utf_new_char("java/lang/StringIndexOutOfBoundsException");
360 utf_java_lang_reflect_InvocationTargetException =
361 utf_new_char("java/lang/reflect/InvocationTargetException");
363 utf_java_security_PrivilegedActionException =
364 utf_new_char("java/security/PrivilegedActionException");
366 #if defined(ENABLE_JAVASE)
367 utf_java_lang_Void = utf_new_char("java/lang/Void");
370 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
371 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
372 utf_java_lang_Character = utf_new_char("java/lang/Character");
373 utf_java_lang_Short = utf_new_char("java/lang/Short");
374 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
375 utf_java_lang_Long = utf_new_char("java/lang/Long");
376 utf_java_lang_Float = utf_new_char("java/lang/Float");
377 utf_java_lang_Double = utf_new_char("java/lang/Double");
379 #if defined(ENABLE_JAVASE)
380 utf_java_lang_StackTraceElement =
381 utf_new_char("java/lang/StackTraceElement");
383 utf_java_lang_reflect_Constructor =
384 utf_new_char("java/lang/reflect/Constructor");
386 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
387 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
388 utf_java_util_Vector = utf_new_char("java/util/Vector");
391 utf_InnerClasses = utf_new_char("InnerClasses");
392 utf_ConstantValue = utf_new_char("ConstantValue");
393 utf_Code = utf_new_char("Code");
394 utf_Exceptions = utf_new_char("Exceptions");
395 utf_LineNumberTable = utf_new_char("LineNumberTable");
396 utf_SourceFile = utf_new_char("SourceFile");
398 #if defined(ENABLE_JAVASE)
399 utf_EnclosingMethod = utf_new_char("EnclosingMethod");
400 utf_Signature = utf_new_char("Signature");
401 utf_StackMapTable = utf_new_char("StackMapTable");
403 #if defined(ENABLE_ANNOTATIONS)
404 utf_RuntimeVisibleAnnotations = utf_new_char("RuntimeVisibleAnnotations");
405 utf_RuntimeInvisibleAnnotations = utf_new_char("RuntimeInvisibleAnnotations");
406 utf_RuntimeVisibleParameterAnnotations = utf_new_char("RuntimeVisibleParameterAnnotations");
407 utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
408 utf_AnnotationDefault = utf_new_char("AnnotationDefault");
412 utf_init = utf_new_char("<init>");
413 utf_clinit = utf_new_char("<clinit>");
414 utf_clone = utf_new_char("clone");
415 utf_finalize = utf_new_char("finalize");
416 utf_invoke = utf_new_char("invoke");
417 utf_main = utf_new_char("main");
418 utf_run = utf_new_char("run");
420 utf_add = utf_new_char("add");
421 utf_remove = utf_new_char("remove");
422 utf_addThread = utf_new_char("addThread");
423 utf_removeThread = utf_new_char("removeThread");
424 utf_put = utf_new_char("put");
425 utf_get = utf_new_char("get");
426 utf_uncaughtException = utf_new_char("uncaughtException");
427 utf_value = utf_new_char("value");
429 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
430 utf_findNative = utf_new_char("findNative");
431 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
432 utf_initCause = utf_new_char("initCause");
433 utf_loadClass = utf_new_char("loadClass");
434 utf_loadClassInternal = utf_new_char("loadClassInternal");
435 utf_printStackTrace = utf_new_char("printStackTrace");
437 utf_division_by_zero = utf_new_char("/ by zero");
439 utf_Z = utf_new_char("Z");
440 utf_B = utf_new_char("B");
441 utf_C = utf_new_char("C");
442 utf_S = utf_new_char("S");
443 utf_I = utf_new_char("I");
444 utf_J = utf_new_char("J");
445 utf_F = utf_new_char("F");
446 utf_D = utf_new_char("D");
448 utf_void__void = utf_new_char("()V");
449 utf_boolean__void = utf_new_char("(Z)V");
450 utf_byte__void = utf_new_char("(B)V");
451 utf_char__void = utf_new_char("(C)V");
452 utf_short__void = utf_new_char("(S)V");
453 utf_int__void = utf_new_char("(I)V");
454 utf_long__void = utf_new_char("(J)V");
455 utf_float__void = utf_new_char("(F)V");
456 utf_double__void = utf_new_char("(D)V");
457 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
458 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
460 utf_void__java_lang_ClassLoader =
461 utf_new_char("()Ljava/lang/ClassLoader;");
463 utf_java_lang_ClassLoader_java_lang_String__J =
464 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
466 utf_java_lang_Exception__V = utf_new_char("(Ljava/lang/Exception;)V");
468 utf_java_lang_Object__java_lang_Object =
469 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
471 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
473 utf_java_lang_String__java_lang_Class =
474 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
476 utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V");
478 utf_java_lang_Thread_java_lang_Throwable__V =
479 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
481 utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V =
482 utf_new_char("(Ljava/lang/ThreadGroup;Ljava/lang/String;)V");
484 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
486 utf_java_lang_Throwable__java_lang_Throwable =
487 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
489 utf_null = utf_new_char("null");
490 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
491 array_packagename = utf_new_char("\t<the array package>");
495 /* utf_hashkey *****************************************************************
497 The hashkey is computed from the utf-text by using up to 8
498 characters. For utf-symbols longer than 15 characters 3 characters
499 are taken from the beginning and the end, 2 characters are taken from the middle.
502 *******************************************************************************/
504 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
505 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
507 u4 utf_hashkey(const char *text, u4 length)
509 const char *start_pos = text; /* pointer to utf text */
513 case 0: /* empty string */
516 case 1: return fbs(0);
517 case 2: return fbs(0) ^ nbs(3);
518 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
519 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
520 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
521 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
522 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
523 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
530 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
539 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
548 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
560 return a ^ nbs(9) ^ nbs(10);
572 return a ^ nbs(9) ^ nbs(10);
583 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
594 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
596 default: /* 3 characters from beginning */
602 /* 2 characters from middle */
603 text = start_pos + (length / 2);
608 /* 3 characters from end */
609 text = start_pos + length - 4;
614 return a ^ nbs(10) ^ nbs(11);
618 /* utf_full_hashkey ************************************************************
620 This function computes a hash value using all bytes in the string.
622 The algorithm is the "One-at-a-time" algorithm as published
623 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
625 *******************************************************************************/
627 u4 utf_full_hashkey(const char *text, u4 length)
629 register const unsigned char *p = (const unsigned char *) text;
637 hash += (hash << 10);
641 hash ^= (hash >> 11);
642 hash += (hash << 15);
647 /* unicode_hashkey *************************************************************
649 Compute the hashkey of a unicode string.
651 *******************************************************************************/
653 u4 unicode_hashkey(u2 *text, u2 len)
655 return utf_hashkey((char *) text, len);
659 /* utf_new *********************************************************************
661 Creates a new utf-symbol, the text of the symbol is passed as a
662 u1-array. The function searches the utf-hashtable for a utf-symbol
663 with this text. On success the element is returned, otherwise a new
664 hashtable element is created.
666 If the number of entries in the hashtable exceeds twice the size of
667 the hashtable slots a reorganization of the hashtable is done and
668 the utf symbols are copied to a new hashtable with doubled size.
670 *******************************************************************************/
672 utf *utf_new(const char *text, u2 length)
674 u4 key; /* hashkey computed from utf-text */
675 u4 slot; /* slot in hashtable */
676 utf *u; /* hashtable element */
679 LOCK_MONITOR_ENTER(hashtable_utf->header);
681 #if defined(ENABLE_STATISTICS)
686 key = utf_hashkey(text, length);
687 slot = key & (hashtable_utf->size - 1);
688 u = hashtable_utf->ptr[slot];
690 /* search external hash chain for utf-symbol */
693 if (u->blength == length) {
694 /* compare text of hashtable elements */
696 for (i = 0; i < length; i++)
697 if (text[i] != u->text[i])
700 #if defined(ENABLE_STATISTICS)
702 count_utf_new_found++;
705 /* symbol found in hashtable */
707 LOCK_MONITOR_EXIT(hashtable_utf->header);
713 u = u->hashlink; /* next element in external chain */
716 /* location in hashtable found, create new utf element */
720 u->blength = length; /* length in bytes of utfstring */
721 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
722 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
724 memcpy(u->text, text, length); /* copy utf-text */
725 u->text[length] = '\0';
727 #if defined(ENABLE_STATISTICS)
729 count_utf_len += sizeof(utf) + length + 1;
732 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
733 hashtable_utf->entries++; /* update number of entries */
735 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
737 /* reorganization of hashtable, average length of the external
738 chains is approx. 2 */
740 hashtable *newhash; /* the new hashtable */
746 /* create new hashtable, double the size */
748 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
750 #if defined(ENABLE_STATISTICS)
752 count_utf_len += sizeof(utf*) * hashtable_utf->size;
755 /* transfer elements to new hashtable */
757 for (i = 0; i < hashtable_utf->size; i++) {
758 u = hashtable_utf->ptr[i];
762 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
764 u->hashlink = (utf *) newhash->ptr[slot];
765 newhash->ptr[slot] = u;
767 /* follow link in external hash chain */
773 /* dispose old table */
775 hashtable_free(hashtable_utf);
777 hashtable_utf = newhash;
780 LOCK_MONITOR_EXIT(hashtable_utf->header);
786 /* utf_new_u2 ******************************************************************
788 Make utf symbol from u2 array, if isclassname is true '.' is
791 *******************************************************************************/
793 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
795 char *buffer; /* memory buffer for unicode characters */
796 char *pos; /* pointer to current position in buffer */
797 u4 left; /* unicode characters left */
798 u4 buflength; /* utf length in bytes of the u2 array */
799 utf *result; /* resulting utf-string */
802 /* determine utf length in bytes and allocate memory */
804 buflength = u2_utflength(unicode_pos, unicode_length);
805 buffer = MNEW(char, buflength);
810 for (i = 0; i++ < unicode_length; unicode_pos++) {
811 /* next unicode character */
814 if ((c != 0) && (c < 0x80)) {
817 if ((int) left < 0) break;
818 /* convert classname */
819 if (isclassname && c == '.')
824 } else if (c < 0x800) {
826 unsigned char high = c >> 6;
827 unsigned char low = c & 0x3F;
829 if ((int) left < 0) break;
830 *pos++ = high | 0xC0;
836 char mid = (c >> 6) & 0x3F;
839 if ((int) left < 0) break;
840 *pos++ = high | 0xE0;
846 /* insert utf-string into symbol-table */
847 result = utf_new(buffer,buflength);
849 MFREE(buffer, char, buflength);
855 /* utf_new_char ****************************************************************
857 Creates a new utf symbol, the text for this symbol is passed as a
858 c-string ( = char* ).
860 *******************************************************************************/
862 utf *utf_new_char(const char *text)
864 return utf_new(text, strlen(text));
868 /* utf_new_char_classname ******************************************************
870 Creates a new utf symbol, the text for this symbol is passed as a
871 c-string ( = char* ); "." characters are replaced by
872 "/". Since the above function is used often, this is a separate
873 function, instead of an if.
875 *******************************************************************************/
877 utf *utf_new_char_classname(const char *text)
879 if (strchr(text, '.')) {
880 char *txt = strdup(text);
881 char *end = txt + strlen(txt);
885 for (c = txt; c < end; c++)
886 if (*c == '.') *c = '/';
888 tmpRes = utf_new(txt, strlen(txt));
894 return utf_new(text, strlen(text));
898 /* utf_nextu2 ******************************************************************
900 Read the next unicode character from the utf string and increment
901 the utf-string pointer accordingly.
903 CAUTION: This function is unsafe for input that was not checked
906 *******************************************************************************/
908 u2 utf_nextu2(char **utf_ptr)
910 /* uncompressed unicode character */
912 /* current position in utf text */
913 unsigned char *utf = (unsigned char *) (*utf_ptr);
914 /* bytes representing the unicode character */
915 unsigned char ch1, ch2, ch3;
916 /* number of bytes used to represent the unicode character */
919 switch ((ch1 = utf[0]) >> 4) {
920 default: /* 1 byte */
924 case 0xD: /* 2 bytes */
925 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
926 unsigned char high = ch1 & 0x1F;
927 unsigned char low = ch2 & 0x3F;
928 unicode_char = (high << 6) + low;
933 case 0xE: /* 2 or 3 bytes */
934 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
935 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
936 unsigned char low = ch3 & 0x3f;
937 unsigned char mid = ch2 & 0x3f;
938 unsigned char high = ch1 & 0x0f;
939 unicode_char = (((high << 6) + mid) << 6) + low;
947 /* update position in utf-text */
948 *utf_ptr = (char *) (utf + len);
954 /* utf_bytes *******************************************************************
956 Determine number of bytes (aka. octets) in the utf string.
959 u............utf string
962 The number of octets of this utf string.
963 There is _no_ terminating zero included in this count.
965 *******************************************************************************/
973 /* utf_get_number_of_u2s_for_buffer ********************************************
975 Determine number of UTF-16 u2s in the given UTF-8 buffer
977 CAUTION: This function is unsafe for input that was not checked
980 CAUTION: Use this function *only* when you want to convert a UTF-8 buffer
981 to an array of u2s (UTF-16) and want to know how many of them you will get.
982 All other uses of this function are probably wrong.
985 buffer........points to first char in buffer
986 blength.......number of _bytes_ in the buffer
989 the number of u2s needed to hold this string in UTF-16 encoding.
990 There is _no_ terminating zero included in this count.
992 NOTE: Unlike utf_get_number_of_u2s, this function never throws an
995 *******************************************************************************/
997 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
999 const char *endpos; /* points behind utf string */
1000 const char *utf_ptr; /* current position in utf text */
1001 u4 len = 0; /* number of unicode characters */
1004 endpos = utf_ptr + blength;
1006 while (utf_ptr < endpos) {
1008 /* next unicode character */
1009 utf_nextu2((char **)&utf_ptr);
1012 assert(utf_ptr == endpos);
1018 /* utf_get_number_of_u2s *******************************************************
1020 Determine number of UTF-16 u2s in the utf string.
1022 CAUTION: This function is unsafe for input that was not checked
1025 CAUTION: Use this function *only* when you want to convert a utf string
1026 to an array of u2s and want to know how many of them you will get.
1027 All other uses of this function are probably wrong.
1030 u............utf string
1033 the number of u2s needed to hold this string in UTF-16 encoding.
1034 There is _no_ terminating zero included in this count.
1035 XXX 0 if a NullPointerException has been thrown (see below)
1037 *******************************************************************************/
1039 u4 utf_get_number_of_u2s(utf *u)
1041 char *endpos; /* points behind utf string */
1042 char *utf_ptr; /* current position in utf text */
1043 u4 len = 0; /* number of unicode characters */
1045 /* XXX this is probably not checked by most callers! Review this after */
1046 /* the invalid uses of this function have been eliminated */
1048 exceptions_throw_nullpointerexception();
1052 endpos = UTF_END(u);
1055 while (utf_ptr < endpos) {
1057 /* next unicode character */
1058 utf_nextu2(&utf_ptr);
1061 if (utf_ptr != endpos) {
1062 /* string ended abruptly */
1063 exceptions_throw_internalerror("Illegal utf8 string");
1071 /* utf8_safe_number_of_u2s *****************************************************
1073 Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1074 (For invalid UTF-8 the U+fffd replacement character will be counted.)
1076 This function is safe even for invalid UTF-8 strings.
1079 text..........zero-terminated(!) UTF-8 string (may be invalid)
1081 nbytes........strlen(text). (This is needed to completely emulate
1085 the number of u2s needed to hold this string in UTF-16 encoding.
1086 There is _no_ terminating zero included in this count.
1088 *******************************************************************************/
1090 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1091 register const unsigned char *t;
1094 register const unsigned char *tlimit;
1102 assert(nbytes >= 0);
1105 t = (const unsigned char *) text;
1106 tlimit = t + nbytes;
1108 /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1114 /* highest bit set, non-ASCII character */
1116 if ((byte & 0xe0) == 0xc0) {
1117 /* 2-byte: should be 110..... 10...... ? */
1119 if ((*t++ & 0xc0) == 0x80)
1120 ; /* valid 2-byte */
1124 else if ((byte & 0xf0) == 0xe0) {
1125 /* 3-byte: should be 1110.... 10...... 10...... */
1129 return len + 1; /* invalid, stop here */
1131 if ((*t++ & 0xc0) == 0x80) {
1132 if ((*t++ & 0xc0) == 0x80)
1133 ; /* valid 3-byte */
1140 else if ((byte & 0xf8) == 0xf0) {
1141 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1145 return len + 1; /* invalid, stop here */
1147 if (((byte1 = *t++) & 0xc0) == 0x80) {
1148 if (((byte2 = *t++) & 0xc0) == 0x80) {
1149 if (((byte3 = *t++) & 0xc0) == 0x80) {
1150 /* valid 4-byte UTF-8? */
1151 value = ((byte & 0x07) << 18)
1152 | ((byte1 & 0x3f) << 12)
1153 | ((byte2 & 0x3f) << 6)
1154 | ((byte3 & 0x3f) );
1156 if (value > 0x10FFFF)
1158 else if (value > 0xFFFF)
1159 len += 1; /* we need surrogates */
1161 ; /* 16bit suffice */
1172 else if ((byte & 0xfc) == 0xf8) {
1173 /* invalid 5-byte */
1175 return len + 1; /* invalid, stop here */
1178 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1181 else if ((byte & 0xfe) == 0xfc) {
1182 /* invalid 6-byte */
1184 return len + 1; /* invalid, stop here */
1187 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1199 /* ASCII character, common case */
1209 /* utf8_safe_convert_to_u2s ****************************************************
1211 Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1212 (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1213 Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1215 This function is safe even for invalid UTF-8 strings.
1218 text..........zero-terminated(!) UTF-8 string (may be invalid)
1220 nbytes........strlen(text). (This is needed to completely emulate
1222 buffer........a preallocated array of u2s to receive the decoded
1223 string. Use utf8_safe_number_of_u2s to get the
1224 required number of u2s for allocating this.
1226 *******************************************************************************/
/* Code unit stored in place of every malformed UTF-8 sequence (U+fffd). */
1228 #define UNICODE_REPLACEMENT 0xfffd
/* Decodes the nbytes bytes starting at text into UTF-16 code units that are
   stored through buffer. Each malformed or truncated sequence handled by the
   branches shown here stores UNICODE_REPLACEMENT instead of a decoded value.

   NOTE(review): this listing omits lines that repeat earlier in the file
   (local declarations, braces, else branches, loop header), so the control
   flow between the visible lines is incomplete -- read the full source
   before changing anything here. */
1230 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1231 register const unsigned char *t;
1233 register const unsigned char *tlimit;
1241 assert(nbytes >= 0);
/* bytes are examined as unsigned so the bit-mask tests below are well defined */
1243 t = (const unsigned char *) text;
/* one past the last input byte; the truncation checks below compare against it */
1244 tlimit = t + nbytes;
1246 /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1252 /* highest bit set, non-ASCII character */
1254 if ((byte & 0xe0) == 0xc0) {
1255 /* 2-byte: should be 110..... 10...... */
/* no tlimit check is visible for this form; presumably it relies on the
   zero terminator required of text -- TODO confirm */
1257 if (((byte1 = *t++) & 0xc0) == 0x80) {
1258 /* valid 2-byte UTF-8 */
1259 *buffer++ = ((byte & 0x1f) << 6)
1260 | ((byte1 & 0x3f) );
1263 *buffer++ = UNICODE_REPLACEMENT;
1267 else if ((byte & 0xf0) == 0xe0) {
1268 /* 3-byte: should be 1110.... 10...... 10...... */
/* fewer than two bytes remain before tlimit: the sequence is truncated */
1270 if (t + 2 > tlimit) {
1271 *buffer++ = UNICODE_REPLACEMENT;
1275 if (((byte1 = *t++) & 0xc0) == 0x80) {
1276 if (((byte2 = *t++) & 0xc0) == 0x80) {
1277 /* valid 3-byte UTF-8 */
1278 *buffer++ = ((byte & 0x0f) << 12)
1279 | ((byte1 & 0x3f) << 6)
1280 | ((byte2 & 0x3f) );
1283 *buffer++ = UNICODE_REPLACEMENT;
1288 *buffer++ = UNICODE_REPLACEMENT;
1292 else if ((byte & 0xf8) == 0xf0) {
1293 /* 4-byte: should be 11110... 10...... 10...... 10...... */
/* fewer than three bytes remain before tlimit: the sequence is truncated */
1295 if (t + 3 > tlimit) {
1296 *buffer++ = UNICODE_REPLACEMENT;
1300 if (((byte1 = *t++) & 0xc0) == 0x80) {
1301 if (((byte2 = *t++) & 0xc0) == 0x80) {
1302 if (((byte3 = *t++) & 0xc0) == 0x80) {
1303 /* valid 4-byte UTF-8? */
1304 value = ((byte & 0x07) << 18)
1305 | ((byte1 & 0x3f) << 12)
1306 | ((byte2 & 0x3f) << 6)
1307 | ((byte3 & 0x3f) );
/* a well-formed bit pattern can still decode to a value above 0x10FFFF */
1309 if (value > 0x10FFFF) {
1310 *buffer++ = UNICODE_REPLACEMENT;
1312 else if (value > 0xFFFF) {
1313 /* we need surrogates */
/* (value >> 10) - 0x40 equals (value - 0x10000) >> 10: the upper ten bits
   of the offset go into the first code unit, the lower ten into the second */
1314 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1315 *buffer++ = 0xdc00 | (value & 0x03ff);
1318 *buffer++ = value; /* 16bit suffice */
1321 *buffer++ = UNICODE_REPLACEMENT;
1326 *buffer++ = UNICODE_REPLACEMENT;
1331 *buffer++ = UNICODE_REPLACEMENT;
/* lead byte 111110xx (5-byte form): only replacement stores are visible in
   this branch; the skip loop steps over at most `skip` continuation bytes */
1335 else if ((byte & 0xfc) == 0xf8) {
1336 if (t + 4 > tlimit) {
1337 *buffer++ = UNICODE_REPLACEMENT;
1342 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1344 *buffer++ = UNICODE_REPLACEMENT;
/* lead byte 1111110x (6-byte form): handled like the 5-byte form above */
1346 else if ((byte & 0xfe) == 0xfc) {
1347 if (t + 5 > tlimit) {
1348 *buffer++ = UNICODE_REPLACEMENT;
1353 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1355 *buffer++ = UNICODE_REPLACEMENT;
/* presumably the fall-through case for any other byte with the high bit
   set -- the enclosing else is not visible here */
1358 *buffer++ = UNICODE_REPLACEMENT;
1366 /* ASCII character, common case */
1374 /* u2_utflength ****************************************************************
1376 Returns the utf length in bytes of a u2 array.
1378 *******************************************************************************/
/* Walks the u2_length code units of text and accumulates in result_len the
   number of bytes their utf encoding needs.
   NOTE(review): the statements that read ch, add to result_len and return
   the total are not visible in this listing. */
1380 u4 u2_utflength(u2 *text, u4 u2_length)
1382 u4 result_len = 0; /* utf length in bytes */
1383 u2 ch; /* current unicode character */
1386 for (len = 0; len < u2_length; len++) {
1387 /* next unicode character */
1390 /* determine bytes required to store unicode character as utf */
/* ch == 0 deliberately fails this test and falls through to the
   ch < 0x800 test below */
1391 if (ch && (ch < 0x80))
1393 else if (ch < 0x800)
1403 /* utf_copy ********************************************************************
1405 Copy the given utf string byte-for-byte to a buffer.
1408 buffer.......the buffer
1409 u............the utf string
1411 *******************************************************************************/
1413 void utf_copy(char *buffer, utf *u)
1415 /* our utf strings are zero-terminated (done by utf_new) */
1416 MCOPY(buffer, u->text, char, u->blength + 1);
1420 /* utf_cat *********************************************************************
1422 Append the given utf string byte-for-byte to a buffer.
1425 buffer.......the buffer
1426 u............the utf string
1428 *******************************************************************************/
1430 void utf_cat(char *buffer, utf *u)
1432 /* our utf strings are zero-terminated (done by utf_new) */
1433 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1437 /* utf_copy_classname **********************************************************
1439 Copy the given utf classname byte-for-byte to a buffer.
1440 '/' is replaced by '.'
1443 buffer.......the buffer
1444 u............the utf string
1446 *******************************************************************************/
/* Copies the bytes of u to buffer one at a time.
   NOTE(review): the declarations, the initialisation of the source and
   destination pointers and the loop body are not visible in this listing;
   per the header comment the body replaces '/' by '.'. */
1448 void utf_copy_classname(char *buffer, utf *u)
/* the + 1 extends the copied range over the terminating zero byte */
1457 endptr = UTF_END(u) + 1; /* utfs are zero-terminated by utf_new */
1459 while (srcptr != endptr) {
1468 /* utf_cat_classname ***********************************************************
1470 Append the given utf classname byte-for-byte to a buffer.
1471 '/' is replaced by '.'
1474 buffer.......the buffer
1475 u............the utf string
1477 *******************************************************************************/
1479 void utf_cat_classname(char *buffer, utf *u)
1481 utf_copy_classname(buffer + strlen(buffer), u);
1484 /* utf_display_printable_ascii *************************************************
1486 Write utf symbol to stdout (for debugging purposes).
1487 Non-printable and non-ASCII characters are printed as '?'.
1489 *******************************************************************************/
/* Walks the characters of u, classifying each one as printable or not.
   NOTE(review): the null check, the initialisation of utf_ptr and the
   output statements are not visible in this listing. */
1491 void utf_display_printable_ascii(utf *u)
1493 char *endpos; /* points behind utf string */
1494 char *utf_ptr; /* current position in utf text */
1502 endpos = UTF_END(u);
1505 while (utf_ptr < endpos) {
1506 /* read next unicode character */
/* utf_ptr is passed by address; presumably utf_nextu2 advances it past the
   character it decoded -- the loop depends on that to terminate */
1508 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the upper bound 127 admits DEL, which is not printable */
1510 if ((c >= 32) && (c <= 127))
1520 /* utf_display_printable_ascii_classname ***************************************
1522 Write utf symbol to stdout with `/' converted to `.' (for debugging
1524 Non-printable and non-ASCII characters are printed as '?'.
1526 *******************************************************************************/
/* Classname variant of utf_display_printable_ascii.
   NOTE(review): the null check, the initialisation of utf_ptr, the
   '/' -> '.' substitution and the output statements are not visible in
   this listing. */
1528 void utf_display_printable_ascii_classname(utf *u)
1530 char *endpos; /* points behind utf string */
1531 char *utf_ptr; /* current position in utf text */
1539 endpos = UTF_END(u);
1542 while (utf_ptr < endpos) {
1543 /* read next unicode character */
/* utf_ptr is passed by address; presumably utf_nextu2 advances it past the
   character it decoded -- the loop depends on that to terminate */
1545 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the upper bound 127 admits DEL, which is not printable */
1550 if ((c >= 32) && (c <= 127))
1560 /* utf_sprint_convert_to_latin1 ************************************************
1562 Write utf symbol into c-string (for debugging purposes).
1563 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1566 *******************************************************************************/
/* Writes the characters of u into buffer, one char per decoded character.
   No buffer size is passed in, so the caller must provide a buffer that is
   large enough for every character plus the terminator.
   NOTE(review): the condition guarding the "NULL" case, the initialisation
   of utf_ptr and the terminating store are not visible in this listing. */
1568 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1570 char *endpos; /* points behind utf string */
1571 char *utf_ptr; /* current position in utf text */
/* NOTE(review): pos is a u2; if that is a 16-bit type the index wraps for
   very long strings -- confirm against the typedef */
1572 u2 pos = 0; /* position in c-string */
/* the literal text "NULL" is written instead of any string contents;
   presumably this is the u == NULL case -- the test itself is elided */
1575 strcpy(buffer, "NULL");
1579 endpos = UTF_END(u);
1582 while (utf_ptr < endpos)
1583 /* copy next unicode character */
/* the u2 returned by utf_nextu2 is narrowed to char by this assignment */
1584 buffer[pos++] = utf_nextu2(&utf_ptr);
1586 /* terminate string */
1591 /* utf_sprint_convert_to_latin1_classname **************************************
1593 Write utf symbol into c-string with `/' converted to `.' (for debugging
1595 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1598 *******************************************************************************/
/* Classname variant of utf_sprint_convert_to_latin1: each decoded '/' is
   turned into '.' before it is stored. No buffer size is passed in, so the
   caller must provide a buffer that is large enough.
   NOTE(review): the condition guarding the "NULL" case, the initialisation
   of utf_ptr, the store into buffer and the terminating store are not
   visible in this listing. */
1600 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1602 char *endpos; /* points behind utf string */
1603 char *utf_ptr; /* current position in utf text */
/* NOTE(review): pos is a u2; if that is a 16-bit type the index wraps for
   very long strings -- confirm against the typedef */
1604 u2 pos = 0; /* position in c-string */
/* the literal text "NULL" is written instead of any string contents;
   presumably this is the u == NULL case -- the test itself is elided */
1607 strcpy(buffer, "NULL");
1611 endpos = UTF_END(u);
1614 while (utf_ptr < endpos) {
1615 /* copy next unicode character */
1616 u2 c = utf_nextu2(&utf_ptr);
/* package separator of the internal class name form becomes a dot */
1617 if (c == '/') c = '.';
1621 /* terminate string */
1626 /* utf_strcat_convert_to_latin1 ************************************************
1628 Like libc strcat, but uses an utf8 string.
1629 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1632 *******************************************************************************/
1634 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1636 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1640 /* utf_strcat_convert_to_latin1_classname **************************************
1642 Like libc strcat, but uses an utf8 string.
1643 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1646 *******************************************************************************/
1648 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1650 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1654 /* utf_fprint_printable_ascii **************************************************
1656 Write utf symbol into file.
1657 Non-printable and non-ASCII characters are printed as '?'.
1659 *******************************************************************************/
/* Writes the characters of u to file: characters in the range 32..127 are
   written as they are, every other character is written as '?'.
   NOTE(review): the null check and the initialisation of utf_ptr are not
   visible in this listing. */
1661 void utf_fprint_printable_ascii(FILE *file, utf *u)
1663 char *endpos; /* points behind utf string */
1664 char *utf_ptr; /* current position in utf text */
1669 endpos = UTF_END(u);
1672 while (utf_ptr < endpos) {
1673 /* read next unicode character */
1674 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the upper bound 127 lets DEL through unchanged although it
   is not a printable character */
1676 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1677 else fprintf(file, "?");
1682 /* utf_fprint_printable_ascii_classname ****************************************
1684 Write utf symbol into file with `/' converted to `.'.
1685 Non-printable and non-ASCII characters are printed as '?'.
1687 *******************************************************************************/
/* Writes the characters of u to file with '/' turned into '.': characters
   in the range 32..127 are written as they are, every other character is
   written as '?'.
   NOTE(review): the null check and the initialisation of utf_ptr are not
   visible in this listing. */
1689 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1691 char *endpos; /* points behind utf string */
1692 char *utf_ptr; /* current position in utf text */
1697 endpos = UTF_END(u);
1700 while (utf_ptr < endpos) {
1701 /* read next unicode character */
1702 u2 c = utf_nextu2(&utf_ptr);
/* substitution happens before the printable test, so the dot is what gets
   classified and written */
1703 if (c == '/') c = '.';
/* NOTE(review): the upper bound 127 lets DEL through unchanged although it
   is not a printable character */
1705 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1706 else fprintf(file, "?");
1711 /* is_valid_utf ****************************************************************
1713 Return true if the given string is a valid UTF-8 string.
1715 utf_ptr...points to first character
1716 end_pos...points after last character
1718 *******************************************************************************/
1720 /* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */
/* Validates the bytes between utf_ptr and end_pos against the UTF-8 subset
   accepted here: no zero byte, and no sequence with more than two
   continuation bytes.
   NOTE(review): the local declarations, the loop header, the read of c and
   several return statements are not visible in this listing. */
1722 bool is_valid_utf(char *utf_ptr, char *end_pos)
/* an inverted range is rejected outright */
1729 if (end_pos < utf_ptr) return false;
/* number of bytes still to be examined */
1730 bytes = end_pos - utf_ptr;
1734 if (!c) return false; /* 0x00 is not allowed */
1735 if ((c & 0x80) == 0) continue; /* ASCII */
/* len = number of continuation bytes announced by the lead byte c */
1737 if ((c & 0xe0) == 0xc0) len = 1; /* 110x xxxx */
1738 else if ((c & 0xf0) == 0xe0) len = 2; /* 1110 xxxx */
1739 else if ((c & 0xf8) == 0xf0) len = 3; /* 1111 0xxx */
1740 else if ((c & 0xfc) == 0xf8) len = 4; /* 1111 10xx */
1741 else if ((c & 0xfe) == 0xfc) len = 5; /* 1111 110x */
1742 else return false; /* invalid leading byte */
/* the longer forms are recognised above only to be rejected here */
1744 if (len > 2) return false; /* Java limitation */
/* keep the payload bits of the lead byte: mask 0x1f for len 1, 0x0f for len 2 */
1746 v = (unsigned long)c & (0x3f >> len);
1748 if ((bytes -= len) < 0) return false; /* missing bytes */
/* consume exactly len continuation bytes, six payload bits each */
1750 for (i = len; i--; ) {
1752 if ((c & 0xc0) != 0x80) /* 10xx xxxx */
1754 v = (v << 6) | (c & 0x3f);
/* NOTE(review): the condition guarding this line is elided; presumably it
   applies when the decoded value v is zero, which would then be accepted
   only in its two-byte form -- confirm against the full source */
1758 if (len != 1) return false; /* Java special */
1761 /* Sun Java seems to allow overlong UTF-8 encodings */
1763 /* if (v < min_codepoint[len]) */
1764 /* XXX throw exception? */
1767 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
1768 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
1770 /* even these seem to be allowed */
1771 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
1778 /* is_valid_name ***************************************************************
1780 Return true if the given string may be used as a class/field/method
1781 name. (Currently this only disallows empty strings and control
1784 NOTE: The string is assumed to have passed is_valid_utf!
1786 utf_ptr...points to first character
1787 end_pos...points after last character
1789 *******************************************************************************/
/* Decides whether the bytes in [utf_ptr, end_pos) may be used as a
   class/field/method name. The name must not be empty, must not contain a
   byte below 0x20 and must not contain the two-byte sequence 0xc0 0x80
   (the encoded zero character).

   The string is assumed to have passed is_valid_utf. The check for the
   second byte of the zero encoding never looks at or beyond end_pos, so a
   range that ends in a lone 0xc0 does not cause a read outside the range.

   IN:
       utf_ptr...points to first character
       end_pos...points after last character

   RETURN VALUE:
       true.....the name is acceptable
       false....the name is empty or contains a forbidden character */
bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters */

		/* disallow zero; only inspect the following byte if it still
		   belongs to the name */
		if (c == 0xc0 && utf_ptr < end_pos
			&& (unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
1806 bool is_valid_name_utf(utf *u)
1808 return is_valid_name(u->text, UTF_END(u));
1812 /* utf_show ********************************************************************
1814 Writes the utf symbols in the utfhash to stdout and displays the
1815 number of external hash chains grouped according to the chainlength
1816 (for debugging purposes).
1818 *******************************************************************************/
/* Debug-only dump of the utf hashtable: lists the slots, then prints a
   histogram of chain lengths together with the maximum and average chain
   length.
   NOTE(review): the function signature, the chain-walking loops and several
   other statements are not visible in this listing. */
1820 #if !defined(NDEBUG)
1824 #define CHAIN_LIMIT 20 /* limit for separated enumeration */
/* chain_count[k] = number of slots whose chain has length k; the last
   element also collects every chain of length CHAIN_LIMIT or more */
1826 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1827 u4 max_chainlength = 0; /* maximum length of the chains */
1828 u4 sum_chainlength = 0; /* sum of the chainlengths */
1829 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1832 printf("UTF-HASH:\n");
1834 /* show element of utf-hashtable */
1836 for (i = 0; i < hashtable_utf->size; i++) {
1837 utf *u = hashtable_utf->ptr[i];
1840 printf("SLOT %d: ", (int) i);
1844 utf_display_printable_ascii(u);
1852 printf("UTF-HASH: %d slots for %d entries\n",
1853 (int) hashtable_utf->size, (int) hashtable_utf->entries );
/* presumably an early exit for an empty table, which would also keep the
   divisions by entries below from dividing by zero -- the statement that
   follows this test is elided */
1855 if (hashtable_utf->entries == 0)
1858 printf("chains:\n chainlength number of chains %% of utfstrings\n");
/* presumably clears chain_count -- the loop body is elided */
1860 for (i=0;i<CHAIN_LIMIT;i++)
1863 /* count numbers of hashchains according to their length */
1864 for (i=0; i<hashtable_utf->size; i++) {
1866 utf *u = (utf*) hashtable_utf->ptr[i];
1867 u4 chain_length = 0;
1869 /* determine chainlength */
1875 /* update sum of all chainlengths */
1876 sum_chainlength+=chain_length;
1878 /* determine the maximum length of the chains */
1879 if (chain_length>max_chainlength)
1880 max_chainlength = chain_length;
1882 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
/* NOTE(review): the test is >= CHAIN_LIMIT, not >= CHAIN_LIMIT-1 as the
   comments say; chains of exactly CHAIN_LIMIT-1 entries are counted in the
   last bucket below but are not added to beyond_limit */
1883 if (chain_length>=CHAIN_LIMIT) {
1884 beyond_limit+=chain_length;
1885 chain_length=CHAIN_LIMIT-1;
1888 /* update number of hashchains of current length */
1889 chain_count[chain_length]++;
1892 /* display results */
/* chain_count[i]*i is the number of utf strings held in chains of length i */
1893 for (i=1;i<CHAIN_LIMIT-1;i++)
1894 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1896 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1899 printf("max. chainlength:%5d\n",max_chainlength);
1901 /* avg. chainlength = sum of chainlengths / number of chains */
/* chain_count[0] counts the slots with chain length 0, so the divisor is
   the number of slots that hold at least one entry */
1902 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1904 #endif /* !defined(NDEBUG) */
1908 * These are local overrides for various environment variables in Emacs.
1909 * Please do not remove this and leave it at the end of the file, where
1910 * Emacs will automagically detect them.
1911 * ---------------------------------------------------------------------
1914 * indent-tabs-mode: t
1918 * vim:noexpandtab:sw=4:ts=4: