1 /* src/vm/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006, 2007, 2008
4 CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
6 This file is part of CACAO.
8 This program is free software; you can redistribute it and/or
9 modify it under the terms of the GNU General Public License as
10 published by the Free Software Foundation; either version 2, or (at
11 your option) any later version.
13 This program is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include "mm/memory.hpp"
35 #include "threads/mutex.hpp"
37 #include "toolbox/hashtable.h"
39 #include "vm/exceptions.hpp"
40 #include "vm/options.h"
42 #if defined(ENABLE_STATISTICS)
43 # include "vm/statistics.h"
49 /* global variables ***********************************************************/
51 /* hashsize must be power of 2 */
53 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
55 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
58 /* utf-symbols for pointer comparison of frequently used strings **************/
60 utf *utf_java_lang_Object;
62 utf *utf_java_lang_Class;
63 utf *utf_java_lang_ClassLoader;
64 utf *utf_java_lang_ClassLoader_NativeLibrary;
65 utf *utf_java_lang_Cloneable;
66 utf *utf_java_lang_SecurityManager;
67 utf *utf_java_lang_String;
68 utf *utf_java_lang_ThreadGroup;
69 utf *utf_java_lang_ref_SoftReference;
70 utf *utf_java_lang_ref_WeakReference;
71 utf *utf_java_lang_ref_PhantomReference;
72 utf *utf_java_io_Serializable;
74 utf *utf_java_lang_Throwable;
75 utf *utf_java_lang_Error;
77 utf *utf_java_lang_AbstractMethodError;
78 utf *utf_java_lang_ClassCircularityError;
79 utf *utf_java_lang_ClassFormatError;
80 utf *utf_java_lang_ExceptionInInitializerError;
81 utf *utf_java_lang_IncompatibleClassChangeError;
82 utf *utf_java_lang_InstantiationError;
83 utf *utf_java_lang_InternalError;
84 utf *utf_java_lang_LinkageError;
85 utf *utf_java_lang_NoClassDefFoundError;
86 utf *utf_java_lang_NoSuchFieldError;
87 utf *utf_java_lang_NoSuchMethodError;
88 utf *utf_java_lang_OutOfMemoryError;
89 utf *utf_java_lang_UnsatisfiedLinkError;
90 utf *utf_java_lang_UnsupportedClassVersionError;
91 utf *utf_java_lang_VerifyError;
92 utf *utf_java_lang_VirtualMachineError;
94 utf *utf_java_lang_Exception;
96 utf *utf_java_lang_ArithmeticException;
97 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
98 utf *utf_java_lang_ArrayStoreException;
99 utf *utf_java_lang_ClassCastException;
100 utf *utf_java_lang_ClassNotFoundException;
101 utf *utf_java_lang_CloneNotSupportedException;
102 utf *utf_java_lang_IllegalAccessException;
103 utf *utf_java_lang_IllegalArgumentException;
104 utf *utf_java_lang_IllegalMonitorStateException;
105 utf *utf_java_lang_InstantiationException;
106 utf *utf_java_lang_InterruptedException;
107 utf *utf_java_lang_NegativeArraySizeException;
108 utf *utf_java_lang_NullPointerException;
109 utf *utf_java_lang_RuntimeException;
110 utf *utf_java_lang_StringIndexOutOfBoundsException;
112 utf *utf_java_lang_reflect_InvocationTargetException;
114 utf *utf_java_security_PrivilegedActionException;
116 #if defined(ENABLE_JAVASE)
117 utf* utf_java_lang_Void;
120 utf* utf_java_lang_Boolean;
121 utf* utf_java_lang_Byte;
122 utf* utf_java_lang_Character;
123 utf* utf_java_lang_Short;
124 utf* utf_java_lang_Integer;
125 utf* utf_java_lang_Long;
126 utf* utf_java_lang_Float;
127 utf* utf_java_lang_Double;
129 #if defined(ENABLE_JAVASE)
130 utf *utf_java_lang_StackTraceElement;
131 utf *utf_java_lang_reflect_Constructor;
132 utf *utf_java_lang_reflect_Field;
133 utf *utf_java_lang_reflect_Method;
135 # if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
136 utf *utf_java_lang_reflect_VMConstructor;
137 utf *utf_java_lang_reflect_VMField;
138 utf *utf_java_lang_reflect_VMMethod;
141 utf *utf_java_util_Vector;
144 utf *utf_InnerClasses; /* InnerClasses */
145 utf *utf_ConstantValue; /* ConstantValue */
146 utf *utf_Code; /* Code */
147 utf *utf_Exceptions; /* Exceptions */
148 utf *utf_LineNumberTable; /* LineNumberTable */
149 utf *utf_SourceFile; /* SourceFile */
151 #if defined(ENABLE_JAVASE)
152 utf *utf_EnclosingMethod;
154 utf *utf_StackMapTable;
156 # if defined(ENABLE_JVMTI)
157 utf *utf_LocalVariableTable;
160 # if defined(ENABLE_ANNOTATIONS)
161 utf *utf_RuntimeVisibleAnnotations; /* RuntimeVisibleAnnotations */
162 utf *utf_RuntimeInvisibleAnnotations; /* RuntimeInvisibleAnnotations */
163 utf *utf_RuntimeVisibleParameterAnnotations; /* RuntimeVisibleParameterAnnotations */
164 utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
165 utf *utf_AnnotationDefault; /* AnnotationDefault */
169 utf *utf_init; /* <init> */
170 utf *utf_clinit; /* <clinit> */
171 utf *utf_clone; /* clone */
172 utf *utf_finalize; /* finalize */
175 utf *utf_run; /* run */
181 utf *utf_removeThread;
184 utf *utf_uncaughtException;
187 utf *utf_fillInStackTrace;
189 utf *utf_getSystemClassLoader;
192 utf *utf_loadClassInternal;
193 utf *utf_printStackTrace;
195 utf *utf_division_by_zero;
206 utf *utf_void__void; /* ()V */
207 utf *utf_boolean__void; /* (Z)V */
208 utf *utf_byte__void; /* (B)V */
209 utf *utf_char__void; /* (C)V */
210 utf *utf_short__void; /* (S)V */
211 utf *utf_int__void; /* (I)V */
212 utf *utf_long__void; /* (J)V */
213 utf *utf_float__void; /* (F)V */
214 utf *utf_double__void; /* (D)V */
216 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
217 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
218 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
219 utf *utf_java_lang_ClassLoader_java_lang_String__J; /* (Ljava/lang/ClassLoader;Ljava/lang/String;)J */
220 utf *utf_java_lang_Exception__V; /* (Ljava/lang/Exception;)V */
221 utf *utf_java_lang_Object__java_lang_Object; /* (Ljava/lang/Object;)Ljava/lang/Object; */
222 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
223 utf *utf_java_lang_String__java_lang_Class; /* (Ljava/lang/String;)Ljava/lang/Class; */
224 utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */
225 utf *utf_java_lang_Thread_java_lang_Throwable__V; /* (Ljava/lang/Thread;Ljava/lang/Throwable;)V */
226 utf *utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V; /* (Ljava/lang/ThreadGroup;Ljava/lang/String;)V */
227 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
228 utf *utf_java_lang_Throwable__java_lang_Throwable; /* (Ljava/lang/Throwable;)Ljava/lang/Throwable; */
230 utf *utf_not_named_yet; /* special name for unnamed classes: "\t<not_named_yet>" */
232 utf *array_packagename; /* "\t<the array package>" */
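235 /* utf8_init *******************************************************************
237 Initializes the utf8 subsystem: creates the utf8 hashtable and the utf-symbols used for pointer comparison of frequently used strings.
239 *******************************************************************************/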
243 TRACESUBSYSTEMINITIALIZATION("utf8_init");
245 /* create utf8 hashtable */
247 hashtable_utf = NEW(hashtable);
249 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
251 #if defined(ENABLE_STATISTICS)
253 count_utf_len += sizeof(utf*) * hashtable_utf->size;
256 /* create utf-symbols for pointer comparison of frequently used strings */
258 utf_java_lang_Object = utf_new_char("java/lang/Object");
260 utf_java_lang_Class = utf_new_char("java/lang/Class");
261 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
262 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
263 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
264 utf_java_lang_String = utf_new_char("java/lang/String");
265 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
267 utf_java_lang_ClassLoader_NativeLibrary =
268 utf_new_char("java/lang/ClassLoader$NativeLibrary");
270 utf_java_lang_ref_SoftReference =
271 utf_new_char("java/lang/ref/SoftReference");
273 utf_java_lang_ref_WeakReference =
274 utf_new_char("java/lang/ref/WeakReference");
276 utf_java_lang_ref_PhantomReference =
277 utf_new_char("java/lang/ref/PhantomReference");
279 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
281 utf_java_lang_Throwable = utf_new_char("java/lang/Throwable");
282 utf_java_lang_Error = utf_new_char("java/lang/Error");
284 utf_java_lang_ClassCircularityError =
285 utf_new_char("java/lang/ClassCircularityError");
287 utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
289 utf_java_lang_ExceptionInInitializerError =
290 utf_new_char("java/lang/ExceptionInInitializerError");
292 utf_java_lang_IncompatibleClassChangeError =
293 utf_new_char("java/lang/IncompatibleClassChangeError");
295 utf_java_lang_InstantiationError =
296 utf_new_char("java/lang/InstantiationError");
298 utf_java_lang_InternalError = utf_new_char("java/lang/InternalError");
299 utf_java_lang_LinkageError = utf_new_char("java/lang/LinkageError");
301 utf_java_lang_NoClassDefFoundError =
302 utf_new_char("java/lang/NoClassDefFoundError");
304 utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
306 utf_java_lang_UnsatisfiedLinkError =
307 utf_new_char("java/lang/UnsatisfiedLinkError");
309 utf_java_lang_UnsupportedClassVersionError =
310 utf_new_char("java/lang/UnsupportedClassVersionError");
312 utf_java_lang_VerifyError = utf_new_char("java/lang/VerifyError");
314 utf_java_lang_VirtualMachineError =
315 utf_new_char("java/lang/VirtualMachineError");
317 #if defined(ENABLE_JAVASE)
318 utf_java_lang_AbstractMethodError =
319 utf_new_char("java/lang/AbstractMethodError");
321 utf_java_lang_NoSuchFieldError =
322 utf_new_char("java/lang/NoSuchFieldError");
324 utf_java_lang_NoSuchMethodError =
325 utf_new_char("java/lang/NoSuchMethodError");
328 utf_java_lang_Exception = utf_new_char("java/lang/Exception");
330 utf_java_lang_ArithmeticException =
331 utf_new_char("java/lang/ArithmeticException");
333 utf_java_lang_ArrayIndexOutOfBoundsException =
334 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
336 utf_java_lang_ArrayStoreException =
337 utf_new_char("java/lang/ArrayStoreException");
339 utf_java_lang_ClassCastException =
340 utf_new_char("java/lang/ClassCastException");
342 utf_java_lang_ClassNotFoundException =
343 utf_new_char("java/lang/ClassNotFoundException");
345 utf_java_lang_CloneNotSupportedException =
346 utf_new_char("java/lang/CloneNotSupportedException");
348 utf_java_lang_IllegalAccessException =
349 utf_new_char("java/lang/IllegalAccessException");
351 utf_java_lang_IllegalArgumentException =
352 utf_new_char("java/lang/IllegalArgumentException");
354 utf_java_lang_IllegalMonitorStateException =
355 utf_new_char("java/lang/IllegalMonitorStateException");
357 utf_java_lang_InstantiationException =
358 utf_new_char("java/lang/InstantiationException");
360 utf_java_lang_InterruptedException =
361 utf_new_char("java/lang/InterruptedException");
363 utf_java_lang_NegativeArraySizeException =
364 utf_new_char("java/lang/NegativeArraySizeException");
366 utf_java_lang_NullPointerException =
367 utf_new_char("java/lang/NullPointerException");
369 utf_java_lang_RuntimeException =
370 utf_new_char("java/lang/RuntimeException");
372 utf_java_lang_StringIndexOutOfBoundsException =
373 utf_new_char("java/lang/StringIndexOutOfBoundsException");
375 utf_java_lang_reflect_InvocationTargetException =
376 utf_new_char("java/lang/reflect/InvocationTargetException");
378 utf_java_security_PrivilegedActionException =
379 utf_new_char("java/security/PrivilegedActionException");
381 #if defined(ENABLE_JAVASE)
382 utf_java_lang_Void = utf_new_char("java/lang/Void");
385 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
386 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
387 utf_java_lang_Character = utf_new_char("java/lang/Character");
388 utf_java_lang_Short = utf_new_char("java/lang/Short");
389 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
390 utf_java_lang_Long = utf_new_char("java/lang/Long");
391 utf_java_lang_Float = utf_new_char("java/lang/Float");
392 utf_java_lang_Double = utf_new_char("java/lang/Double");
394 #if defined(ENABLE_JAVASE)
395 utf_java_lang_StackTraceElement =
396 utf_new_char("java/lang/StackTraceElement");
398 utf_java_lang_reflect_Constructor =
399 utf_new_char("java/lang/reflect/Constructor");
401 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
402 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
404 # if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
405 utf_java_lang_reflect_VMConstructor = utf_new_char("java/lang/reflect/VMConstructor");
406 utf_java_lang_reflect_VMField = utf_new_char("java/lang/reflect/VMField");
407 utf_java_lang_reflect_VMMethod = utf_new_char("java/lang/reflect/VMMethod");
410 utf_java_util_Vector = utf_new_char("java/util/Vector");
413 utf_InnerClasses = utf_new_char("InnerClasses");
414 utf_ConstantValue = utf_new_char("ConstantValue");
415 utf_Code = utf_new_char("Code");
416 utf_Exceptions = utf_new_char("Exceptions");
417 utf_LineNumberTable = utf_new_char("LineNumberTable");
418 utf_SourceFile = utf_new_char("SourceFile");
420 #if defined(ENABLE_JAVASE)
421 utf_EnclosingMethod = utf_new_char("EnclosingMethod");
422 utf_Signature = utf_new_char("Signature");
423 utf_StackMapTable = utf_new_char("StackMapTable");
425 # if defined(ENABLE_JVMTI)
426 utf_LocalVariableTable = utf_new_char("LocalVariableTable");
429 # if defined(ENABLE_ANNOTATIONS)
430 utf_RuntimeVisibleAnnotations = utf_new_char("RuntimeVisibleAnnotations");
431 utf_RuntimeInvisibleAnnotations = utf_new_char("RuntimeInvisibleAnnotations");
432 utf_RuntimeVisibleParameterAnnotations = utf_new_char("RuntimeVisibleParameterAnnotations");
433 utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
434 utf_AnnotationDefault = utf_new_char("AnnotationDefault");
438 utf_init = utf_new_char("<init>");
439 utf_clinit = utf_new_char("<clinit>");
440 utf_clone = utf_new_char("clone");
441 utf_finalize = utf_new_char("finalize");
442 utf_invoke = utf_new_char("invoke");
443 utf_main = utf_new_char("main");
444 utf_run = utf_new_char("run");
446 utf_add = utf_new_char("add");
447 utf_dispatch = utf_new_char("dispatch");
448 utf_remove = utf_new_char("remove");
449 utf_addThread = utf_new_char("addThread");
450 utf_removeThread = utf_new_char("removeThread");
451 utf_put = utf_new_char("put");
452 utf_get = utf_new_char("get");
453 utf_uncaughtException = utf_new_char("uncaughtException");
454 utf_value = utf_new_char("value");
456 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
457 utf_findNative = utf_new_char("findNative");
458 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
459 utf_initCause = utf_new_char("initCause");
460 utf_loadClass = utf_new_char("loadClass");
461 utf_loadClassInternal = utf_new_char("loadClassInternal");
462 utf_printStackTrace = utf_new_char("printStackTrace");
464 utf_division_by_zero = utf_new_char("/ by zero");
466 utf_Z = utf_new_char("Z");
467 utf_B = utf_new_char("B");
468 utf_C = utf_new_char("C");
469 utf_S = utf_new_char("S");
470 utf_I = utf_new_char("I");
471 utf_J = utf_new_char("J");
472 utf_F = utf_new_char("F");
473 utf_D = utf_new_char("D");
475 utf_void__void = utf_new_char("()V");
476 utf_boolean__void = utf_new_char("(Z)V");
477 utf_byte__void = utf_new_char("(B)V");
478 utf_char__void = utf_new_char("(C)V");
479 utf_short__void = utf_new_char("(S)V");
480 utf_int__void = utf_new_char("(I)V");
481 utf_long__void = utf_new_char("(J)V");
482 utf_float__void = utf_new_char("(F)V");
483 utf_double__void = utf_new_char("(D)V");
484 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
485 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
487 utf_void__java_lang_ClassLoader =
488 utf_new_char("()Ljava/lang/ClassLoader;");
490 utf_java_lang_ClassLoader_java_lang_String__J =
491 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
493 utf_java_lang_Exception__V = utf_new_char("(Ljava/lang/Exception;)V");
495 utf_java_lang_Object__java_lang_Object =
496 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
498 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
500 utf_java_lang_String__java_lang_Class =
501 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
503 utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V");
505 utf_java_lang_Thread_java_lang_Throwable__V =
506 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
508 utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V =
509 utf_new_char("(Ljava/lang/ThreadGroup;Ljava/lang/String;)V");
511 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
513 utf_java_lang_Throwable__java_lang_Throwable =
514 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
516 utf_null = utf_new_char("null");
517 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
518 array_packagename = utf_new_char("\t<the array package>");
522 /* utf_hashkey *****************************************************************
524 The hashkey is computed from the utf-text by using up to 8
525 characters. For utf-symbols longer than 15 characters 3 characters
526 are taken from the beginning and the end, 2 characters are taken
529 *******************************************************************************/
/* Helper macros for utf_hashkey. Both expand to code that uses a variable
   named `text` at the point of use; nbs() additionally MODIFIES it.
   NOTE(review): `val` is not parenthesized in the expansion; all uses visible
   in this file pass plain integer literals.
   NOTE(review): `text` is a `const char *` in utf_hashkey, so if plain char
   is signed, bytes >= 0x80 are sign-extended by the (u4) cast - confirm that
   this is intended before changing it, since it affects hash values. */
531 #define nbs(val) ((u4) *(++text) << val) /* pre-increment text, then shift the byte it now points at left by val */
532 #define fbs(val) ((u4) *( text) << val) /* shift the byte text currently points at left by val; text is not advanced */
/* Computes a hash value for the `length` bytes starting at `text`.
   Short strings are handled by the explicit cases below; the default branch
   re-positions `text` relative to start_pos to sample the beginning, middle
   and end of longer strings.
   NOTE(review): expressions such as `fbs(0) ^ nbs(3)` read `text` and
   pre-increment it (several times) within a single expression without an
   intervening sequence point, so the result formally depends on evaluation
   order - confirm against the C standard's sequencing rules. */
534 u4 utf_hashkey(const char *text, u4 length)
536 const char *start_pos = text; /* start of the utf text; kept because nbs() advances text */
540 case 0: /* empty string */
543 case 1: return fbs(0);
544 case 2: return fbs(0) ^ nbs(3);
545 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
546 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
547 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
548 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
549 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
550 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
557 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
566 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
575 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
587 return a ^ nbs(9) ^ nbs(10);
599 return a ^ nbs(9) ^ nbs(10);
610 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
621 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
623 default: /* 3 characters from beginning */
629 /* 2 characters from middle */
/* nbs() pre-increments, so the next byte read is at start_pos + length/2 + 1 */
630 text = start_pos + (length / 2);
635 /* 3 characters from end */
/* nbs() pre-increments, so the next byte read is at start_pos + length - 3 */
636 text = start_pos + length - 4;
641 return a ^ nbs(10) ^ nbs(11);
645 /* utf_full_hashkey ************************************************************
647 This function computes a hash value using all bytes in the string.
649 The algorithm is the "One-at-a-time" algorithm as published
650 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
652 *******************************************************************************/
/* Hashes the bytes of `text` (`length` is the byte count passed by the
   caller); see the header comment above for the algorithm reference. */
654 u4 utf_full_hashkey(const char *text, u4 length)
656 register const unsigned char *p = (const unsigned char *) text; /* view the input as unsigned bytes */
664 hash += (hash << 10); /* mixing step: fold the hash into itself shifted left by 10 */
668 hash ^= (hash >> 11); /* mixing step: xor in the hash shifted right by 11 */
669 hash += (hash << 15); /* mixing step: add the hash shifted left by 15 */
674 /* unicode_hashkey *************************************************************
676 Compute the hashkey of a unicode string.
678 *******************************************************************************/
/* Hashes a u2 array by reinterpreting it as a char buffer and delegating to
   utf_hashkey.
   NOTE(review): `len` is forwarded unchanged as utf_hashkey's length argument.
   If `len` counts u2 elements rather than bytes, only part of the array
   contributes to the hash, and the byte view of the u2 data presumably makes
   the result depend on host byte order - confirm this is intended. */
680 u4 unicode_hashkey(u2 *text, u2 len)
682 return utf_hashkey((char *) text, len);
686 /* utf_new *********************************************************************
688 Creates a new utf-symbol; the text of the symbol is passed as a
689 u1-array. The function searches the utf-hashtable for a utf-symbol
690 with this text. On success the existing element is returned; otherwise a new
691 hashtable element is created.
693 If the number of entries in the hashtable exceeds twice the size of
694 the hashtable slots a reorganization of the hashtable is done and
695 the utf symbols are copied to a new hashtable with doubled size.
697 *******************************************************************************/
/* Looks up the `length` bytes at `text` in the global utf hashtable
   (hashtable_utf) and, if no element with the same length and bytes exists,
   creates one and links it into the table. The whole operation runs between
   Mutex_lock and Mutex_unlock on hashtable_utf->mutex. */
699 utf *utf_new(const char *text, u2 length)
701 u4 key; /* hashkey computed from utf-text */
702 u4 slot; /* slot in hashtable */
703 utf *u; /* hashtable element */
706 Mutex_lock(hashtable_utf->mutex);
708 #if defined(ENABLE_STATISTICS)
713 key = utf_hashkey(text, length);
/* masking with (size - 1) selects a slot; this relies on the table size
   being a power of 2 (see the note at HASHTABLE_UTF_SIZE) */
714 slot = key & (hashtable_utf->size - 1);
715 u = hashtable_utf->ptr[slot];
717 /* search external hash chain for utf-symbol */
720 if (u->blength == length) {
721 /* compare text of hashtable elements */
723 for (i = 0; i < length; i++)
724 if (text[i] != u->text[i])
727 #if defined(ENABLE_STATISTICS)
729 count_utf_new_found++;
732 /* symbol found in hashtable */
734 Mutex_unlock(hashtable_utf->mutex);
740 u = u->hashlink; /* next element in external chain */
743 /* location in hashtable found, create new utf element */
747 u->blength = length; /* length in bytes of utfstring */
748 u->hashlink = hashtable_utf->ptr[slot]; /* new element points at the current chain head */
749 u->text = mem_alloc(length + 1);/* allocate memory for utf-text plus a terminating zero */
751 memcpy(u->text, text, length); /* copy utf-text */
752 u->text[length] = '\0'; /* the stored copy is always zero-terminated */
754 #if defined(ENABLE_STATISTICS)
756 count_utf_len += sizeof(utf) + length + 1;
759 hashtable_utf->ptr[slot] = u; /* insert symbol into table as the new chain head */
760 hashtable_utf->entries++; /* update number of entries */
/* grow the table once there are more than two entries per slot on average */
762 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
764 /* reorganization of hashtable, average length of the external
765 chains is approx. 2 */
767 hashtable *newhash; /* the new hashtable */
773 /* create new hashtable, double the size */
775 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
777 #if defined(ENABLE_STATISTICS)
779 count_utf_len += sizeof(utf*) * hashtable_utf->size;
782 /* transfer elements to new hashtable */
784 for (i = 0; i < hashtable_utf->size; i++) {
785 u = hashtable_utf->ptr[i];
/* re-hash each element, because the slot mask changes with the new size */
789 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
791 u->hashlink = (utf *) newhash->ptr[slot];
792 newhash->ptr[slot] = u;
794 /* follow link in external hash chain */
800 /* dispose old table */
802 hashtable_free(hashtable_utf);
804 hashtable_utf = newhash;
/* NOTE(review): after a resize hashtable_utf already points at newhash, so
   this unlocks via the new table's mutex field while the lock was taken via
   the old table. Presumably hashtable_resize carries the mutex over - confirm. */
807 Mutex_unlock(hashtable_utf->mutex);
813 /* utf_new_u2 ******************************************************************
815 Make utf symbol from u2 array, if isclassname is true '.' is
818 *******************************************************************************/
/* Encodes `unicode_length` u2 values starting at `unicode_pos` into a
   temporary byte buffer and interns the result through utf_new. When
   `isclassname` is set, '.' characters get special treatment (see the
   "convert classname" branch). */
820 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
822 char *buffer; /* memory buffer for unicode characters */
823 char *pos; /* pointer to current position in buffer */
824 u4 left; /* NOTE(review): checked before each write to pos, so this appears to track remaining buffer space - confirm */
825 u4 buflength; /* utf length in bytes of the u2 array */
826 utf *result; /* resulting utf-string */
829 /* determine utf length in bytes and allocate memory */
831 buflength = u2_utflength(unicode_pos, unicode_length);
832 buffer = MNEW(char, buflength);
837 for (i = 0; i++ < unicode_length; unicode_pos++) {
838 /* next unicode character */
/* one-byte form for 0x01..0x7F only; c == 0 fails this test and is handled
   by the (c < 0x800) branch below */
841 if ((c != 0) && (c < 0x80)) {
844 if ((int) left < 0) break;
845 /* convert classname */
846 if (isclassname && c == '.')
851 } else if (c < 0x800) {
853 unsigned char high = c >> 6; /* upper bits of the two-byte form */
854 unsigned char low = c & 0x3F; /* lower 6 bits */
856 if ((int) left < 0) break;
857 *pos++ = high | 0xC0; /* lead byte 110xxxxx */
863 char mid = (c >> 6) & 0x3F; /* middle 6 bits of the three-byte form */
866 if ((int) left < 0) break;
867 *pos++ = high | 0xE0; /* lead byte 1110xxxx */
873 /* insert utf-string into symbol-table */
/* NOTE(review): buflength is a u4 but utf_new's length parameter is a u2, so
   the value is narrowed at this call - confirm callers never exceed that range */
874 result = utf_new(buffer,buflength);
/* the temporary buffer can be released: utf_new stores its own copy of the bytes */
876 MFREE(buffer, char, buflength);
882 /* utf_new_char ****************************************************************
884 Creates a new utf symbol, the text for this symbol is passed as a
885 c-string ( = char* ).
887 *******************************************************************************/
/* Interns a zero-terminated C string: the length is measured with strlen()
   and the bytes are handed to utf_new unchanged.
   NOTE(review): utf_new takes a u2 length, so the strlen() result is narrowed
   at the call - confirm that no caller passes strings beyond that range. */
889 utf *utf_new_char(const char *text)
891 return utf_new(text, strlen(text));
895 /* utf_new_char_classname ******************************************************
897 Creates a new utf symbol; the text for this symbol is passed as a
898 c-string ( = char* ). Any "." characters are replaced by
899 "/". Since the above function is used often, this is a separate
900 function, instead of an if.
902 *******************************************************************************/
/* Interns a zero-terminated C string after replacing every '.' with '/'.
   The caller's string is never modified: the replacement is done on a
   strdup() copy, and strings without any '.' are interned directly. */
904 utf *utf_new_char_classname(const char *text)
906 if (strchr(text, '.')) { /* only copy when there is something to replace */
/* NOTE(review): the strdup() result is used on the next line; no NULL check
   is visible here - confirm how allocation failure is handled */
907 char *txt = strdup(text);
908 char *end = txt + strlen(txt);
912 for (c = txt; c < end; c++)
913 if (*c == '.') *c = '/';
915 tmpRes = utf_new(txt, strlen(txt));
/* no '.' present: intern the original string as is */
921 return utf_new(text, strlen(text));
925 /* utf_nextu2 ******************************************************************
927 Read the next unicode character from the utf string and increment
928 the utf-string pointer accordingly.
930 CAUTION: This function is unsafe for input that was not checked
933 *******************************************************************************/
/* Decodes one character at *utf_ptr and advances *utf_ptr by the number of
   bytes consumed (`len`). Reads up to three bytes (utf[0..2]) with no bounds
   information; see the CAUTION in the header comment above. */
935 u2 utf_nextu2(char **utf_ptr)
937 /* uncompressed unicode character */
939 /* current position in utf text */
940 unsigned char *utf = (unsigned char *) (*utf_ptr);
941 /* bytes representing the unicode character */
942 unsigned char ch1, ch2, ch3;
943 /* number of bytes used to represent the unicode character */
/* dispatch on the upper four bits of the lead byte */
946 switch ((ch1 = utf[0]) >> 4) {
947 default: /* 1 byte */
951 case 0xD: /* 2 bytes */
/* second byte must have the continuation form 10xxxxxx */
952 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
953 unsigned char high = ch1 & 0x1F; /* 5 payload bits of the lead byte */
954 unsigned char low = ch2 & 0x3F; /* 6 payload bits of the continuation byte */
955 unicode_char = (high << 6) + low;
960 case 0xE: /* 2 or 3 bytes */
961 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
962 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
963 unsigned char low = ch3 & 0x3f; /* 6 payload bits of the third byte */
964 unsigned char mid = ch2 & 0x3f; /* 6 payload bits of the second byte */
965 unsigned char high = ch1 & 0x0f; /* 4 payload bits of the lead byte */
966 unicode_char = (((high << 6) + mid) << 6) + low;
974 /* update position in utf-text */
975 *utf_ptr = (char *) (utf + len);
981 /* utf_bytes *******************************************************************
983 Determine number of bytes (aka. octets) in the utf string.
986 u............utf string
989 The number of octets of this utf string.
990 There is _no_ terminating zero included in this count.
992 *******************************************************************************/
1000 /* utf_get_number_of_u2s_for_buffer ********************************************
1002 Determine number of UTF-16 u2s in the given UTF-8 buffer
1004 CAUTION: This function is unsafe for input that was not checked
1007 CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
1008 to an array of u2s (UTF-16) and want to know how many of them you will get.
1009 All other uses of this function are probably wrong.
1012 buffer........points to first char in buffer
1013 blength.......number of _bytes_ in the buffer
1016 the number of u2s needed to hold this string in UTF-16 encoding.
1017 There is _no_ terminating zero included in this count.
1019 NOTE: Unlike utf_get_number_of_u2s, this function never throws an
1022 *******************************************************************************/
/* Walks the `blength` bytes of `buffer` with utf_nextu2 and counts the
   decoded characters. */
1024 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
1026 const char *endpos; /* points behind utf string */
1027 const char *utf_ptr; /* current position in utf text */
1028 u4 len = 0; /* number of unicode characters */
1031 endpos = utf_ptr + blength;
1033 while (utf_ptr < endpos) {
1035 /* next unicode character */
/* the cast drops const only to match utf_nextu2's char ** parameter */
1036 utf_nextu2((char **)&utf_ptr);
/* decoding must stop exactly at the end of the buffer; this is only checked
   by assert, so it is not enforced when assertions are compiled out */
1039 assert(utf_ptr == endpos);
1045 /* utf_get_number_of_u2s *******************************************************
1047 Determine number of UTF-16 u2s in the utf string.
1049 CAUTION: This function is unsafe for input that was not checked
1052 CAUTION: Use this function *only* when you want to convert a utf string
1053 to an array of u2s and want to know how many of them you will get.
1054 All other uses of this function are probably wrong.
1057 u............utf string
1060 the number of u2s needed to hold this string in UTF-16 encoding.
1061 There is _no_ terminating zero included in this count.
1062 XXX 0 if a NullPointerException has been thrown (see below)
1064 *******************************************************************************/
/* Walks the text of utf string `u` with utf_nextu2 up to UTF_END(u) and
   counts the decoded characters. Error paths raise VM exceptions through the
   exceptions_throw_* helpers instead of asserting. */
1066 u4 utf_get_number_of_u2s(utf *u)
1068 char *endpos; /* points behind utf string */
1069 char *utf_ptr; /* current position in utf text */
1070 u4 len = 0; /* number of unicode characters */
1072 /* XXX this is probably not checked by most callers! Review this after */
1073 /* the invalid uses of this function have been eliminated */
1075 exceptions_throw_nullpointerexception();
1079 endpos = UTF_END(u);
1082 while (utf_ptr < endpos) {
1084 /* next unicode character */
1085 utf_nextu2(&utf_ptr);
/* decoding ran past the end: the last character was truncated */
1088 if (utf_ptr != endpos) {
1089 /* string ended abruptly */
1090 exceptions_throw_internalerror("Illegal utf8 string");
1098 /* utf8_safe_number_of_u2s *****************************************************
1100 Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1101 (For invalid UTF-8 the U+fffd replacement character will be counted.)
1103 This function is safe even for invalid UTF-8 strings.
1106 text..........zero-terminated(!) UTF-8 string (may be invalid)
1108 nbytes........strlen(text). (This is needed to completely emulate
1112 the number of u2s needed to hold this string in UTF-16 encoding.
1113 There is _no_ terminating zero included in this count.
1115 *******************************************************************************/
/* Counts the u2s needed to decode the `nbytes` bytes at `text`, classifying
   each lead byte by its high bits. Incomplete 3- to 6-byte sequences at the
   end of the input return early with one extra u2 counted. */
1117 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1118 register const unsigned char *t; /* current read position */
1121 register const unsigned char *tlimit; /* one past the last input byte */
1129 assert(nbytes >= 0);
1132 t = (const unsigned char *) text;
1133 tlimit = t + nbytes;
1135 /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1141 /* highest bit set, non-ASCII character */
1143 if ((byte & 0xe0) == 0xc0) {
1144 /* 2-byte: should be 110..... 10...... ? */
/* NOTE(review): no tlimit check is visible before this read; presumably safe
   because text is documented as zero-terminated and a zero byte fails the
   10xxxxxx test - confirm */
1146 if ((*t++ & 0xc0) == 0x80)
1147 ; /* valid 2-byte */
1151 else if ((byte & 0xf0) == 0xe0) {
1152 /* 3-byte: should be 1110.... 10...... 10...... */
1156 return len + 1; /* invalid, stop here */
1158 if ((*t++ & 0xc0) == 0x80) {
1159 if ((*t++ & 0xc0) == 0x80)
1160 ; /* valid 3-byte */
1167 else if ((byte & 0xf8) == 0xf0) {
1168 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1172 return len + 1; /* invalid, stop here */
1174 if (((byte1 = *t++) & 0xc0) == 0x80) {
1175 if (((byte2 = *t++) & 0xc0) == 0x80) {
1176 if (((byte3 = *t++) & 0xc0) == 0x80) {
1177 /* valid 4-byte UTF-8? */
/* assemble the 21-bit value from 3 + 6 + 6 + 6 payload bits */
1178 value = ((byte & 0x07) << 18)
1179 | ((byte1 & 0x3f) << 12)
1180 | ((byte2 & 0x3f) << 6)
1181 | ((byte3 & 0x3f) );
/* values above 0x10FFFF are treated separately from valid ones */
1183 if (value > 0x10FFFF)
1185 else if (value > 0xFFFF)
1186 len += 1; /* we need surrogates: one additional u2 for this character */
1188 ; /* 16bit suffice */
1199 else if ((byte & 0xfc) == 0xf8) {
1200 /* invalid 5-byte */
1202 return len + 1; /* invalid, stop here */
/* step over at most `skip` following continuation bytes (10xxxxxx) */
1205 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1208 else if ((byte & 0xfe) == 0xfc) {
1209 /* invalid 6-byte */
1211 return len + 1; /* invalid, stop here */
/* step over at most `skip` following continuation bytes (10xxxxxx) */
1214 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1226 /* ASCII character, common case */
1236 /* utf8_safe_convert_to_u2s ****************************************************
1238 Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1239 (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1240 Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1242 This function is safe even for invalid UTF-8 strings.
   Output rules that can be read off the body below:
     - a well-formed 2- or 3-byte sequence produces one u2;
     - a well-formed 4-byte sequence produces one u2 if the decoded value
       fits in 16 bits, a surrogate pair (two u2s) if it is above 0xFFFF,
       and one replacement character if it is above 0x10FFFF;
     - 5- and 6-byte lead bytes are never decoded; they produce one
       replacement character;
     - a multi-byte sequence that would run past text + nbytes produces
       one replacement character.
   The function returns nothing and does not terminate the output; the
   caller is responsible for the size of the buffer.
1245 text..........zero-terminated(!) UTF-8 string (may be invalid)
1247 nbytes........strlen(text). (This is needed to completely emulate
1249 buffer........a preallocated array of u2s to receive the decoded
1250 string. Use utf8_safe_number_of_u2s to get the
1251 required number of u2s for allocating this.
1253 *******************************************************************************/
/* value written to the output for every undecodable sequence */
1255 #define UNICODE_REPLACEMENT 0xfffd
1257 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
/* t is the read cursor; bytes are read as unsigned so the bit tests work */
1258 register const unsigned char *t;
/* tlimit is one past the last input byte (text + nbytes) */
1260 register const unsigned char *tlimit;
1268 assert(nbytes >= 0);
1270 t = (const unsigned char *) text;
1271 tlimit = t + nbytes;
1273 /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1279 /* highest bit set, non-ASCII character */
1281 if ((byte & 0xe0) == 0xc0) {
1282 /* 2-byte: should be 110..... 10...... */
1284 if (((byte1 = *t++) & 0xc0) == 0x80) {
1285 /* valid 2-byte UTF-8: 5 payload bits from the lead, 6 from byte1 */
1286 *buffer++ = ((byte & 0x1f) << 6)
1287 | ((byte1 & 0x3f) );
1290 *buffer++ = UNICODE_REPLACEMENT;
1294 else if ((byte & 0xf0) == 0xe0) {
1295 /* 3-byte: should be 1110.... 10...... 10...... */
/* fewer than two bytes remain before tlimit: the sequence is truncated */
1297 if (t + 2 > tlimit) {
1298 *buffer++ = UNICODE_REPLACEMENT;
1302 if (((byte1 = *t++) & 0xc0) == 0x80) {
1303 if (((byte2 = *t++) & 0xc0) == 0x80) {
1304 /* valid 3-byte UTF-8: 4 + 6 + 6 payload bits */
1305 *buffer++ = ((byte & 0x0f) << 12)
1306 | ((byte1 & 0x3f) << 6)
1307 | ((byte2 & 0x3f) );
1310 *buffer++ = UNICODE_REPLACEMENT;
1315 *buffer++ = UNICODE_REPLACEMENT;
1319 else if ((byte & 0xf8) == 0xf0) {
1320 /* 4-byte: should be 11110... 10...... 10...... 10...... */
/* fewer than three bytes remain before tlimit: the sequence is truncated */
1322 if (t + 3 > tlimit) {
1323 *buffer++ = UNICODE_REPLACEMENT;
1327 if (((byte1 = *t++) & 0xc0) == 0x80) {
1328 if (((byte2 = *t++) & 0xc0) == 0x80) {
1329 if (((byte3 = *t++) & 0xc0) == 0x80) {
1330 /* valid 4-byte UTF-8? (the range of the value is checked below) */
1331 value = ((byte & 0x07) << 18)
1332 | ((byte1 & 0x3f) << 12)
1333 | ((byte2 & 0x3f) << 6)
1334 | ((byte3 & 0x3f) );
/* values above 0x10FFFF are replaced rather than encoded */
1336 if (value > 0x10FFFF) {
1337 *buffer++ = UNICODE_REPLACEMENT;
1339 else if (value > 0xFFFF) {
1340 /* we need surrogates */
/* high surrogate: (value >> 10) - 0x40 equals (value - 0x10000) >> 10 */
1341 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
/* low surrogate: the low 10 bits of the value */
1342 *buffer++ = 0xdc00 | (value & 0x03ff);
1345 *buffer++ = value; /* 16bit suffice */
1348 *buffer++ = UNICODE_REPLACEMENT;
1353 *buffer++ = UNICODE_REPLACEMENT;
1358 *buffer++ = UNICODE_REPLACEMENT;
/* lead byte 111110xx: 5-byte form, never decoded */
1362 else if ((byte & 0xfc) == 0xf8) {
1363 if (t + 4 > tlimit) {
1364 *buffer++ = UNICODE_REPLACEMENT;
/* the loop continues only while skip is non-zero and *t is a
   continuation byte (10xxxxxx); its body is not visible here */
1369 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1371 *buffer++ = UNICODE_REPLACEMENT;
/* lead byte 1111110x: 6-byte form, never decoded */
1373 else if ((byte & 0xfe) == 0xfc) {
1374 if (t + 5 > tlimit) {
1375 *buffer++ = UNICODE_REPLACEMENT;
/* same continuation-byte walk as in the 5-byte case */
1380 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1382 *buffer++ = UNICODE_REPLACEMENT;
/* NOTE(review): presumably the fall-through for any other byte with the
   high bit set (the enclosing else is not visible) -- confirm */
1385 *buffer++ = UNICODE_REPLACEMENT;
1393 /* ASCII character, common case */
1401 /* u2_utflength ****************************************************************
1403 Returns the utf length in bytes of a u2 array.
   text.........the u2 array to measure
   u2_length....number of u2 elements in text (the loop bound below)
   The result is accumulated in result_len, starting from 0.
1405 *******************************************************************************/
1407 u4 u2_utflength(u2 *text, u4 u2_length)
1409 u4 result_len = 0; /* utf length in bytes */
1410 u2 ch; /* current unicode character */
1413 for (len = 0; len < u2_length; len++) {
1414 /* next unicode character */
1417 /* determine bytes required to store unicode character as utf */
/* the "ch &&" test keeps the value 0 out of this first branch, so a
   zero falls through to the "< 0x800" branch below.
   NOTE(review): presumably the first branch is the 1-byte case and
   zero is counted as a 2-byte form; the increments are not visible */
1418 if (ch && (ch < 0x80))
1420 else if (ch < 0x800)
1430 /* utf_copy ********************************************************************
1432 Copy the given utf string byte-for-byte to a buffer.
   The copy covers u->blength + 1 elements of type char, i.e. the text
   plus its terminating zero, so the buffer must provide at least that
   much room. No size check is done here.
1435 buffer.......the buffer
1436 u............the utf string
1438 *******************************************************************************/
1440 void utf_copy(char *buffer, utf *u)
1442 /* our utf strings are zero-terminated (done by utf_new) */
/* the "+ 1" makes the copy include that terminator */
1443 MCOPY(buffer, u->text, char, u->blength + 1);
1447 /* utf_cat *********************************************************************
1449 Append the given utf string byte-for-byte to a buffer.
   The buffer must already hold a zero-terminated string, because
   strlen(buffer) is used to find the append position. It needs room for
   strlen(buffer) + u->blength + 1 bytes; no size check is done here.
1452 buffer.......the buffer
1453 u............the utf string
1455 *******************************************************************************/
1457 void utf_cat(char *buffer, utf *u)
1459 /* our utf strings are zero-terminated (done by utf_new) */
/* writing starts on the old terminator; the "+ 1" copies the new one */
1460 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1464 /* utf_copy_classname **********************************************************
1466 Copy the given utf classname byte-for-byte to a buffer.
1467 '/' is replaced by '.'
   The source range ends one past UTF_END(u), so the terminating zero is
   part of the range that is walked.
1470 buffer.......the buffer
1471 u............the utf string
1473 *******************************************************************************/
1475 void utf_copy_classname(char *buffer, utf *u)
1484 endptr = UTF_END(u) + 1; /* utfs are zero-terminated by utf_new */
/* walk the source up to and including the terminator; the loop body
   (copy and '/' replacement) is not visible here */
1486 while (srcptr != endptr) {
1495 /* utf_cat_classname ***********************************************************
1497 Append the given utf classname byte-for-byte to a buffer.
1498 '/' is replaced by '.'
   The buffer must already hold a zero-terminated string, because
   strlen(buffer) is used to find the append position.
1501 buffer.......the buffer
1502 u............the utf string
1504 *******************************************************************************/
1506 void utf_cat_classname(char *buffer, utf *u)
/* delegate to utf_copy_classname, writing at the current end of buffer */
1508 utf_copy_classname(buffer + strlen(buffer), u);
1511 /* utf_display_printable_ascii *************************************************
1513 Write utf symbol to stdout (for debugging purposes).
1514 Non-printable and non-ASCII characters are printed as '?'.
   u............the utf string to display; it is decoded one u2 at a time
                with utf_nextu2
1516 *******************************************************************************/
1518 void utf_display_printable_ascii(utf *u)
1520 char *endpos; /* points behind utf string */
1521 char *utf_ptr; /* current position in utf text */
1529 endpos = UTF_END(u);
1532 while (utf_ptr < endpos) {
1533 /* read next unicode character */
1535 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the accepted range includes 127 (DEL), which is not a
   printable character -- confirm whether "c < 127" was intended */
1537 if ((c >= 32) && (c <= 127))
1547 /* utf_display_printable_ascii_classname ***************************************
1549 Write utf symbol to stdout with `/' converted to `.' (for debugging
   purposes).
1551 Non-printable and non-ASCII characters are printed as '?'.
   u............the utf string to display; it is decoded one u2 at a time
                with utf_nextu2
1553 *******************************************************************************/
1555 void utf_display_printable_ascii_classname(utf *u)
1557 char *endpos; /* points behind utf string */
1558 char *utf_ptr; /* current position in utf text */
1566 endpos = UTF_END(u);
1569 while (utf_ptr < endpos) {
1570 /* read next unicode character */
1572 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the accepted range includes 127 (DEL), which is not a
   printable character -- confirm whether "c < 127" was intended */
1577 if ((c >= 32) && (c <= 127))
1587 /* utf_sprint_convert_to_latin1 ************************************************
1589 Write utf symbol into c-string (for debugging purposes).
1590 Characters are converted to 8-bit Latin-1; non-Latin-1 characters yield
   whatever the u2 to char narrowing in the store below produces (no
   replacement character is substituted).
   The literal text "NULL" is written in one special case; its guarding
   condition is not visible here (NOTE(review): presumably u == NULL).
   buffer.......destination c-string; no size check is done here, so the
                caller has to provide enough room
   u............the utf string
1593 *******************************************************************************/
1595 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1597 char *endpos; /* points behind utf string */
1598 char *utf_ptr; /* current position in utf text */
/* NOTE(review): the write index has type u2; if u2 is 16 bits wide it
   wraps for outputs longer than 65535 characters -- confirm */
1599 u2 pos = 0; /* position in c-string */
1602 strcpy(buffer, "NULL");
1606 endpos = UTF_END(u);
1609 while (utf_ptr < endpos)
1610 /* copy next unicode character */
1611 buffer[pos++] = utf_nextu2(&utf_ptr);
1613 /* terminate string */
1618 /* utf_sprint_convert_to_latin1_classname **************************************
1620 Write utf symbol into c-string with `/' converted to `.' (for debugging
   purposes).
1622 Characters are converted to 8-bit Latin-1; the handling of non-Latin-1
   characters depends on the store into the buffer, which is not visible
   here.
   The literal text "NULL" is written in one special case; its guarding
   condition is not visible here (NOTE(review): presumably u == NULL).
   buffer.......destination c-string; no size check is visible here
   u............the utf string
1625 *******************************************************************************/
1627 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1629 char *endpos; /* points behind utf string */
1630 char *utf_ptr; /* current position in utf text */
/* NOTE(review): the write index has type u2; if u2 is 16 bits wide it
   wraps for outputs longer than 65535 characters -- confirm */
1631 u2 pos = 0; /* position in c-string */
1634 strcpy(buffer, "NULL");
1638 endpos = UTF_END(u);
1641 while (utf_ptr < endpos) {
1642 /* copy next unicode character */
1643 u2 c = utf_nextu2(&utf_ptr);
/* package separator becomes a dot */
1644 if (c == '/') c = '.';
1648 /* terminate string */
1653 /* utf_strcat_convert_to_latin1 ************************************************
1655 Like libc strcat, but uses an utf8 string.
1656 Characters are converted to 8-bit Latin-1 by
   utf_sprint_convert_to_latin1, which does the actual work.
   buffer.......must already hold a zero-terminated string, because
                strlen(buffer) is used to find the append position
   u............the utf string to append
1659 *******************************************************************************/
1661 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
/* write the converted string starting at the current end of buffer */
1663 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1667 /* utf_strcat_convert_to_latin1_classname **************************************
1669 Like libc strcat, but uses an utf8 string.
1670 Characters are converted to 8-bit Latin-1 by
   utf_sprint_convert_to_latin1_classname, which does the actual work
   (including its `/' to `.' conversion).
   buffer.......must already hold a zero-terminated string, because
                strlen(buffer) is used to find the append position
   u............the utf string to append
1673 *******************************************************************************/
1675 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
/* write the converted string starting at the current end of buffer */
1677 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1681 /* utf_fprint_printable_ascii **************************************************
1683 Write utf symbol into file.
1684 Non-printable and non-ASCII characters are printed as '?'.
   file.........the stream that receives the output (via fprintf)
   u............the utf string; it is decoded one u2 at a time with
                utf_nextu2
1686 *******************************************************************************/
1688 void utf_fprint_printable_ascii(FILE *file, utf *u)
1690 char *endpos; /* points behind utf string */
1691 char *utf_ptr; /* current position in utf text */
1696 endpos = UTF_END(u);
1699 while (utf_ptr < endpos) {
1700 /* read next unicode character */
1701 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the accepted range includes 127 (DEL), which is not a
   printable character -- confirm whether "c < 127" was intended */
1703 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1704 else fprintf(file, "?");
1709 /* utf_fprint_printable_ascii_classname ****************************************
1711 Write utf symbol into file with `/' converted to `.'.
1712 Non-printable and non-ASCII characters are printed as '?'.
   file.........the stream that receives the output (via fprintf)
   u............the utf string; it is decoded one u2 at a time with
                utf_nextu2
1714 *******************************************************************************/
1716 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1718 char *endpos; /* points behind utf string */
1719 char *utf_ptr; /* current position in utf text */
1724 endpos = UTF_END(u);
1727 while (utf_ptr < endpos) {
1728 /* read next unicode character */
1729 u2 c = utf_nextu2(&utf_ptr);
/* package separator becomes a dot before the printable check */
1730 if (c == '/') c = '.';
/* NOTE(review): the accepted range includes 127 (DEL), which is not a
   printable character -- confirm whether "c < 127" was intended */
1732 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1733 else fprintf(file, "?");
1738 /* is_valid_utf ****************************************************************
1740 Return true if the given string is a valid UTF-8 string.
   "Valid" here follows the checks visible in the body, which are looser
   and stricter than standard UTF-8 in places:
     - a 0x00 byte is rejected;
     - lead bytes announcing more than two continuation bytes (the 4-, 5-
       and 6-byte forms) are rejected;
     - every continuation byte must have the form 10xxxxxx;
     - overlong encodings, surrogates and 0xfffe/0xffff are NOT rejected
       (those checks are commented out below).
   An end_pos that lies before utf_ptr yields false.
1742 utf_ptr...points to first character
1743 end_pos...points after last character
1745 *******************************************************************************/
1747 /* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */
1749 bool is_valid_utf(char *utf_ptr, char *end_pos)
1756 if (end_pos < utf_ptr) return false;
/* number of bytes still available in the input */
1757 bytes = end_pos - utf_ptr;
1761 if (!c) return false; /* 0x00 is not allowed */
1762 if ((c & 0x80) == 0) continue; /* ASCII */
/* len is the number of continuation bytes announced by the lead byte */
1764 if ((c & 0xe0) == 0xc0) len = 1; /* 110x xxxx */
1765 else if ((c & 0xf0) == 0xe0) len = 2; /* 1110 xxxx */
1766 else if ((c & 0xf8) == 0xf0) len = 3; /* 1111 0xxx */
1767 else if ((c & 0xfc) == 0xf8) len = 4; /* 1111 10xx */
1768 else if ((c & 0xfe) == 0xfc) len = 5; /* 1111 110x */
1769 else return false; /* invalid leading byte */
1771 if (len > 2) return false; /* Java limitation: at most 3 bytes per character */
/* keep only the payload bits of the lead byte; the mask gets one bit
   narrower for each announced continuation byte */
1773 v = (unsigned long)c & (0x3f >> len);
1775 if ((bytes -= len) < 0) return false; /* missing bytes */
/* consume the len continuation bytes, appending 6 payload bits each */
1777 for (i = len; i--; ) {
1779 if ((c & 0xc0) != 0x80) /* 10xx xxxx */
1781 v = (v << 6) | (c & 0x3f);
/* NOTE(review): the condition guarding the next line is not visible
   here; presumably it applies when the decoded value is zero, which
   would then be accepted only in its 2-byte form -- confirm */
1785 if (len != 1) return false; /* Java special */
1788 /* Sun Java seems to allow overlong UTF-8 encodings */
1790 /* if (v < min_codepoint[len]) */
1791 /* XXX throw exception? */
1794 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
1795 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
1797 /* even these seem to be allowed */
1798 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
1805 /* is_valid_name ***************************************************************
1807 Return true if the given string may be used as a class/field/method
1808 name. (Currently this only disallows empty strings and control
   characters.)
   Bytes below 0x20 are rejected, and the byte pair 0xc0 0x80 is tested
   for separately (marked "disallow zero" below).
1811 NOTE: The string is assumed to have passed is_valid_utf!
1813 utf_ptr...points to first character
1814 end_pos...points after last character
1816 *******************************************************************************/
1818 bool is_valid_name(char *utf_ptr, char *end_pos)
1820 if (end_pos <= utf_ptr) return false; /* disallow empty names */
1822 while (utf_ptr < end_pos) {
/* read as unsigned so bytes >= 0x80 do not compare as negative */
1823 unsigned char c = *utf_ptr++;
1825 if (c < 0x20) return false; /* disallow control characters */
/* utf_ptr already points at the byte after c here, so this looks at the
   pair (c, next byte) */
1826 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80) /* disallow zero */
/* Convenience wrapper: apply is_valid_name to the text of a utf object,
   from u->text up to UTF_END(u), and return its result. */
1833 bool is_valid_name_utf(utf *u)
1835 return is_valid_name(u->text, UTF_END(u));
1839 /* utf_show ********************************************************************
1841 Writes the utf symbols in the utfhash to stdout and displays the
1842 number of external hash chains grouped according to the chainlength
1843 (for debugging purposes).
   The statistics are taken from the global hashtable_utf (its size,
   entries and ptr fields). Only compiled when NDEBUG is not defined.
1845 *******************************************************************************/
1847 #if !defined(NDEBUG)
1851 #define CHAIN_LIMIT 20 /* limit for separated enumeration */
1853 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1854 u4 max_chainlength = 0; /* maximum length of the chains */
1855 u4 sum_chainlength = 0; /* sum of the chainlengths */
/* NOTE(review): this comment says ">=CHAIN_LIMIT-1", but the code below
   adds to beyond_limit only when chain_length>=CHAIN_LIMIT. Chains of
   length exactly CHAIN_LIMIT-1 are counted in the last chain_count slot
   but not in beyond_limit, so the percentage on the ">=" row misses
   them -- confirm which bound is intended */
1856 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1859 printf("UTF-HASH:\n");
1861 /* show element of utf-hashtable */
1863 for (i = 0; i < hashtable_utf->size; i++) {
1864 utf *u = hashtable_utf->ptr[i];
1867 printf("SLOT %d: ", (int) i);
1871 utf_display_printable_ascii(u);
1879 printf("UTF-HASH: %d slots for %d entries\n",
1880 (int) hashtable_utf->size, (int) hashtable_utf->entries );
/* the percentages below divide by entries; the statement guarded by
   this test is not visible here */
1882 if (hashtable_utf->entries == 0)
1885 printf("chains:\n chainlength number of chains %% of utfstrings\n");
/* NOTE(review): presumably resets chain_count; the loop body is not
   visible here */
1887 for (i=0;i<CHAIN_LIMIT;i++)
1890 /* count numbers of hashchains according to their length */
1891 for (i=0; i<hashtable_utf->size; i++) {
1893 utf *u = (utf*) hashtable_utf->ptr[i];
1894 u4 chain_length = 0;
1896 /* determine chainlength */
1902 /* update sum of all chainlengths */
1903 sum_chainlength+=chain_length;
1905 /* determine the maximum length of the chains */
1906 if (chain_length>max_chainlength)
1907 max_chainlength = chain_length;
1909 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT */
1910 if (chain_length>=CHAIN_LIMIT) {
1911 beyond_limit+=chain_length;
/* clamp so that all long chains share the last chain_count slot */
1912 chain_length=CHAIN_LIMIT-1;
1915 /* update number of hashchains of current length */
1916 chain_count[chain_length]++;
1919 /* display results */
/* one row per exact length 1 .. CHAIN_LIMIT-2; each row's percentage is
   chain_count[i]*i strings relative to all entries.
   NOTE(review): chain_count and max_chainlength are u4 but are printed
   with %d -- confirm the format matches the width/signedness of u4 */
1920 for (i=1;i<CHAIN_LIMIT-1;i++)
1921 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
/* the last row combines every chain stored in the final slot */
1923 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1926 printf("max. chainlength:%5d\n",max_chainlength);
1928 /* avg. chainlength = sum of chainlengths / number of chains */
/* the divisor is the slot count minus chain_count[0], i.e. the slots
   whose measured chain length was not zero */
1929 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1931 #endif /* !defined(NDEBUG) */
1935 * These are local overrides for various environment variables in Emacs.
1936 * Please do not remove this and leave it at the end of the file, where
1937 * Emacs will automagically detect them.
1938 * ---------------------------------------------------------------------
1941 * indent-tabs-mode: t
1945 * vim:noexpandtab:sw=4:ts=4: