/* src/vm/utf8.c - utf8 string functions Copyright (C) 1996-2005, 2006, 2007, 2008 CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO This file is part of CACAO. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "config.h" #include #include #include "vm/types.h" #include "mm/memory.hpp" #include "threads/mutex.hpp" #include "toolbox/hashtable.h" #include "vm/exceptions.hpp" #include "vm/options.h" #if defined(ENABLE_STATISTICS) # include "vm/statistics.h" #endif #include "vm/utf8.h" /* global variables ***********************************************************/ /* hashsize must be power of 2 */ #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */ hashtable *hashtable_utf; /* hashtable for utf8-symbols */ /* utf-symbols for pointer comparison of frequently used strings **************/ utf *utf_java_lang_Object; utf *utf_java_lang_Class; utf *utf_java_lang_ClassLoader; utf *utf_java_lang_Cloneable; utf *utf_java_lang_SecurityManager; utf *utf_java_lang_String; utf *utf_java_lang_ThreadGroup; utf *utf_java_lang_ref_SoftReference; utf *utf_java_lang_ref_WeakReference; utf *utf_java_lang_ref_PhantomReference; utf *utf_java_io_Serializable; utf *utf_java_lang_Throwable; utf *utf_java_lang_Error; utf *utf_java_lang_AbstractMethodError; utf *utf_java_lang_ClassCircularityError; utf *utf_java_lang_ClassFormatError; utf *utf_java_lang_ExceptionInInitializerError; utf *utf_java_lang_IncompatibleClassChangeError; utf *utf_java_lang_InstantiationError; utf *utf_java_lang_InternalError; utf *utf_java_lang_LinkageError; utf *utf_java_lang_NoClassDefFoundError; utf *utf_java_lang_NoSuchFieldError; utf *utf_java_lang_NoSuchMethodError; utf *utf_java_lang_OutOfMemoryError; utf *utf_java_lang_UnsatisfiedLinkError; utf *utf_java_lang_UnsupportedClassVersionError; utf *utf_java_lang_VerifyError; utf *utf_java_lang_VirtualMachineError; utf *utf_java_lang_Exception; utf *utf_java_lang_ArithmeticException; utf *utf_java_lang_ArrayIndexOutOfBoundsException; utf *utf_java_lang_ArrayStoreException; utf *utf_java_lang_ClassCastException; utf *utf_java_lang_ClassNotFoundException; utf *utf_java_lang_CloneNotSupportedException; utf *utf_java_lang_IllegalAccessException; utf *utf_java_lang_IllegalArgumentException; utf *utf_java_lang_IllegalMonitorStateException; utf *utf_java_lang_InstantiationException; utf *utf_java_lang_InterruptedException; utf *utf_java_lang_NegativeArraySizeException; utf *utf_java_lang_NullPointerException; utf *utf_java_lang_RuntimeException; utf *utf_java_lang_StringIndexOutOfBoundsException; utf *utf_java_lang_reflect_InvocationTargetException; utf *utf_java_security_PrivilegedActionException; #if defined(ENABLE_JAVASE) utf* utf_java_lang_Void; #endif utf* utf_java_lang_Boolean; utf* utf_java_lang_Byte; utf* utf_java_lang_Character; utf* utf_java_lang_Short; utf* utf_java_lang_Integer; utf* utf_java_lang_Long; utf* utf_java_lang_Float; utf* utf_java_lang_Double; #if defined(ENABLE_JAVASE) utf *utf_java_lang_StackTraceElement; utf *utf_java_lang_reflect_Constructor; utf *utf_java_lang_reflect_Field; utf *utf_java_lang_reflect_Method; # if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH) utf *utf_java_lang_reflect_VMConstructor; utf *utf_java_lang_reflect_VMField; utf *utf_java_lang_reflect_VMMethod; # endif utf *utf_java_util_Vector; #endif utf *utf_InnerClasses; /* InnerClasses */ utf *utf_ConstantValue; /* ConstantValue */ utf *utf_Code; /* Code */ utf *utf_Exceptions; /* Exceptions */ utf *utf_LineNumberTable; /* LineNumberTable */ utf *utf_SourceFile; /* SourceFile */ #if defined(ENABLE_JAVASE) utf *utf_EnclosingMethod; utf *utf_Signature; utf *utf_StackMapTable; # if defined(ENABLE_JVMTI) utf *utf_LocalVariableTable; # endif # if defined(ENABLE_ANNOTATIONS) utf *utf_RuntimeVisibleAnnotations; /* RuntimeVisibleAnnotations */ utf *utf_RuntimeInvisibleAnnotations; /* RuntimeInvisibleAnnotations */ utf *utf_RuntimeVisibleParameterAnnotations; /* RuntimeVisibleParameterAnnotations */ utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */ utf *utf_AnnotationDefault; /* AnnotationDefault */ # endif #endif utf *utf_init; /* */ utf *utf_clinit; /* */ utf *utf_clone; /* clone */ utf *utf_finalize; /* finalize */ utf *utf_invoke; utf *utf_main; utf *utf_run; /* run */ utf *utf_add; utf *utf_dispatch; utf *utf_remove; utf *utf_addThread; utf *utf_removeThread; utf *utf_put; utf *utf_get; utf *utf_uncaughtException; utf *utf_value; utf *utf_fillInStackTrace; utf *utf_findNative; utf *utf_getSystemClassLoader; utf *utf_initCause; utf *utf_loadClass; utf *utf_loadClassInternal; utf *utf_printStackTrace; utf *utf_division_by_zero; utf *utf_Z; /* Z */ utf *utf_B; /* B */ utf *utf_C; /* C */ utf *utf_S; /* S */ utf *utf_I; /* I */ utf *utf_J; /* J */ utf *utf_F; /* F */ utf *utf_D; /* D */ utf *utf_void__void; /* ()V */ utf *utf_boolean__void; /* (Z)V */ utf *utf_byte__void; /* (B)V */ utf *utf_char__void; /* (C)V */ utf *utf_short__void; /* (S)V */ utf *utf_int__void; /* (I)V */ utf *utf_long__void; /* (J)V */ utf *utf_float__void; /* (F)V */ utf *utf_double__void; /* (D)V */ utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */ utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */ utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */ utf *utf_java_lang_ClassLoader_java_lang_String__J; utf *utf_java_lang_Exception__V; /* (Ljava/lang/Exception;)V */ utf *utf_java_lang_Object__java_lang_Object; utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */ utf *utf_java_lang_String__java_lang_Class; utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */ utf *utf_java_lang_Thread_java_lang_Throwable__V; utf *utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V; utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */ utf *utf_java_lang_Throwable__java_lang_Throwable; utf *utf_not_named_yet; /* special name for unnamed classes */ utf *utf_null; utf *array_packagename; /* utf_init ******************************************************************** Initializes the utf8 subsystem. *******************************************************************************/ void utf8_init(void) { TRACESUBSYSTEMINITIALIZATION("utf8_init"); /* create utf8 hashtable */ hashtable_utf = NEW(hashtable); hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE); #if defined(ENABLE_STATISTICS) if (opt_stat) count_utf_len += sizeof(utf*) * hashtable_utf->size; #endif /* create utf-symbols for pointer comparison of frequently used strings */ utf_java_lang_Object = utf_new_char("java/lang/Object"); utf_java_lang_Class = utf_new_char("java/lang/Class"); utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader"); utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable"); utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager"); utf_java_lang_String = utf_new_char("java/lang/String"); utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup"); utf_java_lang_ref_SoftReference = utf_new_char("java/lang/ref/SoftReference"); utf_java_lang_ref_WeakReference = utf_new_char("java/lang/ref/WeakReference"); utf_java_lang_ref_PhantomReference = utf_new_char("java/lang/ref/PhantomReference"); utf_java_io_Serializable = utf_new_char("java/io/Serializable"); utf_java_lang_Throwable = utf_new_char("java/lang/Throwable"); utf_java_lang_Error = utf_new_char("java/lang/Error"); utf_java_lang_ClassCircularityError = utf_new_char("java/lang/ClassCircularityError"); utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError"); utf_java_lang_ExceptionInInitializerError = utf_new_char("java/lang/ExceptionInInitializerError"); utf_java_lang_IncompatibleClassChangeError = utf_new_char("java/lang/IncompatibleClassChangeError"); utf_java_lang_InstantiationError = utf_new_char("java/lang/InstantiationError"); utf_java_lang_InternalError = utf_new_char("java/lang/InternalError"); utf_java_lang_LinkageError = utf_new_char("java/lang/LinkageError"); utf_java_lang_NoClassDefFoundError = utf_new_char("java/lang/NoClassDefFoundError"); utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError"); utf_java_lang_UnsatisfiedLinkError = utf_new_char("java/lang/UnsatisfiedLinkError"); utf_java_lang_UnsupportedClassVersionError = utf_new_char("java/lang/UnsupportedClassVersionError"); utf_java_lang_VerifyError = utf_new_char("java/lang/VerifyError"); utf_java_lang_VirtualMachineError = utf_new_char("java/lang/VirtualMachineError"); #if defined(ENABLE_JAVASE) utf_java_lang_AbstractMethodError = utf_new_char("java/lang/AbstractMethodError"); utf_java_lang_NoSuchFieldError = utf_new_char("java/lang/NoSuchFieldError"); utf_java_lang_NoSuchMethodError = utf_new_char("java/lang/NoSuchMethodError"); #endif utf_java_lang_Exception = utf_new_char("java/lang/Exception"); utf_java_lang_ArithmeticException = utf_new_char("java/lang/ArithmeticException"); utf_java_lang_ArrayIndexOutOfBoundsException = utf_new_char("java/lang/ArrayIndexOutOfBoundsException"); utf_java_lang_ArrayStoreException = utf_new_char("java/lang/ArrayStoreException"); utf_java_lang_ClassCastException = utf_new_char("java/lang/ClassCastException"); utf_java_lang_ClassNotFoundException = utf_new_char("java/lang/ClassNotFoundException"); utf_java_lang_CloneNotSupportedException = utf_new_char("java/lang/CloneNotSupportedException"); utf_java_lang_IllegalAccessException = utf_new_char("java/lang/IllegalAccessException"); utf_java_lang_IllegalArgumentException = utf_new_char("java/lang/IllegalArgumentException"); utf_java_lang_IllegalMonitorStateException = utf_new_char("java/lang/IllegalMonitorStateException"); utf_java_lang_InstantiationException = utf_new_char("java/lang/InstantiationException"); utf_java_lang_InterruptedException = utf_new_char("java/lang/InterruptedException"); utf_java_lang_NegativeArraySizeException = utf_new_char("java/lang/NegativeArraySizeException"); utf_java_lang_NullPointerException = utf_new_char("java/lang/NullPointerException"); utf_java_lang_RuntimeException = utf_new_char("java/lang/RuntimeException"); utf_java_lang_StringIndexOutOfBoundsException = utf_new_char("java/lang/StringIndexOutOfBoundsException"); utf_java_lang_reflect_InvocationTargetException = utf_new_char("java/lang/reflect/InvocationTargetException"); utf_java_security_PrivilegedActionException = utf_new_char("java/security/PrivilegedActionException"); #if defined(ENABLE_JAVASE) utf_java_lang_Void = utf_new_char("java/lang/Void"); #endif utf_java_lang_Boolean = utf_new_char("java/lang/Boolean"); utf_java_lang_Byte = utf_new_char("java/lang/Byte"); utf_java_lang_Character = utf_new_char("java/lang/Character"); utf_java_lang_Short = utf_new_char("java/lang/Short"); utf_java_lang_Integer = utf_new_char("java/lang/Integer"); utf_java_lang_Long = utf_new_char("java/lang/Long"); utf_java_lang_Float = utf_new_char("java/lang/Float"); utf_java_lang_Double = utf_new_char("java/lang/Double"); #if defined(ENABLE_JAVASE) utf_java_lang_StackTraceElement = utf_new_char("java/lang/StackTraceElement"); utf_java_lang_reflect_Constructor = utf_new_char("java/lang/reflect/Constructor"); utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field"); utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method"); # if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH) utf_java_lang_reflect_VMConstructor = utf_new_char("java/lang/reflect/VMConstructor"); utf_java_lang_reflect_VMField = utf_new_char("java/lang/reflect/VMField"); utf_java_lang_reflect_VMMethod = utf_new_char("java/lang/reflect/VMMethod"); # endif utf_java_util_Vector = utf_new_char("java/util/Vector"); #endif utf_InnerClasses = utf_new_char("InnerClasses"); utf_ConstantValue = utf_new_char("ConstantValue"); utf_Code = utf_new_char("Code"); utf_Exceptions = utf_new_char("Exceptions"); utf_LineNumberTable = utf_new_char("LineNumberTable"); utf_SourceFile = utf_new_char("SourceFile"); #if defined(ENABLE_JAVASE) utf_EnclosingMethod = utf_new_char("EnclosingMethod"); utf_Signature = utf_new_char("Signature"); utf_StackMapTable = utf_new_char("StackMapTable"); # if defined(ENABLE_JVMTI) utf_LocalVariableTable = utf_new_char("LocalVariableTable"); # endif # if defined(ENABLE_ANNOTATIONS) utf_RuntimeVisibleAnnotations = utf_new_char("RuntimeVisibleAnnotations"); utf_RuntimeInvisibleAnnotations = utf_new_char("RuntimeInvisibleAnnotations"); utf_RuntimeVisibleParameterAnnotations = utf_new_char("RuntimeVisibleParameterAnnotations"); utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations"); utf_AnnotationDefault = utf_new_char("AnnotationDefault"); # endif #endif utf_init = utf_new_char(""); utf_clinit = utf_new_char(""); utf_clone = utf_new_char("clone"); utf_finalize = utf_new_char("finalize"); utf_invoke = utf_new_char("invoke"); utf_main = utf_new_char("main"); utf_run = utf_new_char("run"); utf_add = utf_new_char("add"); utf_dispatch = utf_new_char("dispatch"); utf_remove = utf_new_char("remove"); utf_addThread = utf_new_char("addThread"); utf_removeThread = utf_new_char("removeThread"); utf_put = utf_new_char("put"); utf_get = utf_new_char("get"); utf_uncaughtException = utf_new_char("uncaughtException"); utf_value = utf_new_char("value"); utf_fillInStackTrace = utf_new_char("fillInStackTrace"); utf_findNative = utf_new_char("findNative"); utf_getSystemClassLoader = utf_new_char("getSystemClassLoader"); utf_initCause = utf_new_char("initCause"); utf_loadClass = utf_new_char("loadClass"); utf_loadClassInternal = utf_new_char("loadClassInternal"); utf_printStackTrace = utf_new_char("printStackTrace"); utf_division_by_zero = utf_new_char("/ by zero"); utf_Z = utf_new_char("Z"); utf_B = utf_new_char("B"); utf_C = utf_new_char("C"); utf_S = utf_new_char("S"); utf_I = utf_new_char("I"); utf_J = utf_new_char("J"); utf_F = utf_new_char("F"); utf_D = utf_new_char("D"); utf_void__void = utf_new_char("()V"); utf_boolean__void = utf_new_char("(Z)V"); utf_byte__void = utf_new_char("(B)V"); utf_char__void = utf_new_char("(C)V"); utf_short__void = utf_new_char("(S)V"); utf_int__void = utf_new_char("(I)V"); utf_long__void = utf_new_char("(J)V"); utf_float__void = utf_new_char("(F)V"); utf_double__void = utf_new_char("(D)V"); utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;"); utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;"); utf_void__java_lang_ClassLoader = utf_new_char("()Ljava/lang/ClassLoader;"); utf_java_lang_ClassLoader_java_lang_String__J = utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J"); utf_java_lang_Exception__V = utf_new_char("(Ljava/lang/Exception;)V"); utf_java_lang_Object__java_lang_Object = utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;"); utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V"); utf_java_lang_String__java_lang_Class = utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;"); utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V"); utf_java_lang_Thread_java_lang_Throwable__V = utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V"); utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V = utf_new_char("(Ljava/lang/ThreadGroup;Ljava/lang/String;)V"); utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V"); utf_java_lang_Throwable__java_lang_Throwable = utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;"); utf_null = utf_new_char("null"); utf_not_named_yet = utf_new_char("\t"); array_packagename = utf_new_char("\t"); } /* utf_hashkey ***************************************************************** The hashkey is computed from the utf-text by using up to 8 characters. For utf-symbols longer than 15 characters 3 characters are taken from the beginning and the end, 2 characters are taken from the middle. *******************************************************************************/ #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */ #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */ u4 utf_hashkey(const char *text, u4 length) { const char *start_pos = text; /* pointer to utf text */ u4 a; switch (length) { case 0: /* empty string */ return 0; case 1: return fbs(0); case 2: return fbs(0) ^ nbs(3); case 3: return fbs(0) ^ nbs(3) ^ nbs(5); case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6); case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6); case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6); case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6); case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7); case 9: a = fbs(0); a ^= nbs(1); a ^= nbs(2); text++; return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8); case 10: a = fbs(0); text++; a ^= nbs(2); a ^= nbs(3); a ^= nbs(4); text++; return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9); case 11: a = fbs(0); text++; a ^= nbs(2); a ^= nbs(3); a ^= nbs(4); text++; return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10); case 12: a = fbs(0); text += 2; a ^= nbs(2); a ^= nbs(3); text++; a ^= nbs(5); a ^= nbs(6); a ^= nbs(7); text++; return a ^ nbs(9) ^ nbs(10); case 13: a = fbs(0); a ^= nbs(1); text++; a ^= nbs(3); a ^= nbs(4); text += 2; a ^= nbs(7); a ^= nbs(8); text += 2; return a ^ nbs(9) ^ nbs(10); case 14: a = fbs(0); text += 2; a ^= nbs(3); a ^= nbs(4); text += 2; a ^= nbs(7); a ^= nbs(8); text += 2; return a ^ nbs(9) ^ nbs(10) ^ nbs(11); case 15: a = fbs(0); text += 2; a ^= nbs(3); a ^= nbs(4); text += 2; a ^= nbs(7); a ^= nbs(8); text += 2; return a ^ nbs(9) ^ nbs(10) ^ nbs(11); default: /* 3 characters from beginning */ a = fbs(0); text += 2; a ^= nbs(3); a ^= nbs(4); /* 2 characters from middle */ text = start_pos + (length / 2); a ^= fbs(5); text += 2; a ^= nbs(6); /* 3 characters from end */ text = start_pos + length - 4; a ^= fbs(7); text++; return a ^ nbs(10) ^ nbs(11); } } /* utf_full_hashkey ************************************************************ This function computes a hash value using all bytes in the string. The algorithm is the "One-at-a-time" algorithm as published by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html. *******************************************************************************/ u4 utf_full_hashkey(const char *text, u4 length) { register const unsigned char *p = (const unsigned char *) text; register u4 hash; register u4 i; hash = 0; for (i=length; i--;) { hash += *p++; hash += (hash << 10); hash ^= (hash >> 6); } hash += (hash << 3); hash ^= (hash >> 11); hash += (hash << 15); return hash; } /* unicode_hashkey ************************************************************* Compute the hashkey of a unicode string. *******************************************************************************/ u4 unicode_hashkey(u2 *text, u2 len) { return utf_hashkey((char *) text, len); } /* utf_new ********************************************************************* Creates a new utf-symbol, the text of the symbol is passed as a u1-array. The function searches the utf-hashtable for a utf-symbol with this text. On success the element returned, otherwise a new hashtable element is created. If the number of entries in the hashtable exceeds twice the size of the hashtable slots a reorganization of the hashtable is done and the utf symbols are copied to a new hashtable with doubled size. *******************************************************************************/ utf *utf_new(const char *text, u2 length) { u4 key; /* hashkey computed from utf-text */ u4 slot; /* slot in hashtable */ utf *u; /* hashtable element */ u2 i; Mutex_lock(hashtable_utf->mutex); #if defined(ENABLE_STATISTICS) if (opt_stat) count_utf_new++; #endif key = utf_hashkey(text, length); slot = key & (hashtable_utf->size - 1); u = hashtable_utf->ptr[slot]; /* search external hash chain for utf-symbol */ while (u) { if (u->blength == length) { /* compare text of hashtable elements */ for (i = 0; i < length; i++) if (text[i] != u->text[i]) goto nomatch; #if defined(ENABLE_STATISTICS) if (opt_stat) count_utf_new_found++; #endif /* symbol found in hashtable */ Mutex_unlock(hashtable_utf->mutex); return u; } nomatch: u = u->hashlink; /* next element in external chain */ } /* location in hashtable found, create new utf element */ u = NEW(utf); u->blength = length; /* length in bytes of utfstring */ u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */ u->text = mem_alloc(length + 1);/* allocate memory for utf-text */ memcpy(u->text, text, length); /* copy utf-text */ u->text[length] = '\0'; #if defined(ENABLE_STATISTICS) if (opt_stat) count_utf_len += sizeof(utf) + length + 1; #endif hashtable_utf->ptr[slot] = u; /* insert symbol into table */ hashtable_utf->entries++; /* update number of entries */ if (hashtable_utf->entries > (hashtable_utf->size * 2)) { /* reorganization of hashtable, average length of the external chains is approx. 2 */ hashtable *newhash; /* the new hashtable */ u4 i; utf *u; utf *nextu; u4 slot; /* create new hashtable, double the size */ newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2); #if defined(ENABLE_STATISTICS) if (opt_stat) count_utf_len += sizeof(utf*) * hashtable_utf->size; #endif /* transfer elements to new hashtable */ for (i = 0; i < hashtable_utf->size; i++) { u = hashtable_utf->ptr[i]; while (u) { nextu = u->hashlink; slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1); u->hashlink = (utf *) newhash->ptr[slot]; newhash->ptr[slot] = u; /* follow link in external hash chain */ u = nextu; } } /* dispose old table */ hashtable_free(hashtable_utf); hashtable_utf = newhash; } Mutex_unlock(hashtable_utf->mutex); return u; } /* utf_new_u2 ****************************************************************** Make utf symbol from u2 array, if isclassname is true '.' is replaced by '/'. *******************************************************************************/ utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname) { char *buffer; /* memory buffer for unicode characters */ char *pos; /* pointer to current position in buffer */ u4 left; /* unicode characters left */ u4 buflength; /* utf length in bytes of the u2 array */ utf *result; /* resulting utf-string */ int i; /* determine utf length in bytes and allocate memory */ buflength = u2_utflength(unicode_pos, unicode_length); buffer = MNEW(char, buflength); left = buflength; pos = buffer; for (i = 0; i++ < unicode_length; unicode_pos++) { /* next unicode character */ u2 c = *unicode_pos; if ((c != 0) && (c < 0x80)) { /* 1 character */ left--; if ((int) left < 0) break; /* convert classname */ if (isclassname && c == '.') *pos++ = '/'; else *pos++ = (char) c; } else if (c < 0x800) { /* 2 characters */ unsigned char high = c >> 6; unsigned char low = c & 0x3F; left = left - 2; if ((int) left < 0) break; *pos++ = high | 0xC0; *pos++ = low | 0x80; } else { /* 3 characters */ char low = c & 0x3f; char mid = (c >> 6) & 0x3F; char high = c >> 12; left = left - 3; if ((int) left < 0) break; *pos++ = high | 0xE0; *pos++ = mid | 0x80; *pos++ = low | 0x80; } } /* insert utf-string into symbol-table */ result = utf_new(buffer,buflength); MFREE(buffer, char, buflength); return result; } /* utf_new_char **************************************************************** Creates a new utf symbol, the text for this symbol is passed as a c-string ( = char* ). *******************************************************************************/ utf *utf_new_char(const char *text) { return utf_new(text, strlen(text)); } /* utf_new_char_classname ****************************************************** Creates a new utf symbol, the text for this symbol is passed as a c-string ( = char* ) "." characters are going to be replaced by "/". Since the above function is used often, this is a separte function, instead of an if. *******************************************************************************/ utf *utf_new_char_classname(const char *text) { if (strchr(text, '.')) { char *txt = strdup(text); char *end = txt + strlen(txt); char *c; utf *tmpRes; for (c = txt; c < end; c++) if (*c == '.') *c = '/'; tmpRes = utf_new(txt, strlen(txt)); FREE(txt, 0); return tmpRes; } else return utf_new(text, strlen(text)); } /* utf_nextu2 ****************************************************************** Read the next unicode character from the utf string and increment the utf-string pointer accordingly. CAUTION: This function is unsafe for input that was not checked by is_valid_utf! *******************************************************************************/ u2 utf_nextu2(char **utf_ptr) { /* uncompressed unicode character */ u2 unicode_char = 0; /* current position in utf text */ unsigned char *utf = (unsigned char *) (*utf_ptr); /* bytes representing the unicode character */ unsigned char ch1, ch2, ch3; /* number of bytes used to represent the unicode character */ int len = 0; switch ((ch1 = utf[0]) >> 4) { default: /* 1 byte */ (*utf_ptr)++; return (u2) ch1; case 0xC: case 0xD: /* 2 bytes */ if (((ch2 = utf[1]) & 0xC0) == 0x80) { unsigned char high = ch1 & 0x1F; unsigned char low = ch2 & 0x3F; unicode_char = (high << 6) + low; len = 2; } break; case 0xE: /* 2 or 3 bytes */ if (((ch2 = utf[1]) & 0xC0) == 0x80) { if (((ch3 = utf[2]) & 0xC0) == 0x80) { unsigned char low = ch3 & 0x3f; unsigned char mid = ch2 & 0x3f; unsigned char high = ch1 & 0x0f; unicode_char = (((high << 6) + mid) << 6) + low; len = 3; } else len = 2; } break; } /* update position in utf-text */ *utf_ptr = (char *) (utf + len); return unicode_char; } /* utf_bytes ******************************************************************* Determine number of bytes (aka. octets) in the utf string. IN: u............utf string OUT: The number of octets of this utf string. There is _no_ terminating zero included in this count. *******************************************************************************/ u4 utf_bytes(utf *u) { return u->blength; } /* utf_get_number_of_u2s_for_buffer ******************************************** Determine number of UTF-16 u2s in the given UTF-8 buffer CAUTION: This function is unsafe for input that was not checked by is_valid_utf! CAUTION: Use this function *only* when you want to convert an UTF-8 buffer to an array of u2s (UTF-16) and want to know how many of them you will get. All other uses of this function are probably wrong. IN: buffer........points to first char in buffer blength.......number of _bytes_ in the buffer OUT: the number of u2s needed to hold this string in UTF-16 encoding. There is _no_ terminating zero included in this count. NOTE: Unlike utf_get_number_of_u2s, this function never throws an exception. *******************************************************************************/ u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength) { const char *endpos; /* points behind utf string */ const char *utf_ptr; /* current position in utf text */ u4 len = 0; /* number of unicode characters */ utf_ptr = buffer; endpos = utf_ptr + blength; while (utf_ptr < endpos) { len++; /* next unicode character */ utf_nextu2((char **)&utf_ptr); } assert(utf_ptr == endpos); return len; } /* utf_get_number_of_u2s ******************************************************* Determine number of UTF-16 u2s in the utf string. CAUTION: This function is unsafe for input that was not checked by is_valid_utf! CAUTION: Use this function *only* when you want to convert a utf string to an array of u2s and want to know how many of them you will get. All other uses of this function are probably wrong. IN: u............utf string OUT: the number of u2s needed to hold this string in UTF-16 encoding. There is _no_ terminating zero included in this count. XXX 0 if a NullPointerException has been thrown (see below) *******************************************************************************/ u4 utf_get_number_of_u2s(utf *u) { char *endpos; /* points behind utf string */ char *utf_ptr; /* current position in utf text */ u4 len = 0; /* number of unicode characters */ /* XXX this is probably not checked by most callers! Review this after */ /* the invalid uses of this function have been eliminated */ if (u == NULL) { exceptions_throw_nullpointerexception(); return 0; } endpos = UTF_END(u); utf_ptr = u->text; while (utf_ptr < endpos) { len++; /* next unicode character */ utf_nextu2(&utf_ptr); } if (utf_ptr != endpos) { /* string ended abruptly */ exceptions_throw_internalerror("Illegal utf8 string"); return 0; } return len; } /* utf8_safe_number_of_u2s ***************************************************** Determine number of UTF-16 u2s needed for decoding the given UTF-8 string. (For invalid UTF-8 the U+fffd replacement character will be counted.) This function is safe even for invalid UTF-8 strings. IN: text..........zero-terminated(!) UTF-8 string (may be invalid) must NOT be NULL nbytes........strlen(text). (This is needed to completely emulate the RI). OUT: the number of u2s needed to hold this string in UTF-16 encoding. There is _no_ terminating zero included in this count. *******************************************************************************/ s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) { register const unsigned char *t; register s4 byte; register s4 len; register const unsigned char *tlimit; s4 byte1; s4 byte2; s4 byte3; s4 value; s4 skip; assert(text); assert(nbytes >= 0); len = 0; t = (const unsigned char *) text; tlimit = t + nbytes; /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */ while (1) { byte = *t++; if (byte & 0x80) { /* highest bit set, non-ASCII character */ if ((byte & 0xe0) == 0xc0) { /* 2-byte: should be 110..... 10...... ? */ if ((*t++ & 0xc0) == 0x80) ; /* valid 2-byte */ else t--; /* invalid */ } else if ((byte & 0xf0) == 0xe0) { /* 3-byte: should be 1110.... 10...... 10...... */ /* ^t */ if (t + 2 > tlimit) return len + 1; /* invalid, stop here */ if ((*t++ & 0xc0) == 0x80) { if ((*t++ & 0xc0) == 0x80) ; /* valid 3-byte */ else t--; /* invalid */ } else t--; /* invalid */ } else if ((byte & 0xf8) == 0xf0) { /* 4-byte: should be 11110... 10...... 10...... 10...... */ /* ^t */ if (t + 3 > tlimit) return len + 1; /* invalid, stop here */ if (((byte1 = *t++) & 0xc0) == 0x80) { if (((byte2 = *t++) & 0xc0) == 0x80) { if (((byte3 = *t++) & 0xc0) == 0x80) { /* valid 4-byte UTF-8? */ value = ((byte & 0x07) << 18) | ((byte1 & 0x3f) << 12) | ((byte2 & 0x3f) << 6) | ((byte3 & 0x3f) ); if (value > 0x10FFFF) ; /* invalid */ else if (value > 0xFFFF) len += 1; /* we need surrogates */ else ; /* 16bit suffice */ } else t--; /* invalid */ } else t--; /* invalid */ } else t--; /* invalid */ } else if ((byte & 0xfc) == 0xf8) { /* invalid 5-byte */ if (t + 4 > tlimit) return len + 1; /* invalid, stop here */ skip = 4; for (; skip && ((*t & 0xc0) == 0x80); --skip) t++; } else if ((byte & 0xfe) == 0xfc) { /* invalid 6-byte */ if (t + 5 > tlimit) return len + 1; /* invalid, stop here */ skip = 5; for (; skip && ((*t & 0xc0) == 0x80); --skip) t++; } else ; /* invalid */ } else { /* NUL */ if (byte == 0) break; /* ASCII character, common case */ } len++; } return len; } /* utf8_safe_convert_to_u2s **************************************************** Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer. (Invalid UTF-8 will be replaced with the U+fffd replacement character.) Use utf8_safe_number_of_u2s to determine the number of u2s to allocate. This function is safe even for invalid UTF-8 strings. IN: text..........zero-terminated(!) UTF-8 string (may be invalid) must NOT be NULL nbytes........strlen(text). (This is needed to completely emulate the RI). buffer........a preallocated array of u2s to receive the decoded string. Use utf8_safe_number_of_u2s to get the required number of u2s for allocating this. *******************************************************************************/ #define UNICODE_REPLACEMENT 0xfffd void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) { register const unsigned char *t; register s4 byte; register const unsigned char *tlimit; s4 byte1; s4 byte2; s4 byte3; s4 value; s4 skip; assert(text); assert(nbytes >= 0); t = (const unsigned char *) text; tlimit = t + nbytes; /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */ while (1) { byte = *t++; if (byte & 0x80) { /* highest bit set, non-ASCII character */ if ((byte & 0xe0) == 0xc0) { /* 2-byte: should be 110..... 10...... */ if (((byte1 = *t++) & 0xc0) == 0x80) { /* valid 2-byte UTF-8 */ *buffer++ = ((byte & 0x1f) << 6) | ((byte1 & 0x3f) ); } else { *buffer++ = UNICODE_REPLACEMENT; t--; } } else if ((byte & 0xf0) == 0xe0) { /* 3-byte: should be 1110.... 10...... 10...... */ if (t + 2 > tlimit) { *buffer++ = UNICODE_REPLACEMENT; return; } if (((byte1 = *t++) & 0xc0) == 0x80) { if (((byte2 = *t++) & 0xc0) == 0x80) { /* valid 3-byte UTF-8 */ *buffer++ = ((byte & 0x0f) << 12) | ((byte1 & 0x3f) << 6) | ((byte2 & 0x3f) ); } else { *buffer++ = UNICODE_REPLACEMENT; t--; } } else { *buffer++ = UNICODE_REPLACEMENT; t--; } } else if ((byte & 0xf8) == 0xf0) { /* 4-byte: should be 11110... 10...... 10...... 10...... */ if (t + 3 > tlimit) { *buffer++ = UNICODE_REPLACEMENT; return; } if (((byte1 = *t++) & 0xc0) == 0x80) { if (((byte2 = *t++) & 0xc0) == 0x80) { if (((byte3 = *t++) & 0xc0) == 0x80) { /* valid 4-byte UTF-8? */ value = ((byte & 0x07) << 18) | ((byte1 & 0x3f) << 12) | ((byte2 & 0x3f) << 6) | ((byte3 & 0x3f) ); if (value > 0x10FFFF) { *buffer++ = UNICODE_REPLACEMENT; } else if (value > 0xFFFF) { /* we need surrogates */ *buffer++ = 0xd800 | ((value >> 10) - 0x40); *buffer++ = 0xdc00 | (value & 0x03ff); } else *buffer++ = value; /* 16bit suffice */ } else { *buffer++ = UNICODE_REPLACEMENT; t--; } } else { *buffer++ = UNICODE_REPLACEMENT; t--; } } else { *buffer++ = UNICODE_REPLACEMENT; t--; } } else if ((byte & 0xfc) == 0xf8) { if (t + 4 > tlimit) { *buffer++ = UNICODE_REPLACEMENT; return; } skip = 4; for (; skip && ((*t & 0xc0) == 0x80); --skip) t++; *buffer++ = UNICODE_REPLACEMENT; } else if ((byte & 0xfe) == 0xfc) { if (t + 5 > tlimit) { *buffer++ = UNICODE_REPLACEMENT; return; } skip = 5; for (; skip && ((*t & 0xc0) == 0x80); --skip) t++; *buffer++ = UNICODE_REPLACEMENT; } else *buffer++ = UNICODE_REPLACEMENT; } else { /* NUL */ if (byte == 0) break; /* ASCII character, common case */ *buffer++ = byte; } } } /* u2_utflength **************************************************************** Returns the utf length in bytes of a u2 array. *******************************************************************************/ u4 u2_utflength(u2 *text, u4 u2_length) { u4 result_len = 0; /* utf length in bytes */ u2 ch; /* current unicode character */ u4 len; for (len = 0; len < u2_length; len++) { /* next unicode character */ ch = *text++; /* determine bytes required to store unicode character as utf */ if (ch && (ch < 0x80)) result_len++; else if (ch < 0x800) result_len += 2; else result_len += 3; } return result_len; } /* utf_copy ******************************************************************** Copy the given utf string byte-for-byte to a buffer. IN: buffer.......the buffer u............the utf string *******************************************************************************/ void utf_copy(char *buffer, utf *u) { /* our utf strings are zero-terminated (done by utf_new) */ MCOPY(buffer, u->text, char, u->blength + 1); } /* utf_cat ********************************************************************* Append the given utf string byte-for-byte to a buffer. IN: buffer.......the buffer u............the utf string *******************************************************************************/ void utf_cat(char *buffer, utf *u) { /* our utf strings are zero-terminated (done by utf_new) */ MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1); } /* utf_copy_classname ********************************************************** Copy the given utf classname byte-for-byte to a buffer. '/' is replaced by '.' IN: buffer.......the buffer u............the utf string *******************************************************************************/ void utf_copy_classname(char *buffer, utf *u) { char *bufptr; char *srcptr; char *endptr; char ch; bufptr = buffer; srcptr = u->text; endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */ while (srcptr != endptr) { ch = *srcptr++; if (ch == '/') ch = '.'; *bufptr++ = ch; } } /* utf_cat ********************************************************************* Append the given utf classname byte-for-byte to a buffer. '/' is replaced by '.' IN: buffer.......the buffer u............the utf string *******************************************************************************/ void utf_cat_classname(char *buffer, utf *u) { utf_copy_classname(buffer + strlen(buffer), u); } /* utf_display_printable_ascii ************************************************* Write utf symbol to stdout (for debugging purposes). Non-printable and non-ASCII characters are printed as '?'. *******************************************************************************/ void utf_display_printable_ascii(utf *u) { char *endpos; /* points behind utf string */ char *utf_ptr; /* current position in utf text */ if (u == NULL) { printf("NULL"); fflush(stdout); return; } endpos = UTF_END(u); utf_ptr = u->text; while (utf_ptr < endpos) { /* read next unicode character */ u2 c = utf_nextu2(&utf_ptr); if ((c >= 32) && (c <= 127)) printf("%c", c); else printf("?"); } fflush(stdout); } /* utf_display_printable_ascii_classname *************************************** Write utf symbol to stdout with `/' converted to `.' (for debugging purposes). Non-printable and non-ASCII characters are printed as '?'. *******************************************************************************/ void utf_display_printable_ascii_classname(utf *u) { char *endpos; /* points behind utf string */ char *utf_ptr; /* current position in utf text */ if (u == NULL) { printf("NULL"); fflush(stdout); return; } endpos = UTF_END(u); utf_ptr = u->text; while (utf_ptr < endpos) { /* read next unicode character */ u2 c = utf_nextu2(&utf_ptr); if (c == '/') c = '.'; if ((c >= 32) && (c <= 127)) printf("%c", c); else printf("?"); } fflush(stdout); } /* utf_sprint_convert_to_latin1 ************************************************ Write utf symbol into c-string (for debugging purposes). Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield invalid results. *******************************************************************************/ void utf_sprint_convert_to_latin1(char *buffer, utf *u) { char *endpos; /* points behind utf string */ char *utf_ptr; /* current position in utf text */ u2 pos = 0; /* position in c-string */ if (!u) { strcpy(buffer, "NULL"); return; } endpos = UTF_END(u); utf_ptr = u->text; while (utf_ptr < endpos) /* copy next unicode character */ buffer[pos++] = utf_nextu2(&utf_ptr); /* terminate string */ buffer[pos] = '\0'; } /* utf_sprint_convert_to_latin1_classname ************************************** Write utf symbol into c-string with `/' converted to `.' (for debugging purposes). Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield invalid results. *******************************************************************************/ void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u) { char *endpos; /* points behind utf string */ char *utf_ptr; /* current position in utf text */ u2 pos = 0; /* position in c-string */ if (!u) { strcpy(buffer, "NULL"); return; } endpos = UTF_END(u); utf_ptr = u->text; while (utf_ptr < endpos) { /* copy next unicode character */ u2 c = utf_nextu2(&utf_ptr); if (c == '/') c = '.'; buffer[pos++] = c; } /* terminate string */ buffer[pos] = '\0'; } /* utf_strcat_convert_to_latin1 ************************************************ Like libc strcat, but uses an utf8 string. Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield invalid results. *******************************************************************************/ void utf_strcat_convert_to_latin1(char *buffer, utf *u) { utf_sprint_convert_to_latin1(buffer + strlen(buffer), u); } /* utf_strcat_convert_to_latin1_classname ************************************** Like libc strcat, but uses an utf8 string. Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield invalid results. *******************************************************************************/ void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u) { utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u); } /* utf_fprint_printable_ascii ************************************************** Write utf symbol into file. Non-printable and non-ASCII characters are printed as '?'. *******************************************************************************/ void utf_fprint_printable_ascii(FILE *file, utf *u) { char *endpos; /* points behind utf string */ char *utf_ptr; /* current position in utf text */ if (!u) return; endpos = UTF_END(u); utf_ptr = u->text; while (utf_ptr < endpos) { /* read next unicode character */ u2 c = utf_nextu2(&utf_ptr); if (c >= 32 && c <= 127) fprintf(file, "%c", c); else fprintf(file, "?"); } } /* utf_fprint_printable_ascii_classname **************************************** Write utf symbol into file with `/' converted to `.'. Non-printable and non-ASCII characters are printed as '?'. *******************************************************************************/ void utf_fprint_printable_ascii_classname(FILE *file, utf *u) { char *endpos; /* points behind utf string */ char *utf_ptr; /* current position in utf text */ if (!u) return; endpos = UTF_END(u); utf_ptr = u->text; while (utf_ptr < endpos) { /* read next unicode character */ u2 c = utf_nextu2(&utf_ptr); if (c == '/') c = '.'; if (c >= 32 && c <= 127) fprintf(file, "%c", c); else fprintf(file, "?"); } } /* is_valid_utf **************************************************************** Return true if the given string is a valid UTF-8 string. utf_ptr...points to first character end_pos...points after last character *******************************************************************************/ /* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */ bool is_valid_utf(char *utf_ptr, char *end_pos) { int bytes; int len,i; char c; unsigned long v; if (end_pos < utf_ptr) return false; bytes = end_pos - utf_ptr; while (bytes--) { c = *utf_ptr++; if (!c) return false; /* 0x00 is not allowed */ if ((c & 0x80) == 0) continue; /* ASCII */ if ((c & 0xe0) == 0xc0) len = 1; /* 110x xxxx */ else if ((c & 0xf0) == 0xe0) len = 2; /* 1110 xxxx */ else if ((c & 0xf8) == 0xf0) len = 3; /* 1111 0xxx */ else if ((c & 0xfc) == 0xf8) len = 4; /* 1111 10xx */ else if ((c & 0xfe) == 0xfc) len = 5; /* 1111 110x */ else return false; /* invalid leading byte */ if (len > 2) return false; /* Java limitation */ v = (unsigned long)c & (0x3f >> len); if ((bytes -= len) < 0) return false; /* missing bytes */ for (i = len; i--; ) { c = *utf_ptr++; if ((c & 0xc0) != 0x80) /* 10xx xxxx */ return false; v = (v << 6) | (c & 0x3f); } if (v == 0) { if (len != 1) return false; /* Java special */ } else { /* Sun Java seems to allow overlong UTF-8 encodings */ /* if (v < min_codepoint[len]) */ /* XXX throw exception? */ } /* surrogates in UTF-8 seem to be allowed in Java classfiles */ /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */ /* even these seem to be allowed */ /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */ } return true; } /* is_valid_name *************************************************************** Return true if the given string may be used as a class/field/method name. (Currently this only disallows empty strings and control characters.) NOTE: The string is assumed to have passed is_valid_utf! utf_ptr...points to first character end_pos...points after last character *******************************************************************************/ bool is_valid_name(char *utf_ptr, char *end_pos) { if (end_pos <= utf_ptr) return false; /* disallow empty names */ while (utf_ptr < end_pos) { unsigned char c = *utf_ptr++; if (c < 0x20) return false; /* disallow control characters */ if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80) /* disallow zero */ return false; } return true; } bool is_valid_name_utf(utf *u) { return is_valid_name(u->text, UTF_END(u)); } /* utf_show ******************************************************************** Writes the utf symbols in the utfhash to stdout and displays the number of external hash chains grouped according to the chainlength (for debugging purposes). *******************************************************************************/ #if !defined(NDEBUG) void utf_show(void) { #define CHAIN_LIMIT 20 /* limit for seperated enumeration */ u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */ u4 max_chainlength = 0; /* maximum length of the chains */ u4 sum_chainlength = 0; /* sum of the chainlengths */ u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */ u4 i; printf("UTF-HASH:\n"); /* show element of utf-hashtable */ for (i = 0; i < hashtable_utf->size; i++) { utf *u = hashtable_utf->ptr[i]; if (u) { printf("SLOT %d: ", (int) i); while (u) { printf("'"); utf_display_printable_ascii(u); printf("' "); u = u->hashlink; } printf("\n"); } } printf("UTF-HASH: %d slots for %d entries\n", (int) hashtable_utf->size, (int) hashtable_utf->entries ); if (hashtable_utf->entries == 0) return; printf("chains:\n chainlength number of chains %% of utfstrings\n"); for (i=0;isize; i++) { utf *u = (utf*) hashtable_utf->ptr[i]; u4 chain_length = 0; /* determine chainlength */ while (u) { u = u->hashlink; chain_length++; } /* update sum of all chainlengths */ sum_chainlength+=chain_length; /* determine the maximum length of the chains */ if (chain_length>max_chainlength) max_chainlength = chain_length; /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */ if (chain_length>=CHAIN_LIMIT) { beyond_limit+=chain_length; chain_length=CHAIN_LIMIT-1; } /* update number of hashchains of current length */ chain_count[chain_length]++; } /* display results */ for (i=1;ientries)); printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries); printf("max. chainlength:%5d\n",max_chainlength); /* avg. chainlength = sum of chainlengths / number of chains */ printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0])); } #endif /* !defined(NDEBUG) */ /* * These are local overrides for various environment variables in Emacs. * Please do not remove this and leave it at the end of the file, where * Emacs will automagically detect them. * --------------------------------------------------------------------- * Local variables: * mode: c * indent-tabs-mode: t * c-basic-offset: 4 * tab-width: 4 * End: * vim:noexpandtab:sw=4:ts=4: */