/* src/vm/utf.c - utf functions

   Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
   C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
   E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
   J. Wenninger, Institut f. Computersprachen - TU Wien

   This file is part of CACAO.

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2, or (at
   your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   Contact: cacao@cacaojvm.org

   Authors: Reinhard Grafl

   $Id: utf8.c 4879 2006-05-05 17:34:49Z edwin $

*/
45 #include "mm/memory.h"
47 #if defined(USE_THREADS)
48 # if defined(NATIVE_THREADS)
49 # include "threads/native/threads.h"
51 # include "threads/green/threads.h"
55 #include "vm/builtin.h"
56 #include "vm/exceptions.h"
57 #include "vm/hashtable.h"
58 #include "vm/options.h"
59 #include "vm/statistics.h"
60 #include "vm/stringlocal.h"
63 /* global variables ***********************************************************/
65 /* hashsize must be power of 2 */
/* Number of slots the utf hashtable is created with.  utf_new() derives the
   slot as key & (size - 1), which is why the size has to be a power of 2. */
67 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
69 hashtable hashtable_utf; /* hashtable for utf8-symbols */
/* NOTE(review): no closing #endif for this conditional is visible in this
   copy of the file -- confirm it is present in the real source. */
71 #if defined(USE_THREADS)
/* object handed to builtin_monitorenter()/builtin_monitorexit() in utf_new() */
72 static java_objectheader *lock_hashtable_utf;
76 /* utf-symbols for pointer comparison of frequently used strings **************/
/* class names */
78 utf *utf_java_lang_Object;
80 utf *utf_java_lang_Class;
81 utf *utf_java_lang_ClassLoader;
82 utf *utf_java_lang_Cloneable;
83 utf *utf_java_lang_SecurityManager;
84 utf *utf_java_lang_String;
85 utf *utf_java_lang_System;
86 utf *utf_java_lang_ThreadGroup;
87 utf *utf_java_io_Serializable;
/* throwable and error class names (filled from the string_java_lang_*
   constants during initialization) */
89 utf *utf_java_lang_Throwable;
90 utf *utf_java_lang_VMThrowable;
91 utf *utf_java_lang_Error;
92 utf *utf_java_lang_NoClassDefFoundError;
93 utf *utf_java_lang_LinkageError;
94 utf *utf_java_lang_NoSuchMethodError;
95 utf *utf_java_lang_OutOfMemoryError;
/* exception class names */
97 utf *utf_java_lang_Exception;
98 utf *utf_java_lang_ClassNotFoundException;
99 utf *utf_java_lang_IllegalArgumentException;
100 utf *utf_java_lang_IllegalMonitorStateException;
102 utf *utf_java_lang_NullPointerException;
/* java/lang/Void ... java/lang/Double */
104 utf* utf_java_lang_Void;
105 utf* utf_java_lang_Boolean;
106 utf* utf_java_lang_Byte;
107 utf* utf_java_lang_Character;
108 utf* utf_java_lang_Short;
109 utf* utf_java_lang_Integer;
110 utf* utf_java_lang_Long;
111 utf* utf_java_lang_Float;
112 utf* utf_java_lang_Double;
114 utf *utf_java_lang_StackTraceElement;
115 utf *utf_java_lang_reflect_Constructor;
116 utf *utf_java_lang_reflect_Field;
117 utf *utf_java_lang_reflect_Method;
118 utf *utf_java_util_Vector;
120 utf *utf_InnerClasses; /* InnerClasses */
121 utf *utf_ConstantValue; /* ConstantValue */
122 utf *utf_Code; /* Code */
123 utf *utf_Exceptions; /* Exceptions */
124 utf *utf_LineNumberTable; /* LineNumberTable */
125 utf *utf_SourceFile; /* SourceFile */
127 utf *utf_init; /* <init> */
128 utf *utf_clinit; /* <clinit> */
129 utf *utf_clone; /* clone */
130 utf *utf_finalize; /* finalize */
131 utf *utf_run; /* run */
133 utf *utf_add; /* add */
134 utf *utf_remove; /* remove */
135 utf *utf_put; /* put */
136 utf *utf_get; /* get */
137 utf *utf_value; /* value */
139 utf *utf_fillInStackTrace; /* fillInStackTrace */
140 utf *utf_getSystemClassLoader; /* getSystemClassLoader */
142 utf *utf_printStackTrace; /* printStackTrace */
/* signature strings; the naming scheme is <parameters>__<result> */
153 utf *utf_void__void; /* ()V */
154 utf *utf_boolean__void; /* (Z)V */
155 utf *utf_byte__void; /* (B)V */
156 utf *utf_char__void; /* (C)V */
157 utf *utf_short__void; /* (S)V */
158 utf *utf_int__void; /* (I)V */
159 utf *utf_long__void; /* (J)V */
160 utf *utf_float__void; /* (F)V */
161 utf *utf_double__void; /* (D)V */
163 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
164 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
165 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
166 utf *utf_java_lang_Object__java_lang_Object; /* (Ljava/lang/Object;)Ljava/lang/Object; */
167 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
168 utf *utf_java_lang_String__java_lang_Class; /* (Ljava/lang/String;)Ljava/lang/Class; */
169 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
171 utf *utf_not_named_yet; /* special name for unnamed classes */
173 utf *array_packagename; /* "\t<the array package>" */
176 /* utf_init ********************************************************************
178 Initializes the utf8 subsystem.

   Creates the global utf hashtable with HASHTABLE_UTF_SIZE slots, sets up
   the lock object when USE_THREADS is defined, and then interns every
   frequently used string through utf_new_char(), storing the resulting
   pointers in the file-level utf* globals so that later code can compare
   symbols by pointer.

   NOTE(review): the signature line of this function is not visible in this
   copy of the file.  It cannot be called utf_init as the banner suggests,
   because utf_init is a utf* global that is assigned below -- confirm the
   real name and return type against the header.

180 *******************************************************************************/
184 /* create utf8 hashtable */
186 hashtable_create(&hashtable_utf, HASHTABLE_UTF_SIZE);
188 #if defined(ENABLE_STATISTICS)
/* account for the memory taken by the slot array */
190 count_utf_len += sizeof(utf*) * hashtable_utf.size;
193 #if defined(USE_THREADS)
194 /* create utf hashtable lock object */
196 lock_hashtable_utf = NEW(java_objectheader);
198 # if defined(NATIVE_THREADS)
199 initObjectLock(lock_hashtable_utf);
/* NOTE(review): the #endif lines closing the conditionals above are not
   visible in this copy of the file. */
203 /* create utf-symbols for pointer comparison of frequently used strings */
205 utf_java_lang_Object = utf_new_char("java/lang/Object");
207 utf_java_lang_Class = utf_new_char("java/lang/Class");
208 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
209 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
210 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
211 utf_java_lang_String = utf_new_char("java/lang/String");
212 utf_java_lang_System = utf_new_char("java/lang/System");
213 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
214 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
/* these names are taken from string_java_lang_* constants instead of
   literals; the constants are defined outside this file */
216 utf_java_lang_Throwable = utf_new_char(string_java_lang_Throwable);
217 utf_java_lang_VMThrowable = utf_new_char(string_java_lang_VMThrowable);
218 utf_java_lang_Error = utf_new_char(string_java_lang_Error);
220 utf_java_lang_NoClassDefFoundError =
221 utf_new_char(string_java_lang_NoClassDefFoundError);
223 utf_java_lang_LinkageError =
224 utf_new_char(string_java_lang_LinkageError);
226 utf_java_lang_NoSuchMethodError =
227 utf_new_char(string_java_lang_NoSuchMethodError);
229 utf_java_lang_OutOfMemoryError =
230 utf_new_char(string_java_lang_OutOfMemoryError);
232 utf_java_lang_Exception = utf_new_char(string_java_lang_Exception);
234 utf_java_lang_ClassNotFoundException =
235 utf_new_char(string_java_lang_ClassNotFoundException);
237 utf_java_lang_IllegalArgumentException =
238 utf_new_char(string_java_lang_IllegalArgumentException);
240 utf_java_lang_IllegalMonitorStateException =
241 utf_new_char(string_java_lang_IllegalMonitorStateException);
243 utf_java_lang_NullPointerException =
244 utf_new_char(string_java_lang_NullPointerException);
246 utf_java_lang_Void = utf_new_char("java/lang/Void");
247 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
248 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
249 utf_java_lang_Character = utf_new_char("java/lang/Character");
250 utf_java_lang_Short = utf_new_char("java/lang/Short");
251 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
252 utf_java_lang_Long = utf_new_char("java/lang/Long");
253 utf_java_lang_Float = utf_new_char("java/lang/Float");
254 utf_java_lang_Double = utf_new_char("java/lang/Double");
256 utf_java_lang_StackTraceElement =
257 utf_new_char("java/lang/StackTraceElement");
259 utf_java_lang_reflect_Constructor =
260 utf_new_char("java/lang/reflect/Constructor");
262 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
263 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
264 utf_java_util_Vector = utf_new_char("java/util/Vector");
266 utf_InnerClasses = utf_new_char("InnerClasses");
267 utf_ConstantValue = utf_new_char("ConstantValue");
268 utf_Code = utf_new_char("Code");
269 utf_Exceptions = utf_new_char("Exceptions");
270 utf_LineNumberTable = utf_new_char("LineNumberTable");
271 utf_SourceFile = utf_new_char("SourceFile");
273 utf_init = utf_new_char("<init>");
274 utf_clinit = utf_new_char("<clinit>");
275 utf_clone = utf_new_char("clone");
276 utf_finalize = utf_new_char("finalize");
277 utf_run = utf_new_char("run");
279 utf_add = utf_new_char("add");
280 utf_remove = utf_new_char("remove");
281 utf_put = utf_new_char("put");
282 utf_get = utf_new_char("get");
283 utf_value = utf_new_char("value");
285 utf_printStackTrace = utf_new_char("printStackTrace");
286 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
/* NOTE(review): no declaration of utf_loadClass or of utf_Z ... utf_D is
   visible in this copy of the file -- confirm they are declared. */
287 utf_loadClass = utf_new_char("loadClass");
288 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
290 utf_Z = utf_new_char("Z");
291 utf_B = utf_new_char("B");
292 utf_C = utf_new_char("C");
293 utf_S = utf_new_char("S");
294 utf_I = utf_new_char("I");
295 utf_J = utf_new_char("J");
296 utf_F = utf_new_char("F");
297 utf_D = utf_new_char("D");
299 utf_void__void = utf_new_char("()V");
300 utf_boolean__void = utf_new_char("(Z)V");
301 utf_byte__void = utf_new_char("(B)V");
302 utf_char__void = utf_new_char("(C)V");
303 utf_short__void = utf_new_char("(S)V");
304 utf_int__void = utf_new_char("(I)V");
305 utf_long__void = utf_new_char("(J)V");
306 utf_float__void = utf_new_char("(F)V");
307 utf_double__void = utf_new_char("(D)V");
308 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
309 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
311 utf_void__java_lang_ClassLoader =
312 utf_new_char("()Ljava/lang/ClassLoader;");
314 utf_java_lang_Object__java_lang_Object =
315 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
317 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
319 utf_java_lang_String__java_lang_Class =
320 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
322 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
/* both placeholder names below start with a tab character */
324 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
326 array_packagename = utf_new_char("\t<the array package>");
328 /* everything's ok */
334 /* utf_hashkey *****************************************************************
336 The hashkey is computed from the utf-text by using up to 8
337 characters. For utf-symbols longer than 15 characters 3 characters
338 are taken from the beginning and the end, 2 characters are taken
341 *******************************************************************************/
/* Both helper macros operate on a variable named `text` that must be in
   scope at the point of use.  nbs() pre-increments `text` before reading,
   so every use advances the cursor by one byte; fbs() reads the current
   byte without moving.

   NOTE(review): the case expressions below use fbs() and several nbs() as
   operands of `^` in one expression.  The operands of `^` are unsequenced
   in C, so the repeated `++text` (and the plain read in fbs) do not have a
   defined order.  The result relies on the compiler evaluating
   left-to-right -- consider rewriting with indexed accesses. */
343 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
344 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
/* Computes the hash key for `length` bytes starting at `text`.  Strings of
   up to 8 bytes mix every byte (cases 1-8 below).  For longer strings only
   a selection of bytes is used (see the banner comment above and the
   default branch, which samples the beginning, the position
   start_pos + length / 2 and the position start_pos + length - 4).

   NOTE(review): many lines of this function (the switch header, the
   accumulator declaration and most of cases 9-15 and of the default
   branch) are missing from this copy of the file. */
346 u4 utf_hashkey(const char *text, u4 length)
348 const char *start_pos = text; /* pointer to utf text */
352 case 0: /* empty string */
355 case 1: return fbs(0);
356 case 2: return fbs(0) ^ nbs(3);
357 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
358 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
359 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
360 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
361 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
362 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
/* the remaining returns combine a partial result `a` with further bytes;
   the statements that build `a` are not visible here */
369 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
378 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
387 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
399 return a ^ nbs(9) ^ nbs(10);
411 return a ^ nbs(9) ^ nbs(10);
422 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
433 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
435 default: /* 3 characters from beginning */
441 /* 2 characters from middle */
/* reposition the cursor; nbs() pre-increments, so the first byte read
   after this assignment is the one at start_pos + length / 2 + 1 */
442 text = start_pos + (length / 2);
447 /* 3 characters from end */
448 text = start_pos + length - 4;
453 return a ^ nbs(10) ^ nbs(11);
457 /* utf_full_hashkey ************************************************************
459 This function computes a hash value using all bytes in the string.
461 The algorithm is the "One-at-a-time" algorithm as published
462 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
464 *******************************************************************************/
466 u4 utf_full_hashkey(const char *text, u4 length)
468 register const unsigned char *p = (const unsigned char *) text;
476 hash += (hash << 10);
480 hash ^= (hash >> 11);
481 hash += (hash << 15);
486 /* unicode_hashkey *************************************************************
488 Compute the hashkey of a unicode string.
490 *******************************************************************************/
492 u4 unicode_hashkey(u2 *text, u2 len)
494 return utf_hashkey((char *) text, len);
498 /* utf_new *********************************************************************
500 Creates a new utf-symbol, the text of the symbol is passed as a
501 u1-array. The function searches the utf-hashtable for a utf-symbol
502 with this text. On success the element returned, otherwise a new
503 hashtable element is created.
505 If the number of entries in the hashtable exceeds twice the size of
506 the hashtable slots a reorganization of the hashtable is done and
507 the utf symbols are copied to a new hashtable with doubled size.
509 *******************************************************************************/
/* Looks up the `length` bytes at `text` in hashtable_utf and inserts a new
   element if no match is found.

   What the visible code does:
     - the slot is utf_hashkey(text, length) & (hashtable_utf.size - 1);
     - an element of the slot's chain matches when its blength equals
       `length` and all bytes compare equal;
     - a new element gets a private copy of the text (length + 1 bytes,
       NUL-terminated) and is linked in at the head of the slot's chain;
     - when the number of entries exceeds twice the number of slots, a
       table of double size is created and every element is relinked;
     - with USE_THREADS the work is bracketed by
       builtin_monitorenter()/builtin_monitorexit() on lock_hashtable_utf.

   NOTE(review): several lines are missing from this copy of the file (the
   chain loop headers, the mismatch handling inside the compare loop, the
   return statements, the allocation of the new utf element and the #endif
   lines), so the exact control flow cannot be confirmed from here. */
511 utf *utf_new(const char *text, u2 length)
513 u4 key; /* hashkey computed from utf-text */
514 u4 slot; /* slot in hashtable */
515 utf *u; /* hashtable element */
518 #if defined(USE_THREADS)
519 builtin_monitorenter(lock_hashtable_utf);
522 #if defined(ENABLE_STATISTICS)
527 key = utf_hashkey(text, length);
/* masking works because the table size is kept a power of 2 */
528 slot = key & (hashtable_utf.size - 1);
529 u = hashtable_utf.ptr[slot];
531 /* search external hash chain for utf-symbol */
/* cheap length comparison first, byte comparison only on equal length */
534 if (u->blength == length) {
535 /* compare text of hashtable elements */
537 for (i = 0; i < length; i++)
538 if (text[i] != u->text[i])
541 #if defined(ENABLE_STATISTICS)
543 count_utf_new_found++;
546 /* symbol found in hashtable */
/* the lock is released on this path as well */
548 #if defined(USE_THREADS)
549 builtin_monitorexit(lock_hashtable_utf);
556 u = u->hashlink; /* next element in external chain */
559 #if defined(ENABLE_STATISTICS)
561 count_utf_len += sizeof(utf) + length + 1;
564 /* location in hashtable found, create new utf element */
566 u->blength = length; /* length in bytes of utfstring */
567 u->hashlink = hashtable_utf.ptr[slot]; /* link in external hashchain */
/* one extra byte for the terminating NUL written below */
568 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
570 memcpy(u->text, text, length); /* copy utf-text */
571 u->text[length] = '\0';
573 hashtable_utf.ptr[slot] = u; /* insert symbol into table */
574 hashtable_utf.entries++; /* update number of entries */
576 if (hashtable_utf.entries > (hashtable_utf.size * 2)) {
578 /* reorganization of hashtable, average length of the external
579 chains is approx. 2 */
581 hashtable newhash; /* the new hashtable */
587 /* create new hashtable, double the size */
589 hashtable_create(&newhash, hashtable_utf.size * 2);
590 newhash.entries = hashtable_utf.entries;
592 #if defined(ENABLE_STATISTICS)
594 count_utf_len += sizeof(utf*) * hashtable_utf.size;
597 /* transfer elements to new hashtable */
/* NOTE: `u` and `slot` are reused as scratch variables from here on */
599 for (i = 0; i < hashtable_utf.size; i++) {
600 u = hashtable_utf.ptr[i];
/* the key is recomputed from the stored text and masked with the new size */
604 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
606 u->hashlink = (utf *) newhash.ptr[slot];
607 newhash.ptr[slot] = u;
609 /* follow link in external hash chain */
615 /* dispose old table */
/* only the old slot array is freed; the elements now live in newhash */
617 MFREE(hashtable_utf.ptr, void*, hashtable_utf.size);
618 hashtable_utf = newhash;
621 #if defined(USE_THREADS)
622 builtin_monitorexit(lock_hashtable_utf);
629 /* utf_new_u2 ******************************************************************
631 Make utf symbol from u2 array, if isclassname is true '.' is
634 *******************************************************************************/
636 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
638 char *buffer; /* memory buffer for unicode characters */
639 char *pos; /* pointer to current position in buffer */
640 u4 left; /* unicode characters left */
641 u4 buflength; /* utf length in bytes of the u2 array */
642 utf *result; /* resulting utf-string */
645 /* determine utf length in bytes and allocate memory */
647 buflength = u2_utflength(unicode_pos, unicode_length);
648 buffer = MNEW(char, buflength);
653 for (i = 0; i++ < unicode_length; unicode_pos++) {
654 /* next unicode character */
657 if ((c != 0) && (c < 0x80)) {
660 if ((int) left < 0) break;
661 /* convert classname */
662 if (isclassname && c == '.')
667 } else if (c < 0x800) {
669 unsigned char high = c >> 6;
670 unsigned char low = c & 0x3F;
672 if ((int) left < 0) break;
673 *pos++ = high | 0xC0;
679 char mid = (c >> 6) & 0x3F;
682 if ((int) left < 0) break;
683 *pos++ = high | 0xE0;
689 /* insert utf-string into symbol-table */
690 result = utf_new(buffer,buflength);
692 MFREE(buffer, char, buflength);
698 /* utf_new_char ****************************************************************
700 Creates a new utf symbol, the text for this symbol is passed as a
701 c-string ( = char* ).
703 *******************************************************************************/
705 utf *utf_new_char(const char *text)
707 return utf_new(text, strlen(text));
711 /* utf_new_char_classname ******************************************************
713 Creates a new utf symbol, the text for this symbol is passed as a
714 c-string ( = char* ) "." characters are going to be replaced by
   "/". Since the above function is used often, this is a separate
   function, instead of an if.
718 *******************************************************************************/
720 utf *utf_new_char_classname(const char *text)
722 if (strchr(text, '.')) {
723 char *txt = strdup(text);
724 char *end = txt + strlen(txt);
728 for (c = txt; c < end; c++)
729 if (*c == '.') *c = '/';
731 tmpRes = utf_new(txt, strlen(txt));
737 return utf_new(text, strlen(text));
741 /* utf_nextu2 ******************************************************************
743 Read the next unicode character from the utf string and increment
744 the utf-string pointer accordingly.
746 *******************************************************************************/
/* Decodes the character at *utf_ptr and moves *utf_ptr past it.

   IN/OUT:
       utf_ptr......address of the read cursor; updated by this function

   The function dispatches on the upper four bits of the first byte and
   accepts a following byte as a continuation byte only if its top two
   bits are 10 ((b & 0xC0) == 0x80).

   The function receives no end pointer: it reads utf[1] and utf[2]
   whenever the first byte asks for them.  Strings created by utf_new()
   carry a terminating '\0', which fails the continuation-byte test.

   NOTE(review): the result and length declarations, the per-case length
   assignments, the break/return statements and at least one case label
   are missing from this copy of the file, so the behaviour for malformed
   input cannot be confirmed from here. */
748 u2 utf_nextu2(char **utf_ptr)
750 /* uncompressed unicode character */
752 /* current position in utf text */
753 unsigned char *utf = (unsigned char *) (*utf_ptr);
754 /* bytes representing the unicode character */
755 unsigned char ch1, ch2, ch3;
756 /* number of bytes used to represent the unicode character */
759 switch ((ch1 = utf[0]) >> 4) {
760 default: /* 1 byte */
764 case 0xD: /* 2 bytes */
765 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
/* 5 payload bits from the first byte, 6 from the second */
766 unsigned char high = ch1 & 0x1F;
767 unsigned char low = ch2 & 0x3F;
768 unicode_char = (high << 6) + low;
773 case 0xE: /* 2 or 3 bytes */
774 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
775 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
/* 4 payload bits from the first byte, 6 from each following byte */
776 unsigned char low = ch3 & 0x3f;
777 unsigned char mid = ch2 & 0x3f;
778 unsigned char high = ch1 & 0x0f;
779 unicode_char = (((high << 6) + mid) << 6) + low;
787 /* update position in utf-text */
788 *utf_ptr = (char *) (utf + len);
794 /* utf_bytes *******************************************************************
796 Determine number of bytes (aka. octets) in the utf string.
799 u............utf string
802 The number of octets of this utf string.
803 There is _no_ terminating zero included in this count.
805 *******************************************************************************/
812 /* utf_get_number_of_u2s_for_buffer ********************************************
814 Determine number of UTF-16 u2s in the given UTF-8 buffer
816 CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
817 to an array of u2s (UTF-16) and want to know how many of them you will get.
818 All other uses of this function are probably wrong.
821 buffer........points to first char in buffer
822 blength.......number of _bytes_ in the buffer
825 the number of u2s needed to hold this string in UTF-16 encoding.
826 There is _no_ terminating zero included in this count.
828 NOTE: Unlike utf_get_number_of_u2s, this function never throws an
831 *******************************************************************************/
833 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
835 const char *endpos; /* points behind utf string */
836 const char *utf_ptr; /* current position in utf text */
837 u4 len = 0; /* number of unicode characters */
840 endpos = utf_ptr + blength;
842 while (utf_ptr < endpos) {
844 /* next unicode character */
845 utf_nextu2((char **)&utf_ptr);
848 assert(utf_ptr == endpos);
854 /* utf_get_number_of_u2s *******************************************************
856 Determine number of UTF-16 u2s in the utf string.
858 CAUTION: Use this function *only* when you want to convert a utf string
859 to an array of u2s and want to know how many of them you will get.
860 All other uses of this function are probably wrong.
863 u............utf string
866 the number of u2s needed to hold this string in UTF-16 encoding.
867 There is _no_ terminating zero included in this count.
868 XXX 0 if a NullPointerException has been thrown (see below)
870 *******************************************************************************/
872 u4 utf_get_number_of_u2s(utf *u)
874 char *endpos; /* points behind utf string */
875 char *utf_ptr; /* current position in utf text */
876 u4 len = 0; /* number of unicode characters */
878 /* XXX this is probably not checked by most callers! Review this after */
879 /* the invalid uses of this function have been eliminated */
881 exceptions_throw_nullpointerexception();
888 while (utf_ptr < endpos) {
890 /* next unicode character */
891 utf_nextu2(&utf_ptr);
894 if (utf_ptr != endpos)
895 /* string ended abruptly */
896 throw_cacao_exception_exit(string_java_lang_InternalError,
897 "Illegal utf8 string");
903 /* u2_utflength ****************************************************************
905 Returns the utf length in bytes of a u2 array.
907 *******************************************************************************/
909 u4 u2_utflength(u2 *text, u4 u2_length)
911 u4 result_len = 0; /* utf length in bytes */
912 u2 ch; /* current unicode character */
915 for (len = 0; len < u2_length; len++) {
916 /* next unicode character */
919 /* determine bytes required to store unicode character as utf */
920 if (ch && (ch < 0x80))
932 /* utf_copy ********************************************************************
934 Copy the given utf string byte-for-byte to a buffer.
937 buffer.......the buffer
938 u............the utf string
940 *******************************************************************************/
942 void utf_copy(char *buffer, utf *u)
944 /* our utf strings are zero-terminated (done by utf_new) */
945 MCOPY(buffer, u->text, char, u->blength + 1);
949 /* utf_cat *********************************************************************
951 Append the given utf string byte-for-byte to a buffer.
954 buffer.......the buffer
955 u............the utf string
957 *******************************************************************************/
959 void utf_cat(char *buffer, utf *u)
961 /* our utf strings are zero-terminated (done by utf_new) */
962 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
966 /* utf_copy_classname **********************************************************
968 Copy the given utf classname byte-for-byte to a buffer.
969 '/' is replaced by '.'
972 buffer.......the buffer
973 u............the utf string
975 *******************************************************************************/
977 void utf_copy_classname(char *buffer, utf *u)
986 endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
988 while (srcptr != endptr) {
/* utf_cat_classname ***********************************************************
999 Append the given utf classname byte-for-byte to a buffer.
1000 '/' is replaced by '.'
1003 buffer.......the buffer
1004 u............the utf string
1006 *******************************************************************************/
1008 void utf_cat_classname(char *buffer, utf *u)
1010 utf_copy_classname(buffer + strlen(buffer), u);
1013 /* utf_display_printable_ascii *************************************************
1015 Write utf symbol to stdout (for debugging purposes).
1016 Non-printable and non-ASCII characters are printed as '?'.
1018 *******************************************************************************/
/* Walks the text of `u` with utf_nextu2() and classifies every decoded
   character by the range test below.

   NOTE(review): the statements that produce the output, the
   initialization of utf_ptr and whatever sits between the declarations
   and the UTF_END() call are missing from this copy of the file.
   NOTE(review): the range test accepts 127 (DEL), which is not a
   printable character. */
1020 void utf_display_printable_ascii(utf *u)
1022 char *endpos; /* points behind utf string */
1023 char *utf_ptr; /* current position in utf text */
1031 endpos = UTF_END(u);
1034 while (utf_ptr < endpos) {
1035 /* read next unicode character */
1037 u2 c = utf_nextu2(&utf_ptr);
1039 if ((c >= 32) && (c <= 127))
1049 /* utf_display_printable_ascii_classname ***************************************
1051 Write utf symbol to stdout with `/' converted to `.' (for debugging
1053 Non-printable and non-ASCII characters are printed as '?'.
1055 *******************************************************************************/
/* Walks the text of `u` with utf_nextu2() and classifies every decoded
   character by the range test below.

   NOTE(review): the statements that produce the output, the `/' to `.'
   conversion announced in the banner comment, the initialization of
   utf_ptr and whatever sits between the declarations and the UTF_END()
   call are missing from this copy of the file.
   NOTE(review): the range test accepts 127 (DEL), which is not a
   printable character. */
1057 void utf_display_printable_ascii_classname(utf *u)
1059 char *endpos; /* points behind utf string */
1060 char *utf_ptr; /* current position in utf text */
1068 endpos = UTF_END(u);
1071 while (utf_ptr < endpos) {
1072 /* read next unicode character */
1074 u2 c = utf_nextu2(&utf_ptr);
1079 if ((c >= 32) && (c <= 127))
1089 /* utf_sprint_convert_to_latin1 ************************************************
1091 Write utf symbol into c-string (for debugging purposes).
1092 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1095 *******************************************************************************/
1097 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1099 char *endpos; /* points behind utf string */
1100 char *utf_ptr; /* current position in utf text */
1101 u2 pos = 0; /* position in c-string */
1104 strcpy(buffer, "NULL");
1108 endpos = UTF_END(u);
1111 while (utf_ptr < endpos)
1112 /* copy next unicode character */
1113 buffer[pos++] = utf_nextu2(&utf_ptr);
1115 /* terminate string */
1120 /* utf_sprint_convert_to_latin1_classname **************************************
1122 Write utf symbol into c-string with `/' converted to `.' (for debugging
1124 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1127 *******************************************************************************/
1129 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1131 char *endpos; /* points behind utf string */
1132 char *utf_ptr; /* current position in utf text */
1133 u2 pos = 0; /* position in c-string */
1136 strcpy(buffer, "NULL");
1140 endpos = UTF_END(u);
1143 while (utf_ptr < endpos) {
1144 /* copy next unicode character */
1145 u2 c = utf_nextu2(&utf_ptr);
1146 if (c == '/') c = '.';
1150 /* terminate string */
1155 /* utf_strcat_convert_to_latin1 ************************************************
1157 Like libc strcat, but uses an utf8 string.
1158 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1161 *******************************************************************************/
1163 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1165 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1169 /* utf_strcat_convert_to_latin1_classname **************************************
1171 Like libc strcat, but uses an utf8 string.
1172 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1175 *******************************************************************************/
1177 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1179 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1183 /* utf_fprint_printable_ascii **************************************************
1185 Write utf symbol into file.
1186 Non-printable and non-ASCII characters are printed as '?'.
1188 *******************************************************************************/
1190 void utf_fprint_printable_ascii(FILE *file, utf *u)
1192 char *endpos; /* points behind utf string */
1193 char *utf_ptr; /* current position in utf text */
1198 endpos = UTF_END(u);
1201 while (utf_ptr < endpos) {
1202 /* read next unicode character */
1203 u2 c = utf_nextu2(&utf_ptr);
1205 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1206 else fprintf(file, "?");
1211 /* utf_fprint_printable_ascii_classname ****************************************
1213 Write utf symbol into file with `/' converted to `.'.
1214 Non-printable and non-ASCII characters are printed as '?'.
1216 *******************************************************************************/
1218 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1220 char *endpos; /* points behind utf string */
1221 char *utf_ptr; /* current position in utf text */
1226 endpos = UTF_END(u);
1229 while (utf_ptr < endpos) {
1230 /* read next unicode character */
1231 u2 c = utf_nextu2(&utf_ptr);
1232 if (c == '/') c = '.';
1234 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1235 else fprintf(file, "?");
1240 /* is_valid_utf ****************************************************************
1242 Return true if the given string is a valid UTF-8 string.
1244 utf_ptr...points to first character
1245 end_pos...points after last character
1247 *******************************************************************************/
1249 /* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */
/* Checks that the bytes in [utf_ptr, end_pos) form a well-formed sequence
   of 1-, 2- and 3-byte characters.

   IN:
       utf_ptr......points to first character
       end_pos......points after last character

   RETURN VALUE:
       true.........the range is valid (an empty range is valid)
       false........end_pos lies before utf_ptr, or the range contains a
                    0x00 byte, an invalid leading byte, a sequence longer
                    than 3 bytes, a truncated sequence, a byte that is not
                    10xx xxxx where a continuation byte is required, or a
                    3-byte encoding of the value zero

   The 2-byte encoding of zero is accepted, and so are overlong encodings
   of other values, surrogates and 0xfffe/0xffff (see the notes kept at
   the end of the loop body).                                             */

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int           bytes;                /* bytes not yet consumed          */
	int           len;                  /* number of continuation bytes    */
	int           i;
	unsigned char c;                    /* read unsigned: masks stay exact */
	unsigned long v;                    /* decoded value                   */

	if (end_pos < utf_ptr) return false;
	bytes = end_pos - utf_ptr;

	while (bytes--) {
		c = (unsigned char) *utf_ptr++;

		if (!c) return false;                     /* 0x00 is not allowed */
		if ((c & 0x80) == 0) continue;            /* ASCII */

		if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
		else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
		else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
		else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
		else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
		else return false;                        /* invalid leading byte */

		if (len > 2) return false;                /* Java limitation */

		/* payload bits of the leading byte */
		v = (unsigned long) c & (0x3f >> len);

		if ((bytes -= len) < 0) return false;     /* missing bytes */

		for (i = len; i--; ) {
			c = (unsigned char) *utf_ptr++;

			if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
				return false;

			v = (v << 6) | (c & 0x3f);
		}

		if (v == 0) {
			/* zero may only be written as the 2-byte sequence */
			if (len != 1) return false;           /* Java special */
		}

		/* Sun Java seems to allow overlong UTF-8 encodings, so values
		   are not checked against a per-length minimum. */

		/* surrogates in UTF-8 seem to be allowed in Java classfiles */
		/* even 0xfffe and 0xffff seem to be allowed */
	}

	return true;
}
1307 /* is_valid_name ***************************************************************
1309 Return true if the given string may be used as a class/field/method
1310 name. (Currently this only disallows empty strings and control
1313 NOTE: The string is assumed to have passed is_valid_utf!
1315 utf_ptr...points to first character
1316 end_pos...points after last character
1318 *******************************************************************************/
/* Checks whether the bytes in [utf_ptr, end_pos) may be used as a name.

   IN:
       utf_ptr......points to first character
       end_pos......points after last character

   RETURN VALUE:
       false........the range is empty, contains a byte below 0x20, or
                    contains the byte pair 0xc0 0x80
       true.........otherwise

   NOTE: The string is assumed to have passed is_valid_utf!  The byte
   following 0xc0 is only inspected while it is still inside the range,
   so a range that ends in 0xc0 never causes a read at end_pos.           */

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr) return false; /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20) return false; /* disallow control characters */

		if (c == 0xc0 && utf_ptr < end_pos
			&& (unsigned char) *utf_ptr == 0x80)  /* disallow zero */
			return false;
	}

	return true;
}
1335 bool is_valid_name_utf(utf *u)
1337 return is_valid_name(u->text, UTF_END(u));
1341 /* utf_show ********************************************************************
1343 Writes the utf symbols in the utfhash to stdout and displays the
1344 number of external hash chains grouped according to the chainlength
1345 (for debugging purposes).
1347 *******************************************************************************/
/* Debug-only code (compiled when NDEBUG is not defined): dumps the utf
   hashtable slot by slot to stdout and prints a histogram of the chain
   lengths together with the maximum and average chain length.

   NOTE(review): the signature line, the chain-walking loops, the reset
   of chain_count[] and the early return for an empty table are missing
   from this copy of the file. */
1349 #if !defined(NDEBUG)
1353 #define CHAIN_LIMIT 20 /* limit for separated enumeration */
1355 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1356 u4 max_chainlength = 0; /* maximum length of the chains */
1357 u4 sum_chainlength = 0; /* sum of the chainlengths */
1358 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1361 printf("UTF-HASH:\n");
1363 /* show element of utf-hashtable */
1365 for (i = 0; i < hashtable_utf.size; i++) {
1366 utf *u = hashtable_utf.ptr[i];
1369 printf("SLOT %d: ", (int) i);
1373 utf_display_printable_ascii(u);
1381 printf("UTF-HASH: %d slots for %d entries\n",
1382 (int) hashtable_utf.size, (int) hashtable_utf.entries );
/* the percentages below divide by hashtable_utf.entries, so an empty
   table has to be handled before reaching them */
1384 if (hashtable_utf.entries == 0)
1387 printf("chains:\n chainlength number of chains %% of utfstrings\n");
1389 for (i=0;i<CHAIN_LIMIT;i++)
1392 /* count numbers of hashchains according to their length */
1393 for (i=0; i<hashtable_utf.size; i++) {
1395 utf *u = (utf*) hashtable_utf.ptr[i];
1396 u4 chain_length = 0;
1398 /* determine chainlength */
1404 /* update sum of all chainlengths */
1405 sum_chainlength+=chain_length;
1407 /* determine the maximum length of the chains */
1408 if (chain_length>max_chainlength)
1409 max_chainlength = chain_length;
1411 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
/* NOTE(review): the test below uses CHAIN_LIMIT, not CHAIN_LIMIT-1 as
   the comment above and the ">=" output line further down say.  A chain
   of exactly CHAIN_LIMIT-1 elements is counted in
   chain_count[CHAIN_LIMIT-1] but its elements are not added to
   beyond_limit, so the percentage printed for the last bucket is too
   low whenever such chains exist. */
1412 if (chain_length>=CHAIN_LIMIT) {
1413 beyond_limit+=chain_length;
/* clamp so that all long chains share the last histogram bucket */
1414 chain_length=CHAIN_LIMIT-1;
1417 /* update number of hashchains of current length */
1418 chain_count[chain_length]++;
1421 /* display results */
/* buckets 1 .. CHAIN_LIMIT-2; bucket 0 (empty slots) is not printed and
   the last bucket gets its own line below */
1422 for (i=1;i<CHAIN_LIMIT-1;i++)
1423 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf.entries));
1425 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf.entries);
1428 printf("max. chainlength:%5d\n",max_chainlength);
1430 /* avg. chainlength = sum of chainlengths / number of chains */
/* number of chains = slots minus empty slots (chain_count[0]) */
1431 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf.size-chain_count[0]));
1433 #endif /* !defined(NDEBUG) */
1437 * These are local overrides for various environment variables in Emacs.
1438 * Please do not remove this and leave it at the end of the file, where
1439 * Emacs will automagically detect them.
1440 * ---------------------------------------------------------------------
1443 * indent-tabs-mode: t
1447 * vim:noexpandtab:sw=4:ts=4: