1 /* src/vm/utf.c - utf functions
3 Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4 C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5 E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6 J. Wenninger, Institut f. Computersprachen - TU Wien
8 This file is part of CACAO.
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License as
12 published by the Free Software Foundation; either version 2, or (at
13 your option) any later version.
15 This program is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25 Contact: cacao@cacaojvm.org
27 Authors: Reinhard Grafl
33 $Id: utf8.c 4689 2006-03-27 11:15:44Z twisti $
44 #include "mm/memory.h"
46 #if defined(USE_THREADS)
47 # if defined(NATIVE_THREADS)
48 # include "threads/native/threads.h"
50 # include "threads/green/threads.h"
54 #include "vm/builtin.h"
55 #include "vm/exceptions.h"
56 #include "vm/hashtable.h"
57 #include "vm/options.h"
58 #include "vm/statistics.h"
59 #include "vm/stringlocal.h"
62 /* global variables ***********************************************************/
64 /* hashsize must be power of 2: a slot is selected by masking the hash key
   with (size - 1) */
66 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash (number of slots) */
68 hashtable hashtable_utf; /* hashtable for utf8-symbols */
70 #if defined(USE_THREADS)
/* object whose monitor is entered and exited around utf hashtable accesses */
71 static java_objectheader *lock_hashtable_utf;
75 /* utf-symbols for pointer comparison of frequently used strings **************/
/* Every pointer below is assigned by the initialization code later in this
   file. */
/* class and interface names */
77 utf *utf_java_lang_Object;
79 utf *utf_java_lang_Class;
80 utf *utf_java_lang_ClassLoader;
81 utf *utf_java_lang_Cloneable;
82 utf *utf_java_lang_SecurityManager;
83 utf *utf_java_lang_String;
84 utf *utf_java_lang_System;
85 utf *utf_java_lang_ThreadGroup;
86 utf *utf_java_io_Serializable;
/* throwable, error and exception class names */
88 utf *utf_java_lang_Throwable;
89 utf *utf_java_lang_VMThrowable;
90 utf *utf_java_lang_Error;
91 utf *utf_java_lang_NoClassDefFoundError;
92 utf *utf_java_lang_LinkageError;
93 utf *utf_java_lang_NoSuchMethodError;
94 utf *utf_java_lang_OutOfMemoryError;
96 utf *utf_java_lang_Exception;
97 utf *utf_java_lang_ClassNotFoundException;
98 utf *utf_java_lang_IllegalArgumentException;
99 utf *utf_java_lang_IllegalMonitorStateException;
101 utf *utf_java_lang_NullPointerException;
/* java/lang wrapper class names */
103 utf* utf_java_lang_Void;
104 utf* utf_java_lang_Boolean;
105 utf* utf_java_lang_Byte;
106 utf* utf_java_lang_Character;
107 utf* utf_java_lang_Short;
108 utf* utf_java_lang_Integer;
109 utf* utf_java_lang_Long;
110 utf* utf_java_lang_Float;
111 utf* utf_java_lang_Double;
113 utf *utf_java_lang_StackTraceElement;
114 utf *utf_java_lang_reflect_Constructor;
115 utf *utf_java_lang_reflect_Field;
116 utf *utf_java_lang_reflect_Method;
117 utf *utf_java_util_Vector;
/* attribute names */
119 utf *utf_InnerClasses; /* InnerClasses */
120 utf *utf_ConstantValue; /* ConstantValue */
121 utf *utf_Code; /* Code */
122 utf *utf_Exceptions; /* Exceptions */
123 utf *utf_LineNumberTable; /* LineNumberTable */
124 utf *utf_SourceFile; /* SourceFile */
/* method and field names */
126 utf *utf_init; /* <init> */
127 utf *utf_clinit; /* <clinit> */
128 utf *utf_clone; /* clone */
129 utf *utf_finalize; /* finalize */
130 utf *utf_run; /* run */
132 utf *utf_add; /* add */
133 utf *utf_remove; /* remove */
134 utf *utf_put; /* put */
135 utf *utf_get; /* get */
136 utf *utf_value; /* value */
138 utf *utf_fillInStackTrace; /* fillInStackTrace */
139 utf *utf_getSystemClassLoader; /* getSystemClassLoader */
141 utf *utf_printStackTrace; /* printStackTrace */
/* descriptors */
152 utf *utf_void__void; /* ()V */
153 utf *utf_boolean__void; /* (Z)V */
154 utf *utf_byte__void; /* (B)V */
155 utf *utf_char__void; /* (C)V */
156 utf *utf_short__void; /* (S)V */
157 utf *utf_int__void; /* (I)V */
158 utf *utf_long__void; /* (J)V */
159 utf *utf_float__void; /* (F)V */
160 utf *utf_double__void; /* (D)V */
162 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
163 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
164 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
165 utf *utf_java_lang_Object__java_lang_Object; /* (Ljava/lang/Object;)Ljava/lang/Object; */
166 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
167 utf *utf_java_lang_String__java_lang_Class; /* (Ljava/lang/String;)Ljava/lang/Class; */
168 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
170 utf *utf_not_named_yet; /* special name for unnamed classes */
172 utf *array_packagename; /* "\t<the array package>" */
175 /* utf_init ********************************************************************
177 Initializes the utf8 subsystem: creates the utf hashtable, creates its
lock object when USE_THREADS is defined, and interns the frequently
used symbols into the global utf pointers.

NOTE(review): the function's signature line is not part of the text
shown here; the name in this header is taken from the original comment.
179 *******************************************************************************/
183 /* create utf8 hashtable */
185 hashtable_create(&hashtable_utf, HASHTABLE_UTF_SIZE);
187 #if defined(ENABLE_STATISTICS)
/* account for the slot array (one utf pointer per slot) */
189 count_utf_len += sizeof(utf*) * hashtable_utf.size;
192 #if defined(USE_THREADS)
193 /* create utf hashtable lock object */
195 lock_hashtable_utf = NEW(java_objectheader);
197 # if defined(NATIVE_THREADS)
198 initObjectLock(lock_hashtable_utf);
202 /* create utf-symbols for pointer comparison of frequently used strings */
204 utf_java_lang_Object = utf_new_char("java/lang/Object");
206 utf_java_lang_Class = utf_new_char("java/lang/Class");
207 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
208 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
209 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
210 utf_java_lang_String = utf_new_char("java/lang/String");
211 utf_java_lang_System = utf_new_char("java/lang/System");
212 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
213 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
/* the throwable names come from string_java_lang_* constants defined
   elsewhere instead of literals */
215 utf_java_lang_Throwable = utf_new_char(string_java_lang_Throwable);
216 utf_java_lang_VMThrowable = utf_new_char(string_java_lang_VMThrowable);
217 utf_java_lang_Error = utf_new_char(string_java_lang_Error);
219 utf_java_lang_NoClassDefFoundError =
220 utf_new_char(string_java_lang_NoClassDefFoundError);
222 utf_java_lang_LinkageError =
223 utf_new_char(string_java_lang_LinkageError);
225 utf_java_lang_NoSuchMethodError =
226 utf_new_char(string_java_lang_NoSuchMethodError);
228 utf_java_lang_OutOfMemoryError =
229 utf_new_char(string_java_lang_OutOfMemoryError);
231 utf_java_lang_Exception = utf_new_char(string_java_lang_Exception);
233 utf_java_lang_ClassNotFoundException =
234 utf_new_char(string_java_lang_ClassNotFoundException);
236 utf_java_lang_IllegalArgumentException =
237 utf_new_char(string_java_lang_IllegalArgumentException);
239 utf_java_lang_IllegalMonitorStateException =
240 utf_new_char(string_java_lang_IllegalMonitorStateException);
242 utf_java_lang_NullPointerException =
243 utf_new_char(string_java_lang_NullPointerException);
245 utf_java_lang_Void = utf_new_char("java/lang/Void");
246 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
247 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
248 utf_java_lang_Character = utf_new_char("java/lang/Character");
249 utf_java_lang_Short = utf_new_char("java/lang/Short");
250 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
251 utf_java_lang_Long = utf_new_char("java/lang/Long");
252 utf_java_lang_Float = utf_new_char("java/lang/Float");
253 utf_java_lang_Double = utf_new_char("java/lang/Double");
255 utf_java_lang_StackTraceElement =
256 utf_new_char("java/lang/StackTraceElement");
258 utf_java_lang_reflect_Constructor =
259 utf_new_char("java/lang/reflect/Constructor");
261 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
262 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
263 utf_java_util_Vector = utf_new_char("java/util/Vector");
/* attribute names */
265 utf_InnerClasses = utf_new_char("InnerClasses");
266 utf_ConstantValue = utf_new_char("ConstantValue");
267 utf_Code = utf_new_char("Code");
268 utf_Exceptions = utf_new_char("Exceptions");
269 utf_LineNumberTable = utf_new_char("LineNumberTable");
270 utf_SourceFile = utf_new_char("SourceFile");
/* method and field names */
272 utf_init = utf_new_char("<init>");
273 utf_clinit = utf_new_char("<clinit>");
274 utf_clone = utf_new_char("clone");
275 utf_finalize = utf_new_char("finalize");
276 utf_run = utf_new_char("run");
278 utf_add = utf_new_char("add");
279 utf_remove = utf_new_char("remove");
280 utf_put = utf_new_char("put");
281 utf_get = utf_new_char("get");
282 utf_value = utf_new_char("value");
284 utf_printStackTrace = utf_new_char("printStackTrace");
285 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
/* NOTE(review): the declarations of utf_loadClass and of utf_Z .. utf_D are
   not among the globals visible above -- confirm they exist in the full file */
286 utf_loadClass = utf_new_char("loadClass");
287 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
/* single-letter type symbols */
289 utf_Z = utf_new_char("Z");
290 utf_B = utf_new_char("B");
291 utf_C = utf_new_char("C");
292 utf_S = utf_new_char("S");
293 utf_I = utf_new_char("I");
294 utf_J = utf_new_char("J");
295 utf_F = utf_new_char("F");
296 utf_D = utf_new_char("D");
/* descriptors */
298 utf_void__void = utf_new_char("()V");
299 utf_boolean__void = utf_new_char("(Z)V");
300 utf_byte__void = utf_new_char("(B)V");
301 utf_char__void = utf_new_char("(C)V");
302 utf_short__void = utf_new_char("(S)V");
303 utf_int__void = utf_new_char("(I)V");
304 utf_long__void = utf_new_char("(J)V");
305 utf_float__void = utf_new_char("(F)V");
306 utf_double__void = utf_new_char("(D)V");
307 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
308 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
310 utf_void__java_lang_ClassLoader =
311 utf_new_char("()Ljava/lang/ClassLoader;");
313 utf_java_lang_Object__java_lang_Object =
314 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
316 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
318 utf_java_lang_String__java_lang_Class =
319 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
321 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
/* both special names start with a tab, a control character that
   is_valid_name() in this file rejects */
323 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
325 array_packagename = utf_new_char("\t<the array package>");
327 /* everything's ok */
333 /* utf_hashkey *****************************************************************
335 The hashkey is computed from the utf-text by using up to 8
336 characters. For utf-symbols longer than 15 characters 3 characters
337 are taken from the beginning and the end, 2 characters are taken
from the middle.

text.....pointer to the first byte of the text
length...length of the text; selects which case below is used

Returns the XOR of the sampled bytes, each shifted left by a
case-specific amount.
340 *******************************************************************************/
/* Both macros read through the enclosing function's variable `text`: nbs
   advances it by one byte before reading, fbs reads without moving it. */
342 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
343 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
345 u4 utf_hashkey(const char *text, u4 length)
347 const char *start_pos = text; /* pointer to utf text */
351 case 0: /* empty string */
/* lengths 1..8: every byte contributes exactly once */
354 case 1: return fbs(0);
355 case 2: return fbs(0) ^ nbs(3);
356 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
357 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
358 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
359 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
360 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
361 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
/* the following returns finish cases that accumulate a partial key in `a`
   first */
368 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
377 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
386 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
398 return a ^ nbs(9) ^ nbs(10);
410 return a ^ nbs(9) ^ nbs(10);
421 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
432 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
434 default: /* 3 characters from beginning */
440 /* 2 characters from middle */
441 text = start_pos + (length / 2);
446 /* 3 characters from end */
/* nbs() pre-increments, so the first byte read after this assignment is the
   one at index length - 3 */
447 text = start_pos + length - 4;
452 return a ^ nbs(10) ^ nbs(11);
456 /* utf_full_hashkey ************************************************************
458 This function computes a hash value using all bytes in the string.
460 The algorithm is the "One-at-a-time" algorithm as published
461 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.

text.....pointer to the first byte of the text
length...length of the text
463 *******************************************************************************/
465 u4 utf_full_hashkey(const char *text, u4 length)
/* the text is read through an unsigned char pointer, so every byte is
   taken as a value in 0..255 */
467 register const unsigned char *p = (const unsigned char *) text;
475 hash += (hash << 10);
479 hash ^= (hash >> 11);
480 hash += (hash << 15);
485 /* unicode_hashkey *************************************************************
487 Compute the hashkey of a unicode string.
489 *******************************************************************************/
491 u4 unicode_hashkey(u2 *text, u2 len)
493 return utf_hashkey((char *) text, len);
497 /* utf_new *********************************************************************
499 Creates a new utf-symbol, the text of the symbol is passed as a
500 u1-array. The function searches the utf-hashtable for a utf-symbol
501 with this text. On success the existing element is returned, otherwise
502 a new hashtable element is created.
504 If the number of entries in the hashtable exceeds twice the size of
505 the hashtable slots a reorganization of the hashtable is done and
506 the utf symbols are copied to a new hashtable with doubled size.

text.....bytes of the symbol; need not be NUL-terminated, since exactly
         length bytes are compared and copied
length...number of bytes in text

With USE_THREADS the monitor of lock_hashtable_utf is entered at the
start and exited on the visible exit paths.
508 *******************************************************************************/
510 utf *utf_new(const char *text, u2 length)
512 u4 key; /* hashkey computed from utf-text */
513 u4 slot; /* slot in hashtable */
514 utf *u; /* hashtable element */
517 #if defined(USE_THREADS)
518 builtin_monitorenter(lock_hashtable_utf);
521 #if defined(ENABLE_STATISTICS)
526 key = utf_hashkey(text, length);
/* the table size is a power of two, so masking with size - 1 picks a slot */
527 slot = key & (hashtable_utf.size - 1);
528 u = hashtable_utf.ptr[slot];
530 /* search external hash chain for utf-symbol */
/* lengths are compared first; the bytes are compared only when they match */
533 if (u->blength == length) {
534 /* compare text of hashtable elements */
536 for (i = 0; i < length; i++)
537 if (text[i] != u->text[i])
540 #if defined(ENABLE_STATISTICS)
542 count_utf_new_found++;
545 /* symbol found in hashtable */
547 #if defined(USE_THREADS)
548 builtin_monitorexit(lock_hashtable_utf);
555 u = u->hashlink; /* next element in external chain */
558 #if defined(ENABLE_STATISTICS)
/* the utf struct plus the text and its terminating NUL */
560 count_utf_len += sizeof(utf) + length + 1;
563 /* location in hashtable found, create new utf element */
565 u->blength = length; /* length in bytes of utfstring */
/* the new element becomes the head of the slot's chain */
566 u->hashlink = hashtable_utf.ptr[slot]; /* link in external hashchain */
567 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
569 memcpy(u->text, text, length); /* copy utf-text */
/* the stored text is NUL-terminated; blength does not count the NUL */
570 u->text[length] = '\0';
572 hashtable_utf.ptr[slot] = u; /* insert symbol into table */
573 hashtable_utf.entries++; /* update number of entries */
575 if (hashtable_utf.entries > (hashtable_utf.size * 2)) {
577 /* reorganization of hashtable, average length of the external
578 chains is approx. 2 */
580 hashtable newhash; /* the new hashtable */
586 /* create new hashtable, double the size */
588 hashtable_create(&newhash, hashtable_utf.size * 2);
589 newhash.entries = hashtable_utf.entries;
591 #if defined(ENABLE_STATISTICS)
/* hashtable_utf.size is still the old size here, which equals the number
   of slots by which the new table is larger */
593 count_utf_len += sizeof(utf*) * hashtable_utf.size;
596 /* transfer elements to new hashtable */
598 for (i = 0; i < hashtable_utf.size; i++) {
599 u = hashtable_utf.ptr[i];
/* recompute the slot from the stored text against the new table's mask */
603 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
605 u->hashlink = (utf *) newhash.ptr[slot];
606 newhash.ptr[slot] = u;
608 /* follow link in external hash chain */
614 /* dispose old table */
/* only the old slot array is freed; the utf elements now hang off newhash */
616 MFREE(hashtable_utf.ptr, void*, hashtable_utf.size);
617 hashtable_utf = newhash;
620 #if defined(USE_THREADS)
621 builtin_monitorexit(lock_hashtable_utf);
628 /* utf_new_u2 ******************************************************************
630 Make utf symbol from u2 array, if isclassname is true '.' is
converted (see the "convert classname" branch below; presumably to '/',
the replacement statement is not visible here -- TODO confirm).

unicode_pos......pointer to the first u2 character
unicode_length...number of u2 characters
isclassname......enables the '.' conversion

The characters are encoded into a temporary buffer whose size comes
from u2_utflength(); the buffer is interned with utf_new() and then
freed.
633 *******************************************************************************/
635 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
637 char *buffer; /* memory buffer for unicode characters */
638 char *pos; /* pointer to current position in buffer */
639 u4 left; /* unicode characters left */
640 u4 buflength; /* utf length in bytes of the u2 array */
641 utf *result; /* resulting utf-string */
644 /* determine utf length in bytes and allocate memory */
646 buflength = u2_utflength(unicode_pos, unicode_length);
647 buffer = MNEW(char, buflength);
652 for (i = 0; i++ < unicode_length; unicode_pos++) {
653 /* next unicode character */
/* one-byte form: non-zero characters below 0x80 (zero is excluded) */
656 if ((c != 0) && (c < 0x80)) {
/* left is unsigned; the cast to int makes a wrapped-around value test as
   negative (the statement that decrements left is not visible here) */
659 if ((int) left < 0) break;
660 /* convert classname */
661 if (isclassname && c == '.')
/* two-byte form: lead byte carries the 0xC0 marker */
666 } else if (c < 0x800) {
668 unsigned char high = c >> 6;
669 unsigned char low = c & 0x3F;
671 if ((int) left < 0) break;
672 *pos++ = high | 0xC0;
/* three-byte form: lead byte carries the 0xE0 marker */
678 char mid = (c >> 6) & 0x3F;
681 if ((int) left < 0) break;
682 *pos++ = high | 0xE0;
688 /* insert utf-string into symbol-table */
/* NOTE(review): buflength is u4 but utf_new takes a u2 length, so the value
   is narrowed at this call */
689 result = utf_new(buffer,buflength);
691 MFREE(buffer, char, buflength);
697 /* utf_new_char ****************************************************************
699 Creates a new utf symbol, the text for this symbol is passed as a
700 c-string ( = char* ).
702 *******************************************************************************/
704 utf *utf_new_char(const char *text)
706 return utf_new(text, strlen(text));
710 /* utf_new_char_classname ******************************************************
712 Creates a new utf symbol, the text for this symbol is passed as a
713 c-string ( = char* ) "." characters are going to be replaced by
714 "/". Since the above function is used often, this is a separate
715 function, instead of an if.

text...NUL-terminated C string; it is not modified, the replacement is
       done on a copy
717 *******************************************************************************/
719 utf *utf_new_char_classname(const char *text)
/* a copy is made only when the text actually contains a '.' */
721 if (strchr(text, '.')) {
/* NOTE(review): the strdup result is used on the next line without a NULL
   check */
722 char *txt = strdup(text);
723 char *end = txt + strlen(txt);
727 for (c = txt; c < end; c++)
728 if (*c == '.') *c = '/';
730 tmpRes = utf_new(txt, strlen(txt));
/* no '.' present: intern the caller's text directly */
736 return utf_new(text, strlen(text));
740 /* utf_nextu2 ******************************************************************
742 Read the next unicode character from the utf string and increment
743 the utf-string pointer accordingly.

utf_ptr...in/out: address of the current position; on the path through
          the end of the function it is moved forward by len bytes

NOTE(review): no end pointer is passed in; utf[1] and utf[2] are read
whenever the lead byte selects a multi-byte form. Callers in this file
compare the advanced pointer against their own end pointer.
745 *******************************************************************************/
747 u2 utf_nextu2(char **utf_ptr)
749 /* uncompressed unicode character */
751 /* current position in utf text */
752 unsigned char *utf = (unsigned char *) (*utf_ptr);
753 /* bytes representing the unicode character */
754 unsigned char ch1, ch2, ch3;
755 /* number of bytes used to represent the unicode character */
/* dispatch on the upper four bits of the lead byte */
758 switch ((ch1 = utf[0]) >> 4) {
759 default: /* 1 byte */
763 case 0xD: /* 2 bytes */
/* the second byte must have the form 10xxxxxx */
764 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
/* 5 payload bits from the lead byte, 6 from the second byte */
765 unsigned char high = ch1 & 0x1F;
766 unsigned char low = ch2 & 0x3F;
767 unicode_char = (high << 6) + low;
772 case 0xE: /* 2 or 3 bytes */
773 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
774 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
/* 4 payload bits from the lead byte, 6 from each following byte */
775 unsigned char low = ch3 & 0x3f;
776 unsigned char mid = ch2 & 0x3f;
777 unsigned char high = ch1 & 0x0f;
778 unicode_char = (((high << 6) + mid) << 6) + low;
786 /* update position in utf-text */
787 *utf_ptr = (char *) (utf + len);
793 /* utf_strlen ******************************************************************
795 Determine number of unicode characters in the utf string.

u...the utf symbol to measure

The text is decoded with utf_nextu2() until the end position is
reached; a decode that runs past the end position is reported as
"Illegal utf8 string".
797 *******************************************************************************/
799 u4 utf_strlen(utf *u)
801 char *endpos; /* points behind utf string */
802 char *utf_ptr; /* current position in utf text */
803 u4 len = 0; /* number of unicode characters */
/* NOTE(review): presumably raised when u is NULL -- the guarding condition
   is not visible here */
806 exceptions_throw_nullpointerexception();
813 while (utf_ptr < endpos) {
815 /* next unicode character */
816 utf_nextu2(&utf_ptr);
/* the loop can only stop with utf_ptr != endpos if the last character
   decoded past endpos */
819 if (utf_ptr != endpos)
820 /* string ended abruptly */
821 throw_cacao_exception_exit(string_java_lang_InternalError,
822 "Illegal utf8 string");
828 /* u2_utflength ****************************************************************
830 Returns the utf length in bytes of a u2 array.

text........pointer to the first u2 character
u2_length...number of u2 characters to measure
832 *******************************************************************************/
834 u4 u2_utflength(u2 *text, u4 u2_length)
836 u4 result_len = 0; /* utf length in bytes */
837 u2 ch; /* current unicode character */
840 for (len = 0; len < u2_length; len++) {
841 /* next unicode character */
844 /* determine bytes required to store unicode character as utf */
/* non-zero characters below 0x80 are tested first; the same condition
   selects the one-byte form in utf_new_u2 */
845 if (ch && (ch < 0x80))
857 /* utf_display *****************************************************************
859 Write utf symbol to stdout (for debugging purposes).

u...the utf symbol to print
861 *******************************************************************************/
863 void utf_display(utf *u)
865 char *endpos; /* points behind utf string */
866 char *utf_ptr; /* current position in utf text */
877 while (utf_ptr < endpos) {
878 /* read next unicode character */
880 u2 c = utf_nextu2(&utf_ptr);
/* characters in 32..127 are singled out by this test; note that the range
   includes 127 */
882 if ((c >= 32) && (c <= 127))
892 /* utf_display_classname *******************************************************
894 Write utf symbol to stdout with `/' converted to `.' (for debugging
purposes).

u...the utf symbol to print
897 *******************************************************************************/
899 void utf_display_classname(utf *u)
901 char *endpos; /* points behind utf string */
902 char *utf_ptr; /* current position in utf text */
913 while (utf_ptr < endpos) {
914 /* read next unicode character */
916 u2 c = utf_nextu2(&utf_ptr);
/* characters in 32..127 are singled out by this test; note that the range
   includes 127 */
921 if ((c >= 32) && (c <= 127))
931 /* utf_sprint ******************************************************************
933 Write utf symbol into c-string (for debugging purposes).

buffer...destination; no size is passed in, so the caller must provide
         enough room for every decoded character
u........the utf symbol to write; the visible code writes "NULL" in one
         branch (presumably when u is NULL -- the condition is not
         visible here)
935 *******************************************************************************/
937 void utf_sprint(char *buffer, utf *u)
939 char *endpos; /* points behind utf string */
940 char *utf_ptr; /* current position in utf text */
941 u2 pos = 0; /* position in c-string */
944 strcpy(buffer, "NULL");
951 while (utf_ptr < endpos)
952 /* copy next unicode character */
/* the u2 returned by utf_nextu2 is narrowed to char on assignment */
953 buffer[pos++] = utf_nextu2(&utf_ptr);
955 /* terminate string */
960 /* utf_sprint_classname ********************************************************
962 Write utf symbol into c-string with `/' converted to `.' (for debugging
purposes).

buffer...destination; no size is passed in, so the caller must provide
         enough room for every decoded character
u........the utf symbol to write; the visible code writes "NULL" in one
         branch (presumably when u is NULL -- the condition is not
         visible here)
965 *******************************************************************************/
967 void utf_sprint_classname(char *buffer, utf *u)
969 char *endpos; /* points behind utf string */
970 char *utf_ptr; /* current position in utf text */
971 u2 pos = 0; /* position in c-string */
974 strcpy(buffer, "NULL");
981 while (utf_ptr < endpos) {
982 /* copy next unicode character */
983 u2 c = utf_nextu2(&utf_ptr);
/* the conversion is applied to the decoded character */
984 if (c == '/') c = '.';
988 /* terminate string */
993 /* utf_strcat ******************************************************************
995 Like libc strcat, but uses an utf8 string.
997 *******************************************************************************/
999 void utf_strcat(char *buffer, utf *u)
1001 utf_sprint(buffer + strlen(buffer), u);
1005 /* utf_strcat_classname ********************************************************
1007 Like libc strcat, but uses an utf8 string.
1009 *******************************************************************************/
1011 void utf_strcat_classname(char *buffer, utf *u)
1013 utf_sprint_classname(buffer + strlen(buffer), u);
1017 /* utf_fprint ******************************************************************
1019 Write utf symbol into file.

file...output stream
u......the utf symbol to write

Characters in 32..127 are written as they are, every other character
is written as '?'.
1021 *******************************************************************************/
1023 void utf_fprint(FILE *file, utf *u)
1025 char *endpos; /* points behind utf string */
1026 char *utf_ptr; /* current position in utf text */
1031 endpos = UTF_END(u);
1034 while (utf_ptr < endpos) {
1035 /* read next unicode character */
1036 u2 c = utf_nextu2(&utf_ptr);
/* note that the accepted range includes 127 */
1038 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1039 else fprintf(file, "?");
1044 /* utf_fprint_classname ********************************************************
1046 Write utf symbol into file with `/' converted to `.'.

file...output stream
u......the utf symbol to write

After the conversion, characters in 32..127 are written as they are,
every other character is written as '?'.
1048 *******************************************************************************/
1050 void utf_fprint_classname(FILE *file, utf *u)
1052 char *endpos; /* points behind utf string */
1053 char *utf_ptr; /* current position in utf text */
1058 endpos = UTF_END(u);
1061 while (utf_ptr < endpos) {
1062 /* read next unicode character */
1063 u2 c = utf_nextu2(&utf_ptr);
1064 if (c == '/') c = '.';
/* note that the accepted range includes 127 */
1066 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1067 else fprintf(file, "?");
1072 /* is_valid_utf ****************************************************************
1074 Return true if the given string is a valid UTF-8 string.
1076 utf_ptr...points to first character
1077 end_pos...points after last character

Visible rejections: end_pos before utf_ptr, a 0x00 byte, an invalid
lead byte, a sequence with more than two continuation bytes, missing
continuation bytes, and a continuation byte not of the form 10xxxxxx.
Overlong encodings, surrogates and 0xfffe/0xffff are deliberately not
rejected (see the commented-out checks at the end).
1079 *******************************************************************************/
1081 /* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */
1083 bool is_valid_utf(char *utf_ptr, char *end_pos)
1090 if (end_pos < utf_ptr) return false;
1091 bytes = end_pos - utf_ptr;
1095 if (!c) return false; /* 0x00 is not allowed */
1096 if ((c & 0x80) == 0) continue; /* ASCII */
/* len is the number of continuation bytes that follow the lead byte */
1098 if ((c & 0xe0) == 0xc0) len = 1; /* 110x xxxx */
1099 else if ((c & 0xf0) == 0xe0) len = 2; /* 1110 xxxx */
1100 else if ((c & 0xf8) == 0xf0) len = 3; /* 1111 0xxx */
1101 else if ((c & 0xfc) == 0xf8) len = 4; /* 1111 10xx */
1102 else if ((c & 0xfe) == 0xfc) len = 5; /* 1111 110x */
1103 else return false; /* invalid leading byte */
1105 if (len > 2) return false; /* Java limitation */
/* keep the payload bits of the lead byte: mask 0x1f for len 1, 0x0f for
   len 2 */
1107 v = (unsigned long)c & (0x3f >> len);
1109 if ((bytes -= len) < 0) return false; /* missing bytes */
1111 for (i = len; i--; ) {
1113 if ((c & 0xc0) != 0x80) /* 10xx xxxx */
/* append the six payload bits of the continuation byte */
1115 v = (v << 6) | (c & 0x3f);
/* NOTE(review): the condition guarding this test is not visible here;
   presumably it applies only when the decoded value is zero -- TODO confirm */
1119 if (len != 1) return false; /* Java special */
1122 /* Sun Java seems to allow overlong UTF-8 encodings */
1124 /* if (v < min_codepoint[len]) */
1125 /* XXX throw exception? */
1128 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
1129 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
1131 /* even these seem to be allowed */
1132 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the two-byte encoded zero character 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

   The byte following a 0xc0 is inspected only when it lies inside
   [utf_ptr, end_pos), so nothing at or beyond end_pos is ever read.

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters */

		/* 0xc0 0x80 decodes to the zero character; peek at the next byte
		   only if it is still part of the string */
		if (c == 0xc0 && utf_ptr < end_pos
			&& (unsigned char) *utf_ptr == 0x80)
			return false;               /* disallow zero */
	}

	return true;
}
1167 bool is_valid_name_utf(utf *u)
1169 return is_valid_name(u->text, UTF_END(u));
1173 /* utf_show ********************************************************************
1175 Writes the utf symbols in the utfhash to stdout and displays the
1176 number of external hash chains grouped according to the chainlength
1177 (for debugging purposes).

Only compiled when NDEBUG is not defined.
1179 *******************************************************************************/
1181 #if !defined(NDEBUG)
1185 #define CHAIN_LIMIT 20 /* limit for separated enumeration */
1187 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1188 u4 max_chainlength = 0; /* maximum length of the chains */
1189 u4 sum_chainlength = 0; /* sum of the chainlengths */
1190 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1193 printf("UTF-HASH:\n");
1195 /* show element of utf-hashtable */
1197 for (i = 0; i < hashtable_utf.size; i++) {
1198 utf *u = hashtable_utf.ptr[i];
1201 printf("SLOT %d: ", (int) i);
1213 printf("UTF-HASH: %d slots for %d entries\n",
1214 (int) hashtable_utf.size, (int) hashtable_utf.entries );
/* the percentages below divide by the number of entries */
1216 if (hashtable_utf.entries == 0)
1219 printf("chains:\n chainlength number of chains %% of utfstrings\n");
1221 for (i=0;i<CHAIN_LIMIT;i++)
1224 /* count numbers of hashchains according to their length */
1225 for (i=0; i<hashtable_utf.size; i++) {
1227 utf *u = (utf*) hashtable_utf.ptr[i];
1228 u4 chain_length = 0;
1230 /* determine chainlength */
1236 /* update sum of all chainlengths */
1237 sum_chainlength+=chain_length;
1239 /* determine the maximum length of the chains */
1240 if (chain_length>max_chainlength)
1241 max_chainlength = chain_length;
1243 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
/* NOTE(review): the test is >= CHAIN_LIMIT, not >= CHAIN_LIMIT-1 as the
   comment above says. Chains of length exactly CHAIN_LIMIT-1 are counted in
   chain_count[CHAIN_LIMIT-1] but their symbols are not added to
   beyond_limit. */
1244 if (chain_length>=CHAIN_LIMIT) {
1245 beyond_limit+=chain_length;
1246 chain_length=CHAIN_LIMIT-1;
1249 /* update number of hashchains of current length */
1250 chain_count[chain_length]++;
1253 /* display results */
/* one row per chain length from 1 to CHAIN_LIMIT-2; the share of utf
   strings is (number of chains * length) / entries */
1254 for (i=1;i<CHAIN_LIMIT-1;i++)
1255 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf.entries));
/* last row: everything collected in the final bucket */
1257 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf.entries);
1260 printf("max. chainlength:%5d\n",max_chainlength);
1262 /* avg. chainlength = sum of chainlengths / number of chains */
/* size - chain_count[0] is the number of non-empty slots */
1263 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf.size-chain_count[0]));
1265 #endif /* !defined(NDEBUG) */
1269 * These are local overrides for various environment variables in Emacs.
1270 * Please do not remove this and leave it at the end of the file, where
1271 * Emacs will automagically detect them.
1272 * ---------------------------------------------------------------------
1275 * indent-tabs-mode: t