1 /* src/vm/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4 C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5 E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6 J. Wenninger, Institut f. Computersprachen - TU Wien
8 This file is part of CACAO.
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License as
12 published by the Free Software Foundation; either version 2, or (at
13 your option) any later version.
15 This program is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25 Contact: cacao@cacaojvm.org
27 Authors: Reinhard Grafl
33 $Id: utf8.c 6216 2006-12-18 18:21:37Z twisti $
45 #include "mm/memory.h"
47 #if defined(ENABLE_THREADS)
48 # include "threads/native/lock.h"
50 # include "threads/none/lock.h"
53 #include "vm/builtin.h"
54 #include "vm/exceptions.h"
55 #include "vm/hashtable.h"
56 #include "vm/options.h"
57 #include "vm/statistics.h"
58 #include "vm/stringlocal.h"
62 /* global variables ***********************************************************/
64 /* hashsize must be power of 2 */
66 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
/* utf_new derives the slot as key & (size - 1), which is why the size has to
   be a power of two. */
68 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
71 /* utf-symbols for pointer comparison of frequently used strings **************/
/* class and interface names */
73 utf *utf_java_lang_Object;
75 utf *utf_java_lang_Class;
76 utf *utf_java_lang_ClassLoader;
77 utf *utf_java_lang_Cloneable;
78 utf *utf_java_lang_SecurityManager;
79 utf *utf_java_lang_String;
80 utf *utf_java_lang_System;
81 utf *utf_java_lang_ThreadGroup;
82 utf *utf_java_io_Serializable;
/* Throwable and Error class names */
84 utf *utf_java_lang_Throwable;
85 utf *utf_java_lang_VMThrowable;
86 utf *utf_java_lang_Error;
87 utf *utf_java_lang_AbstractMethodError;
88 utf *utf_java_lang_LinkageError;
89 utf *utf_java_lang_NoClassDefFoundError;
90 utf *utf_java_lang_NoSuchMethodError;
91 utf *utf_java_lang_OutOfMemoryError;
/* Exception class names */
93 utf *utf_java_lang_Exception;
94 utf *utf_java_lang_ClassCastException;
95 utf *utf_java_lang_ClassNotFoundException;
96 utf *utf_java_lang_IllegalArgumentException;
97 utf *utf_java_lang_IllegalMonitorStateException;
99 utf *utf_java_lang_NullPointerException;
/* java/lang/Void, Boolean, Byte, Character, Short, Integer, Long, Float,
   Double */
101 utf* utf_java_lang_Void;
102 utf* utf_java_lang_Boolean;
103 utf* utf_java_lang_Byte;
104 utf* utf_java_lang_Character;
105 utf* utf_java_lang_Short;
106 utf* utf_java_lang_Integer;
107 utf* utf_java_lang_Long;
108 utf* utf_java_lang_Float;
109 utf* utf_java_lang_Double;
/* further class names (StackTraceElement, reflection classes, Vector) */
111 utf *utf_java_lang_StackTraceElement;
112 utf *utf_java_lang_reflect_Constructor;
113 utf *utf_java_lang_reflect_Field;
114 utf *utf_java_lang_reflect_Method;
115 utf *utf_java_util_Vector;
/* the text of each symbol is given in its trailing comment (presumably the
   names of class-file attributes -- confirm) */
117 utf *utf_InnerClasses; /* InnerClasses */
118 utf *utf_ConstantValue; /* ConstantValue */
119 utf *utf_Code; /* Code */
120 utf *utf_Exceptions; /* Exceptions */
121 utf *utf_LineNumberTable; /* LineNumberTable */
122 utf *utf_SourceFile; /* SourceFile */
124 #if defined(ENABLE_JAVASE)
125 utf *utf_EnclosingMethod;
127 utf *utf_RuntimeVisibleAnnotations;
128 utf *utf_StackMapTable;
/* method names */
131 utf *utf_init; /* <init> */
132 utf *utf_clinit; /* <clinit> */
133 utf *utf_clone; /* clone */
134 utf *utf_finalize; /* finalize */
135 utf *utf_run; /* run */
139 utf *utf_removeThread;
144 utf *utf_fillInStackTrace;
145 utf *utf_getSystemClassLoader;
147 utf *utf_printStackTrace;
/* descriptor strings; the variable names mostly follow the pattern
   utf_<parameters>__<result> */
158 utf *utf_void__void; /* ()V */
159 utf *utf_boolean__void; /* (Z)V */
160 utf *utf_byte__void; /* (B)V */
161 utf *utf_char__void; /* (C)V */
162 utf *utf_short__void; /* (S)V */
163 utf *utf_int__void; /* (I)V */
164 utf *utf_long__void; /* (J)V */
165 utf *utf_float__void; /* (F)V */
166 utf *utf_double__void; /* (D)V */
168 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
169 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
170 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
171 utf *utf_java_lang_Object__java_lang_Object;
172 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
173 utf *utf_java_lang_String__java_lang_Class;
174 utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */
175 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
177 utf *utf_not_named_yet; /* special name for unnamed classes */
179 utf *array_packagename;
182 /* utf_init ********************************************************************
184 Initializes the utf8 subsystem.
186 *******************************************************************************/
190 /* create utf8 hashtable */
192 hashtable_utf = NEW(hashtable);
194 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
196 #if defined(ENABLE_STATISTICS)
198 count_utf_len += sizeof(utf*) * hashtable_utf->size;
201 /* create utf-symbols for pointer comparison of frequently used strings */
203 utf_java_lang_Object = utf_new_char("java/lang/Object");
205 utf_java_lang_Class = utf_new_char("java/lang/Class");
206 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
207 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
208 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
209 utf_java_lang_String = utf_new_char("java/lang/String");
210 utf_java_lang_System = utf_new_char("java/lang/System");
211 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
212 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
214 utf_java_lang_Throwable = utf_new_char(string_java_lang_Throwable);
215 utf_java_lang_VMThrowable = utf_new_char(string_java_lang_VMThrowable);
216 utf_java_lang_Error = utf_new_char(string_java_lang_Error);
218 utf_java_lang_AbstractMethodError =
219 utf_new_char(string_java_lang_AbstractMethodError);
221 utf_java_lang_LinkageError =
222 utf_new_char(string_java_lang_LinkageError);
224 utf_java_lang_NoClassDefFoundError =
225 utf_new_char(string_java_lang_NoClassDefFoundError);
227 utf_java_lang_NoSuchMethodError =
228 utf_new_char(string_java_lang_NoSuchMethodError);
230 utf_java_lang_OutOfMemoryError =
231 utf_new_char(string_java_lang_OutOfMemoryError);
233 utf_java_lang_Exception = utf_new_char(string_java_lang_Exception);
235 utf_java_lang_ClassCastException =
236 utf_new_char(string_java_lang_ClassCastException);
238 utf_java_lang_ClassNotFoundException =
239 utf_new_char(string_java_lang_ClassNotFoundException);
241 utf_java_lang_IllegalArgumentException =
242 utf_new_char(string_java_lang_IllegalArgumentException);
244 utf_java_lang_IllegalMonitorStateException =
245 utf_new_char(string_java_lang_IllegalMonitorStateException);
247 utf_java_lang_NullPointerException =
248 utf_new_char(string_java_lang_NullPointerException);
250 utf_java_lang_Void = utf_new_char("java/lang/Void");
251 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
252 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
253 utf_java_lang_Character = utf_new_char("java/lang/Character");
254 utf_java_lang_Short = utf_new_char("java/lang/Short");
255 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
256 utf_java_lang_Long = utf_new_char("java/lang/Long");
257 utf_java_lang_Float = utf_new_char("java/lang/Float");
258 utf_java_lang_Double = utf_new_char("java/lang/Double");
260 utf_java_lang_StackTraceElement =
261 utf_new_char("java/lang/StackTraceElement");
263 utf_java_lang_reflect_Constructor =
264 utf_new_char("java/lang/reflect/Constructor");
266 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
267 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
268 utf_java_util_Vector = utf_new_char("java/util/Vector");
270 utf_InnerClasses = utf_new_char("InnerClasses");
271 utf_ConstantValue = utf_new_char("ConstantValue");
272 utf_Code = utf_new_char("Code");
273 utf_Exceptions = utf_new_char("Exceptions");
274 utf_LineNumberTable = utf_new_char("LineNumberTable");
275 utf_SourceFile = utf_new_char("SourceFile");
277 #if defined(ENABLE_JAVASE)
278 utf_EnclosingMethod = utf_new_char("EnclosingMethod");
279 utf_Signature = utf_new_char("Signature");
280 utf_RuntimeVisibleAnnotations = utf_new_char("RuntimeVisibleAnnotations");
281 utf_StackMapTable = utf_new_char("StackMapTable");
284 utf_init = utf_new_char("<init>");
285 utf_clinit = utf_new_char("<clinit>");
286 utf_clone = utf_new_char("clone");
287 utf_finalize = utf_new_char("finalize");
288 utf_run = utf_new_char("run");
290 utf_add = utf_new_char("add");
291 utf_remove = utf_new_char("remove");
292 utf_removeThread = utf_new_char("removeThread");
293 utf_put = utf_new_char("put");
294 utf_get = utf_new_char("get");
295 utf_value = utf_new_char("value");
297 utf_printStackTrace = utf_new_char("printStackTrace");
298 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
299 utf_loadClass = utf_new_char("loadClass");
300 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
302 utf_Z = utf_new_char("Z");
303 utf_B = utf_new_char("B");
304 utf_C = utf_new_char("C");
305 utf_S = utf_new_char("S");
306 utf_I = utf_new_char("I");
307 utf_J = utf_new_char("J");
308 utf_F = utf_new_char("F");
309 utf_D = utf_new_char("D");
311 utf_void__void = utf_new_char("()V");
312 utf_boolean__void = utf_new_char("(Z)V");
313 utf_byte__void = utf_new_char("(B)V");
314 utf_char__void = utf_new_char("(C)V");
315 utf_short__void = utf_new_char("(S)V");
316 utf_int__void = utf_new_char("(I)V");
317 utf_long__void = utf_new_char("(J)V");
318 utf_float__void = utf_new_char("(F)V");
319 utf_double__void = utf_new_char("(D)V");
320 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
321 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
323 utf_void__java_lang_ClassLoader =
324 utf_new_char("()Ljava/lang/ClassLoader;");
326 utf_java_lang_Object__java_lang_Object =
327 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
329 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
331 utf_java_lang_String__java_lang_Class =
332 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
334 utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V");
335 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
337 utf_null = utf_new_char("null");
338 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
339 array_packagename = utf_new_char("\t<the array package>");
341 /* everything's ok */
347 /* utf_hashkey *****************************************************************
349 The hashkey is computed from the utf-text by using up to 8
350 characters. For utf-symbols longer than 15 characters 3 characters
351 are taken from the beginning and the end, 2 characters are taken
354 *******************************************************************************/
/* NOTE(review): nbs() advances `text` as a side effect.  Most return
   expressions below use nbs() several times inside one expression; C does not
   sequence the operands of `^`, so the order of those repeated `++text`
   modifications is not defined by the language.  The hash values therefore
   depend on how the compiler happens to evaluate them -- confirm before
   relying on stable keys across compilers. */
356 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
357 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
359 u4 utf_hashkey(const char *text, u4 length)
/* start_pos keeps the beginning of the text; the default case uses it to
   reposition `text` to the middle and to the end of the string */
361 const char *start_pos = text; /* pointer to utf text */
365 case 0: /* empty string */
/* lengths 1 to 8: every byte takes part, each shifted by its own amount */
368 case 1: return fbs(0);
369 case 2: return fbs(0) ^ nbs(3);
370 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
371 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
372 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
373 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
374 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
375 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
/* the following returns combine an accumulator `a` with further bytes */
382 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
391 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
400 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
412 return a ^ nbs(9) ^ nbs(10);
424 return a ^ nbs(9) ^ nbs(10);
435 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
446 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
448 default: /* 3 characters from beginning */
454 /* 2 characters from middle */
455 text = start_pos + (length / 2);
460 /* 3 characters from end */
/* nbs() pre-increments, so starting at length - 4 lets it read the bytes at
   length - 3 and after */
461 text = start_pos + length - 4;
466 return a ^ nbs(10) ^ nbs(11);
470 /* utf_full_hashkey ************************************************************
472 This function computes a hash value using all bytes in the string.
474 The algorithm is the "One-at-a-time" algorithm as published
475 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
477 *******************************************************************************/
479 u4 utf_full_hashkey(const char *text, u4 length)
/* the bytes are accessed as unsigned char, so the value read does not depend
   on whether plain char is signed */
481 register const unsigned char *p = (const unsigned char *) text;
489 hash += (hash << 10);
/* further shift-and-combine steps applied to the hash value */
493 hash ^= (hash >> 11);
494 hash += (hash << 15);
499 /* unicode_hashkey *************************************************************
501 Compute the hashkey of a unicode string.
503 *******************************************************************************/
505 u4 unicode_hashkey(u2 *text, u2 len)
507 return utf_hashkey((char *) text, len);
511 /* utf_new *********************************************************************
513 Creates a new utf-symbol, the text of the symbol is passed as a
514 u1-array. The function searches the utf-hashtable for a utf-symbol
515 with this text. On success the element is returned, otherwise a new
516 hashtable element is created.
518 If the number of entries in the hashtable exceeds twice the size of
519 the hashtable slots a reorganization of the hashtable is done and
520 the utf symbols are copied to a new hashtable with doubled size.
522 *******************************************************************************/
524 utf *utf_new(const char *text, u2 length)
/* NOTE(review): length is a u2, so a larger value passed by a caller (for
   example a strlen() result) is converted to 16 bits before it arrives here. */
526 u4 key; /* hashkey computed from utf-text */
527 u4 slot; /* slot in hashtable */
528 utf *u; /* hashtable element */
/* lookup, insertion and resizing all happen between this ENTER and the EXIT
   calls further down */
531 LOCK_MONITOR_ENTER(hashtable_utf->header);
533 #if defined(ENABLE_STATISTICS)
538 key = utf_hashkey(text, length);
/* masking with size - 1 picks the slot */
539 slot = key & (hashtable_utf->size - 1);
540 u = hashtable_utf->ptr[slot];
542 /* search external hash chain for utf-symbol */
/* candidates are filtered by byte length first, then compared byte by byte */
545 if (u->blength == length) {
546 /* compare text of hashtable elements */
548 for (i = 0; i < length; i++)
549 if (text[i] != u->text[i])
552 #if defined(ENABLE_STATISTICS)
554 count_utf_new_found++;
557 /* symbol found in hashtable */
559 LOCK_MONITOR_EXIT(hashtable_utf->header);
565 u = u->hashlink; /* next element in external chain */
568 #if defined(ENABLE_STATISTICS)
570 count_utf_len += sizeof(utf) + length + 1;
573 /* location in hashtable found, create new utf element */
575 u->blength = length; /* length in bytes of utfstring */
/* the new element is linked in front of the slot's existing chain */
576 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
/* one extra byte is allocated for the terminating zero written below */
577 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
579 memcpy(u->text, text, length); /* copy utf-text */
580 u->text[length] = '\0';
582 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
583 hashtable_utf->entries++; /* update number of entries */
/* grow once there are more than two entries per slot on average */
585 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
587 /* reorganization of hashtable, average length of the external
588 chains is approx. 2 */
590 hashtable *newhash; /* the new hashtable */
596 /* create new hashtable, double the size */
598 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
600 #if defined(ENABLE_STATISTICS)
602 count_utf_len += sizeof(utf*) * hashtable_utf->size;
605 /* transfer elements to new hashtable */
607 for (i = 0; i < hashtable_utf->size; i++) {
608 u = hashtable_utf->ptr[i];
/* every element is re-hashed and masked with the new table's size */
612 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
614 u->hashlink = (utf *) newhash->ptr[slot];
615 newhash->ptr[slot] = u;
617 /* follow link in external hash chain */
623 /* dispose old table */
625 hashtable_free(hashtable_utf);
627 hashtable_utf = newhash;
/* NOTE(review): after a resize hashtable_utf points to newhash, so this EXIT
   uses the new table's header while the ENTER above used the old table's --
   presumably hashtable_resize carries the lock header over; confirm. */
630 LOCK_MONITOR_EXIT(hashtable_utf->header);
636 /* utf_new_u2 ******************************************************************
638 Make utf symbol from u2 array, if isclassname is true '.' is
641 *******************************************************************************/
643 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
645 char *buffer; /* memory buffer for unicode characters */
646 char *pos; /* pointer to current position in buffer */
647 u4 left; /* unicode characters left */
648 u4 buflength; /* utf length in bytes of the u2 array */
649 utf *result; /* resulting utf-string */
652 /* determine utf length in bytes and allocate memory */
654 buflength = u2_utflength(unicode_pos, unicode_length);
655 buffer = MNEW(char, buflength);
660 for (i = 0; i++ < unicode_length; unicode_pos++) {
661 /* next unicode character */
/* 1-byte form: non-zero values below 0x80 only; zero fails this test and is
   handled by the 2-byte branch below */
664 if ((c != 0) && (c < 0x80)) {
/* left is unsigned, hence the cast to detect that the budget went negative */
667 if ((int) left < 0) break;
668 /* convert classname */
669 if (isclassname && c == '.')
/* 2-byte form: lead byte carries the 0xC0 marker */
674 } else if (c < 0x800) {
676 unsigned char high = c >> 6;
677 unsigned char low = c & 0x3F;
679 if ((int) left < 0) break;
680 *pos++ = high | 0xC0;
/* 3-byte form for everything else: lead byte carries the 0xE0 marker */
686 char mid = (c >> 6) & 0x3F;
689 if ((int) left < 0) break;
690 *pos++ = high | 0xE0;
696 /* insert utf-string into symbol-table */
/* NOTE(review): buflength is a u4 while utf_new takes a u2 length, so a
   value above 0xFFFF would be narrowed in this call -- confirm callers
   cannot reach that. */
697 result = utf_new(buffer,buflength);
/* utf_new copies the text into its own allocation, so the scratch buffer can
   be released here */
699 MFREE(buffer, char, buflength);
705 /* utf_new_char ****************************************************************
707 Creates a new utf symbol, the text for this symbol is passed as a
708 c-string ( = char* ).
710 *******************************************************************************/
712 utf *utf_new_char(const char *text)
714 return utf_new(text, strlen(text));
718 /* utf_new_char_classname ******************************************************
720 Creates a new utf symbol, the text for this symbol is passed as a
721 c-string ( = char* ) "." characters are going to be replaced by
722 "/". Since the above function is used often, this is a separate
723 function, instead of an if.
725 *******************************************************************************/
727 utf *utf_new_char_classname(const char *text)
/* a private copy is only made when there is at least one '.' to replace */
729 if (strchr(text, '.')) {
/* NOTE(review): the visible lines do not test the strdup result for NULL
   before it is written through -- confirm. */
730 char *txt = strdup(text);
731 char *end = txt + strlen(txt);
/* rewrite every '.' of the copy to '/' in place */
735 for (c = txt; c < end; c++)
736 if (*c == '.') *c = '/';
738 tmpRes = utf_new(txt, strlen(txt));
/* the other return path hands the caller's string to utf_new unchanged */
744 return utf_new(text, strlen(text));
748 /* utf_nextu2 ******************************************************************
750 Read the next unicode character from the utf string and increment
751 the utf-string pointer accordingly.
753 CAUTION: This function is unsafe for input that was not checked
756 *******************************************************************************/
758 u2 utf_nextu2(char **utf_ptr)
760 /* uncompressed unicode character */
762 /* current position in utf text */
/* the text is viewed as unsigned bytes so that the shifts and masks below
   work on values 0..255 */
763 unsigned char *utf = (unsigned char *) (*utf_ptr);
764 /* bytes representing the unicode character */
765 unsigned char ch1, ch2, ch3;
766 /* number of bytes used to represent the unicode character */
/* dispatch on the upper four bits of the first byte */
769 switch ((ch1 = utf[0]) >> 4) {
770 default: /* 1 byte */
774 case 0xD: /* 2 bytes */
/* the second byte must have the form 10xxxxxx */
775 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
776 unsigned char high = ch1 & 0x1F;
777 unsigned char low = ch2 & 0x3F;
778 unicode_char = (high << 6) + low;
783 case 0xE: /* 2 or 3 bytes */
/* utf[1] and utf[2] are read without any length information; this is the
   reason the function is only meant for already checked input */
784 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
785 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
786 unsigned char low = ch3 & 0x3f;
787 unsigned char mid = ch2 & 0x3f;
788 unsigned char high = ch1 & 0x0f;
/* 4 + 6 + 6 payload bits are joined into one 16-bit value */
789 unicode_char = (((high << 6) + mid) << 6) + low;
797 /* update position in utf-text */
798 *utf_ptr = (char *) (utf + len);
804 /* utf_bytes *******************************************************************
806 Determine number of bytes (aka. octets) in the utf string.
809 u............utf string
812 The number of octets of this utf string.
813 There is _no_ terminating zero included in this count.
815 *******************************************************************************/
822 /* utf_get_number_of_u2s_for_buffer ********************************************
824 Determine number of UTF-16 u2s in the given UTF-8 buffer
826 CAUTION: This function is unsafe for input that was not checked
829 CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
830 to an array of u2s (UTF-16) and want to know how many of them you will get.
831 All other uses of this function are probably wrong.
834 buffer........points to first char in buffer
835 blength.......number of _bytes_ in the buffer
838 the number of u2s needed to hold this string in UTF-16 encoding.
839 There is _no_ terminating zero included in this count.
841 NOTE: Unlike utf_get_number_of_u2s, this function never throws an
844 *******************************************************************************/
846 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
848 const char *endpos; /* points behind utf string */
849 const char *utf_ptr; /* current position in utf text */
850 u4 len = 0; /* number of unicode characters */
853 endpos = utf_ptr + blength;
855 while (utf_ptr < endpos) {
857 /* next unicode character */
/* the cast drops the const qualifier only to match utf_nextu2's parameter
   type; the return value is ignored here, the call just advances utf_ptr */
858 utf_nextu2((char **)&utf_ptr);
/* a sequence running past the end of the buffer is only caught by this
   assertion, i.e. not at all when assertions are compiled out */
861 assert(utf_ptr == endpos);
867 /* utf_get_number_of_u2s *******************************************************
869 Determine number of UTF-16 u2s in the utf string.
871 CAUTION: This function is unsafe for input that was not checked
874 CAUTION: Use this function *only* when you want to convert a utf string
875 to an array of u2s and want to know how many of them you will get.
876 All other uses of this function are probably wrong.
879 u............utf string
882 the number of u2s needed to hold this string in UTF-16 encoding.
883 There is _no_ terminating zero included in this count.
884 XXX 0 if a NullPointerException has been thrown (see below)
886 *******************************************************************************/
888 u4 utf_get_number_of_u2s(utf *u)
890 char *endpos; /* points behind utf string */
891 char *utf_ptr; /* current position in utf text */
892 u4 len = 0; /* number of unicode characters */
894 /* XXX this is probably not checked by most callers! Review this after */
895 /* the invalid uses of this function have been eliminated */
897 exceptions_throw_nullpointerexception();
904 while (utf_ptr < endpos) {
906 /* next unicode character */
/* only the pointer advance matters here; the decoded value is discarded */
907 utf_nextu2(&utf_ptr);
/* if the last sequence stepped over endpos the string is reported as
   illegal */
910 if (utf_ptr != endpos)
911 /* string ended abruptly */
912 throw_cacao_exception_exit(string_java_lang_InternalError,
913 "Illegal utf8 string");
919 /* utf8_safe_number_of_u2s *****************************************************
921 Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
922 (For invalid UTF-8 the U+fffd replacement character will be counted.)
924 This function is safe even for invalid UTF-8 strings.
927 text..........zero-terminated(!) UTF-8 string (may be invalid)
929 nbytes........strlen(text). (This is needed to completely emulate
933 the number of u2s needed to hold this string in UTF-16 encoding.
934 There is _no_ terminating zero included in this count.
936 *******************************************************************************/
938 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
939 register const unsigned char *t;
942 register const unsigned char *tlimit;
953 t = (const unsigned char *) text;
956 /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
962 /* highest bit set, non-ASCII character */
/* the lead byte's high bits select the expected sequence length */
964 if ((byte & 0xe0) == 0xc0) {
965 /* 2-byte: should be 110..... 10...... ? */
967 if ((*t++ & 0xc0) == 0x80)
972 else if ((byte & 0xf0) == 0xe0) {
973 /* 3-byte: should be 1110.... 10...... 10...... */
/* an early return adds one to the count before stopping */
977 return len + 1; /* invalid, stop here */
979 if ((*t++ & 0xc0) == 0x80) {
980 if ((*t++ & 0xc0) == 0x80)
988 else if ((byte & 0xf8) == 0xf0) {
989 /* 4-byte: should be 11110... 10...... 10...... 10...... */
993 return len + 1; /* invalid, stop here */
995 if (((byte1 = *t++) & 0xc0) == 0x80) {
996 if (((byte2 = *t++) & 0xc0) == 0x80) {
997 if (((byte3 = *t++) & 0xc0) == 0x80) {
998 /* valid 4-byte UTF-8? */
/* join the 3 + 6 + 6 + 6 payload bits into one value */
999 value = ((byte & 0x07) << 18)
1000 | ((byte1 & 0x3f) << 12)
1001 | ((byte2 & 0x3f) << 6)
1002 | ((byte3 & 0x3f) );
1004 if (value > 0x10FFFF)
1006 else if (value > 0xFFFF)
1007 len += 1; /* we need surrogates */
1009 ; /* 16bit suffice */
1020 else if ((byte & 0xfc) == 0xf8) {
1021 /* invalid 5-byte */
1023 return len + 1; /* invalid, stop here */
/* loop while skip is non-zero and the current byte has the form 10xxxxxx */
1026 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1029 else if ((byte & 0xfe) == 0xfc) {
1030 /* invalid 6-byte */
1032 return len + 1; /* invalid, stop here */
1035 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1047 /* ASCII character, common case */
1057 /* utf8_safe_convert_to_u2s ****************************************************
1059 Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1060 (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1061 Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1063 This function is safe even for invalid UTF-8 strings.
1066 text..........zero-terminated(!) UTF-8 string (may be invalid)
1068 nbytes........strlen(text). (This is needed to completely emulate
1070 buffer........a preallocated array of u2s to receive the decoded
1071 string. Use utf8_safe_number_of_u2s to get the
1072 required number of u2s for allocating this.
1074 *******************************************************************************/
/* value written to the output wherever the input is not valid UTF-8 */
1076 #define UNICODE_REPLACEMENT 0xfffd
1078 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1079 register const unsigned char *t;
1081 register const unsigned char *tlimit;
1089 assert(nbytes >= 0);
1091 t = (const unsigned char *) text;
/* tlimit marks the first byte behind the input; the length checks below
   compare against it */
1092 tlimit = t + nbytes;
1094 /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1100 /* highest bit set, non-ASCII character */
1102 if ((byte & 0xe0) == 0xc0) {
1103 /* 2-byte: should be 110..... 10...... */
1105 if (((byte1 = *t++) & 0xc0) == 0x80) {
1106 /* valid 2-byte UTF-8 */
1107 *buffer++ = ((byte & 0x1f) << 6)
1108 | ((byte1 & 0x3f) );
1111 *buffer++ = UNICODE_REPLACEMENT;
1115 else if ((byte & 0xf0) == 0xe0) {
1116 /* 3-byte: should be 1110.... 10...... 10...... */
/* not enough bytes left for the two continuation bytes */
1118 if (t + 2 > tlimit) {
1119 *buffer++ = UNICODE_REPLACEMENT;
1123 if (((byte1 = *t++) & 0xc0) == 0x80) {
1124 if (((byte2 = *t++) & 0xc0) == 0x80) {
1125 /* valid 3-byte UTF-8 */
1126 *buffer++ = ((byte & 0x0f) << 12)
1127 | ((byte1 & 0x3f) << 6)
1128 | ((byte2 & 0x3f) );
1131 *buffer++ = UNICODE_REPLACEMENT;
1136 *buffer++ = UNICODE_REPLACEMENT;
1140 else if ((byte & 0xf8) == 0xf0) {
1141 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1143 if (t + 3 > tlimit) {
1144 *buffer++ = UNICODE_REPLACEMENT;
1148 if (((byte1 = *t++) & 0xc0) == 0x80) {
1149 if (((byte2 = *t++) & 0xc0) == 0x80) {
1150 if (((byte3 = *t++) & 0xc0) == 0x80) {
1151 /* valid 4-byte UTF-8? */
1152 value = ((byte & 0x07) << 18)
1153 | ((byte1 & 0x3f) << 12)
1154 | ((byte2 & 0x3f) << 6)
1155 | ((byte3 & 0x3f) );
1157 if (value > 0x10FFFF) {
1158 *buffer++ = UNICODE_REPLACEMENT;
1160 else if (value > 0xFFFF) {
1161 /* we need surrogates */
/* in this branch value >> 10 lies in 0x40..0x43f, so subtracting 0x40
   leaves the ten payload bits of the first surrogate; the second surrogate
   takes the low ten bits of value */
1162 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1163 *buffer++ = 0xdc00 | (value & 0x03ff);
1166 *buffer++ = value; /* 16bit suffice */
1169 *buffer++ = UNICODE_REPLACEMENT;
1174 *buffer++ = UNICODE_REPLACEMENT;
1179 *buffer++ = UNICODE_REPLACEMENT;
/* lead byte 111110xx: the whole sequence yields one replacement character */
1183 else if ((byte & 0xfc) == 0xf8) {
1184 if (t + 4 > tlimit) {
1185 *buffer++ = UNICODE_REPLACEMENT;
/* loop while skip is non-zero and the current byte has the form 10xxxxxx */
1190 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1192 *buffer++ = UNICODE_REPLACEMENT;
/* lead byte 1111110x: handled like the case above with one more byte */
1194 else if ((byte & 0xfe) == 0xfc) {
1195 if (t + 5 > tlimit) {
1196 *buffer++ = UNICODE_REPLACEMENT;
1201 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1203 *buffer++ = UNICODE_REPLACEMENT;
1206 *buffer++ = UNICODE_REPLACEMENT;
1214 /* ASCII character, common case */
1222 /* u2_utflength ****************************************************************
1224 Returns the utf length in bytes of a u2 array.
1226 *******************************************************************************/
1228 u4 u2_utflength(u2 *text, u4 u2_length)
1230 u4 result_len = 0; /* utf length in bytes */
1231 u2 ch; /* current unicode character */
1234 for (len = 0; len < u2_length; len++) {
1235 /* next unicode character */
1238 /* determine bytes required to store unicode character as utf */
/* zero fails this first test and is therefore sized by the ch < 0x800 branch;
   utf_new_u2 classifies characters the same way when it encodes them */
1239 if (ch && (ch < 0x80))
1241 else if (ch < 0x800)
1251 /* utf_copy ********************************************************************
1253 Copy the given utf string byte-for-byte to a buffer.
1256 buffer.......the buffer
1257 u............the utf string
1259 *******************************************************************************/
1261 void utf_copy(char *buffer, utf *u)
1263 /* our utf strings are zero-terminated (done by utf_new) */
1264 MCOPY(buffer, u->text, char, u->blength + 1);
1268 /* utf_cat *********************************************************************
1270 Append the given utf string byte-for-byte to a buffer.
1273 buffer.......the buffer
1274 u............the utf string
1276 *******************************************************************************/
1278 void utf_cat(char *buffer, utf *u)
1280 /* our utf strings are zero-terminated (done by utf_new) */
1281 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1285 /* utf_copy_classname **********************************************************
1287 Copy the given utf classname byte-for-byte to a buffer.
1288 '/' is replaced by '.'
1291 buffer.......the buffer
1292 u............the utf string
1294 *******************************************************************************/
1296 void utf_copy_classname(char *buffer, utf *u)
/* the end pointer is placed one byte behind UTF_END(u) so that the loop also
   processes the terminating zero */
1305 endptr = UTF_END(u) + 1; /* utfs are zero-terminated by utf_new */
1307 while (srcptr != endptr) {
1316 /* utf_cat_classname ***********************************************************
1318 Append the given utf classname byte-for-byte to a buffer.
1319 '/' is replaced by '.'
1322 buffer.......the buffer
1323 u............the utf string
1325 *******************************************************************************/
1327 void utf_cat_classname(char *buffer, utf *u)
1329 utf_copy_classname(buffer + strlen(buffer), u);
1332 /* utf_display_printable_ascii *************************************************
1334 Write utf symbol to stdout (for debugging purposes).
1335 Non-printable and non-ASCII characters are printed as '?'.
1337 *******************************************************************************/
1339 void utf_display_printable_ascii(utf *u)
1341 char *endpos; /* points behind utf string */
1342 char *utf_ptr; /* current position in utf text */
1350 endpos = UTF_END(u);
1353 while (utf_ptr < endpos) {
1354 /* read next unicode character */
1356 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the upper bound includes 127 (DEL), which is a control
   character rather than a printable one -- confirm this is intended. */
1358 if ((c >= 32) && (c <= 127))
1368 /* utf_display_printable_ascii_classname ***************************************
1370 Write utf symbol to stdout with `/' converted to `.' (for debugging
1372 Non-printable and non-ASCII characters are printed as '?'.
1374 *******************************************************************************/
1376 void utf_display_printable_ascii_classname(utf *u)
1378 char *endpos; /* points behind utf string */
1379 char *utf_ptr; /* current position in utf text */
1387 endpos = UTF_END(u);
1390 while (utf_ptr < endpos) {
1391 /* read next unicode character */
1393 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the upper bound includes 127 (DEL), which is a control
   character rather than a printable one -- confirm this is intended. */
1398 if ((c >= 32) && (c <= 127))
1408 /* utf_sprint_convert_to_latin1 ************************************************
1410 Write utf symbol into c-string (for debugging purposes).
1411 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1414 *******************************************************************************/
1416 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
/* No buffer size is passed in, so the caller has to supply a buffer that is
   large enough for the converted text and its terminator. */
1418 char *endpos; /* points behind utf string */
1419 char *utf_ptr; /* current position in utf text */
1420 u2 pos = 0; /* position in c-string */
/* the literal text "NULL" is written into the buffer by this call */
1423 strcpy(buffer, "NULL");
1427 endpos = UTF_END(u);
1430 while (utf_ptr < endpos)
1431 /* copy next unicode character */
/* the 16-bit result of utf_nextu2 is stored into a single char */
1432 buffer[pos++] = utf_nextu2(&utf_ptr);
1434 /* terminate string */
1439 /* utf_sprint_convert_to_latin1_classname **************************************
1441 Write utf symbol into c-string with `/' converted to `.' (for debugging
1443 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1446 *******************************************************************************/
1448 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1450 char *endpos; /* points behind utf string */
1451 char *utf_ptr; /* current position in utf text */
1452 u2 pos = 0; /* position in c-string */
1455 strcpy(buffer, "NULL");
1459 endpos = UTF_END(u);
1462 while (utf_ptr < endpos) {
1463 /* copy next unicode character */
1464 u2 c = utf_nextu2(&utf_ptr);
1465 if (c == '/') c = '.';
1469 /* terminate string */
1474 /* utf_strcat_convert_to_latin1 ************************************************
1476 Like libc strcat, but uses an utf8 string.
1477 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1480 *******************************************************************************/
1482 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1484 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1488 /* utf_strcat_convert_to_latin1_classname **************************************
1490 Like libc strcat, but uses an utf8 string.
1491 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1494 *******************************************************************************/
1496 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1498 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1502 /* utf_fprint_printable_ascii **************************************************
1504 Write utf symbol into file.
1505 Non-printable and non-ASCII characters are printed as '?'.
1507 *******************************************************************************/
1509 void utf_fprint_printable_ascii(FILE *file, utf *u)
1511 char *endpos; /* points behind utf string */
1512 char *utf_ptr; /* current position in utf text */
1517 endpos = UTF_END(u);
1520 while (utf_ptr < endpos) {
1521 /* read next unicode character */
1522 u2 c = utf_nextu2(&utf_ptr);
1524 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1525 else fprintf(file, "?");
1530 /* utf_fprint_printable_ascii_classname ****************************************
1532 Write utf symbol into file with `/' converted to `.'.
1533 Non-printable and non-ASCII characters are printed as '?'.
1535 *******************************************************************************/
1537 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1539 char *endpos; /* points behind utf string */
1540 char *utf_ptr; /* current position in utf text */
1545 endpos = UTF_END(u);
1548 while (utf_ptr < endpos) {
1549 /* read next unicode character */
1550 u2 c = utf_nextu2(&utf_ptr);
1551 if (c == '/') c = '.';
1553 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1554 else fprintf(file, "?");
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string.

   utf_ptr...points to first character
   end_pos...points after last character

   Accepted are ASCII bytes other than 0x00 and two- or three-byte
   sequences with well-formed continuation bytes. A decoded value of
   zero is accepted in its two-byte form only. Overlong encodings,
   surrogates and the code points 0xfffe/0xffff are deliberately NOT
   rejected, since they seem to be allowed in Java classfiles.

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int           bytes;                /* bytes that are still unread        */
	int           len;                  /* continuation bytes of a sequence   */
	int           i;
	unsigned char c;                    /* current byte, free of sign issues  */
	unsigned long v;                    /* decoded value of current sequence  */

	if (end_pos < utf_ptr) return false;
	bytes = (int) (end_pos - utf_ptr);

	while (bytes--) {
		c = (unsigned char) *utf_ptr++;

		if (!c) return false;                     /* 0x00 is not allowed */
		if ((c & 0x80) == 0) continue;            /* ASCII */

		if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
		else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
		else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
		else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
		else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
		else return false;                        /* invalid leading byte */

		if (len > 2) return false;                /* Java limitation */

		/* payload bits of the leading byte */

		v = (unsigned long) c & (0x3f >> len);

		if ((bytes -= len) < 0) return false;     /* missing bytes */

		for (i = len; i--; ) {
			c = (unsigned char) *utf_ptr++;

			if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
				return false;

			v = (v << 6) | (c & 0x3f);
		}

		if (v == 0) {
			/* zero must be written as the two-byte sequence 0xc0 0x80 */

			if (len != 1) return false;           /* Java special */
		}

		/* No further checks on v: Sun Java seems to allow overlong
		   encodings, and surrogates (0xd800-0xdfff) as well as 0xfffe
		   and 0xffff seem to be allowed in Java classfiles. */
	}

	/* every byte was consumed without finding a violation */

	return true;
}
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings and control
   characters.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr) return false; /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20) return false; /* disallow control characters */

		/* The two-byte sequence 0xc0 0x80 encodes zero. Only peek at
		   the second byte while it still lies inside the string, so a
		   trailing 0xc0 never causes a read at end_pos. */

		if (c == 0xc0 && utf_ptr < end_pos
			&& (unsigned char) *utf_ptr == 0x80)  /* disallow zero */
			return false;
	}

	return true;
}
1654 bool is_valid_name_utf(utf *u)
1656 return is_valid_name(u->text, UTF_END(u));
1660 /* utf_show ********************************************************************
1662 Writes the utf symbols in the utfhash to stdout and displays the
1663 number of external hash chains grouped according to the chainlength
1664 (for debugging purposes).
1666 *******************************************************************************/
1668 #if !defined(NDEBUG)
1672 #define CHAIN_LIMIT 20 /* limit for seperated enumeration */
1674 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1675 u4 max_chainlength = 0; /* maximum length of the chains */
1676 u4 sum_chainlength = 0; /* sum of the chainlengths */
1677 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1680 printf("UTF-HASH:\n");
1682 /* show element of utf-hashtable */
1684 for (i = 0; i < hashtable_utf->size; i++) {
1685 utf *u = hashtable_utf->ptr[i];
1688 printf("SLOT %d: ", (int) i);
1692 utf_display_printable_ascii(u);
1700 printf("UTF-HASH: %d slots for %d entries\n",
1701 (int) hashtable_utf->size, (int) hashtable_utf->entries );
1703 if (hashtable_utf->entries == 0)
1706 printf("chains:\n chainlength number of chains %% of utfstrings\n");
1708 for (i=0;i<CHAIN_LIMIT;i++)
1711 /* count numbers of hashchains according to their length */
1712 for (i=0; i<hashtable_utf->size; i++) {
1714 utf *u = (utf*) hashtable_utf->ptr[i];
1715 u4 chain_length = 0;
1717 /* determine chainlength */
1723 /* update sum of all chainlengths */
1724 sum_chainlength+=chain_length;
1726 /* determine the maximum length of the chains */
1727 if (chain_length>max_chainlength)
1728 max_chainlength = chain_length;
1730 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1731 if (chain_length>=CHAIN_LIMIT) {
1732 beyond_limit+=chain_length;
1733 chain_length=CHAIN_LIMIT-1;
1736 /* update number of hashchains of current length */
1737 chain_count[chain_length]++;
1740 /* display results */
1741 for (i=1;i<CHAIN_LIMIT-1;i++)
1742 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1744 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1747 printf("max. chainlength:%5d\n",max_chainlength);
1749 /* avg. chainlength = sum of chainlengths / number of chains */
1750 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1752 #endif /* !defined(NDEBUG) */
1756 * These are local overrides for various environment variables in Emacs.
1757 * Please do not remove this and leave it at the end of the file, where
1758 * Emacs will automagically detect them.
1759 * ---------------------------------------------------------------------
1762 * indent-tabs-mode: t
1766 * vim:noexpandtab:sw=4:ts=4: