1 /* src/vm/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4 C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5 E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6 J. Wenninger, Institut f. Computersprachen - TU Wien
8 This file is part of CACAO.
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License as
12 published by the Free Software Foundation; either version 2, or (at
13 your option) any later version.
15 This program is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25 Contact: cacao@cacaojvm.org
27 Authors: Reinhard Grafl
33 $Id: utf8.c 6244 2006-12-27 15:15:31Z twisti $
45 #include "mm/memory.h"
47 #if defined(ENABLE_THREADS)
48 # include "threads/native/lock.h"
50 # include "threads/none/lock.h"
53 #include "vm/builtin.h"
54 #include "vm/exceptions.h"
55 #include "vm/hashtable.h"
56 #include "vm/options.h"
57 #include "vm/statistics.h"
58 #include "vm/stringlocal.h"
62 /* global variables ***********************************************************/
64 /* hashsize must be power of 2 */
/* (utf_new picks a slot with `key & (size - 1)`, which can only reach every
   slot when size is a power of two; utf_new grows the table by doubling) */
66 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
68 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
71 /* utf-symbols for pointer comparison of frequently used strings **************/
73 utf *utf_java_lang_Object;
75 utf *utf_java_lang_Class;
76 utf *utf_java_lang_ClassLoader;
77 utf *utf_java_lang_Cloneable;
78 utf *utf_java_lang_SecurityManager;
79 utf *utf_java_lang_String;
80 utf *utf_java_lang_System;
81 utf *utf_java_lang_ThreadGroup;
82 utf *utf_java_io_Serializable;
84 utf *utf_java_lang_Throwable;
85 utf *utf_java_lang_Error;
86 utf *utf_java_lang_LinkageError;
87 utf *utf_java_lang_NoClassDefFoundError;
88 utf *utf_java_lang_OutOfMemoryError;
89 utf *utf_java_lang_VirtualMachineError;
91 #if defined(ENABLE_JAVASE)
92 utf *utf_java_lang_AbstractMethodError;
93 utf *utf_java_lang_NoSuchMethodError;
96 #if defined(WITH_CLASSPATH_GNU)
97 utf *utf_java_lang_VMThrowable;
100 utf *utf_java_lang_Exception;
101 utf *utf_java_lang_ClassCastException;
102 utf *utf_java_lang_ClassNotFoundException;
103 utf *utf_java_lang_IllegalArgumentException;
104 utf *utf_java_lang_IllegalMonitorStateException;
106 utf *utf_java_lang_NullPointerException;
108 #if defined(ENABLE_JAVASE)
109 utf* utf_java_lang_Void;
112 utf* utf_java_lang_Boolean;
113 utf* utf_java_lang_Byte;
114 utf* utf_java_lang_Character;
115 utf* utf_java_lang_Short;
116 utf* utf_java_lang_Integer;
117 utf* utf_java_lang_Long;
118 utf* utf_java_lang_Float;
119 utf* utf_java_lang_Double;
121 #if defined(ENABLE_JAVASE)
122 utf *utf_java_lang_StackTraceElement;
123 utf *utf_java_lang_reflect_Constructor;
124 utf *utf_java_lang_reflect_Field;
125 utf *utf_java_lang_reflect_Method;
126 utf *utf_java_util_Vector;
129 utf *utf_InnerClasses; /* InnerClasses */
130 utf *utf_ConstantValue; /* ConstantValue */
131 utf *utf_Code; /* Code */
132 utf *utf_Exceptions; /* Exceptions */
133 utf *utf_LineNumberTable; /* LineNumberTable */
134 utf *utf_SourceFile; /* SourceFile */
136 #if defined(ENABLE_JAVASE)
137 utf *utf_EnclosingMethod;
139 utf *utf_RuntimeVisibleAnnotations;
140 utf *utf_StackMapTable;
143 utf *utf_init; /* <init> */
144 utf *utf_clinit; /* <clinit> */
145 utf *utf_clone; /* clone */
146 utf *utf_finalize; /* finalize */
147 utf *utf_run; /* run */
152 utf *utf_removeThread;
157 utf *utf_fillInStackTrace;
158 utf *utf_getSystemClassLoader;
160 utf *utf_printStackTrace;
171 utf *utf_void__void; /* ()V */
172 utf *utf_boolean__void; /* (Z)V */
173 utf *utf_byte__void; /* (B)V */
174 utf *utf_char__void; /* (C)V */
175 utf *utf_short__void; /* (S)V */
176 utf *utf_int__void; /* (I)V */
177 utf *utf_long__void; /* (J)V */
178 utf *utf_float__void; /* (F)V */
179 utf *utf_double__void; /* (D)V */
181 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
182 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
183 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
184 utf *utf_java_lang_Object__java_lang_Object;
185 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
186 utf *utf_java_lang_String__java_lang_Class;
187 utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */
188 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
190 utf *utf_not_named_yet; /* special name for unnamed classes */
192 utf *array_packagename;
195 /* utf_init ********************************************************************
197 Initializes the utf8 subsystem.
199 *******************************************************************************/
203 /* create utf8 hashtable */
205 hashtable_utf = NEW(hashtable);
207 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
209 #if defined(ENABLE_STATISTICS)
211 count_utf_len += sizeof(utf*) * hashtable_utf->size;
214 /* create utf-symbols for pointer comparison of frequently used strings */
216 utf_java_lang_Object = utf_new_char("java/lang/Object");
218 utf_java_lang_Class = utf_new_char("java/lang/Class");
219 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
220 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
221 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
222 utf_java_lang_String = utf_new_char("java/lang/String");
223 utf_java_lang_System = utf_new_char("java/lang/System");
224 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
225 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
227 utf_java_lang_Throwable = utf_new_char(string_java_lang_Throwable);
228 utf_java_lang_Error = utf_new_char(string_java_lang_Error);
230 utf_java_lang_LinkageError =
231 utf_new_char(string_java_lang_LinkageError);
233 utf_java_lang_NoClassDefFoundError =
234 utf_new_char(string_java_lang_NoClassDefFoundError);
236 utf_java_lang_OutOfMemoryError =
237 utf_new_char(string_java_lang_OutOfMemoryError);
239 utf_java_lang_VirtualMachineError =
240 utf_new_char(string_java_lang_VirtualMachineError);
242 #if defined(ENABLE_JAVASE)
243 utf_java_lang_AbstractMethodError =
244 utf_new_char(string_java_lang_AbstractMethodError);
246 utf_java_lang_NoSuchMethodError =
247 utf_new_char(string_java_lang_NoSuchMethodError);
250 #if defined(WITH_CLASSPATH_GNU)
251 utf_java_lang_VMThrowable = utf_new_char(string_java_lang_VMThrowable);
254 utf_java_lang_Exception = utf_new_char(string_java_lang_Exception);
256 utf_java_lang_ClassCastException =
257 utf_new_char(string_java_lang_ClassCastException);
259 utf_java_lang_ClassNotFoundException =
260 utf_new_char(string_java_lang_ClassNotFoundException);
262 utf_java_lang_IllegalArgumentException =
263 utf_new_char(string_java_lang_IllegalArgumentException);
265 utf_java_lang_IllegalMonitorStateException =
266 utf_new_char(string_java_lang_IllegalMonitorStateException);
268 utf_java_lang_NullPointerException =
269 utf_new_char(string_java_lang_NullPointerException);
271 #if defined(ENABLE_JAVASE)
272 utf_java_lang_Void = utf_new_char("java/lang/Void");
275 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
276 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
277 utf_java_lang_Character = utf_new_char("java/lang/Character");
278 utf_java_lang_Short = utf_new_char("java/lang/Short");
279 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
280 utf_java_lang_Long = utf_new_char("java/lang/Long");
281 utf_java_lang_Float = utf_new_char("java/lang/Float");
282 utf_java_lang_Double = utf_new_char("java/lang/Double");
284 #if defined(ENABLE_JAVASE)
285 utf_java_lang_StackTraceElement =
286 utf_new_char("java/lang/StackTraceElement");
288 utf_java_lang_reflect_Constructor =
289 utf_new_char("java/lang/reflect/Constructor");
291 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
292 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
293 utf_java_util_Vector = utf_new_char("java/util/Vector");
296 utf_InnerClasses = utf_new_char("InnerClasses");
297 utf_ConstantValue = utf_new_char("ConstantValue");
298 utf_Code = utf_new_char("Code");
299 utf_Exceptions = utf_new_char("Exceptions");
300 utf_LineNumberTable = utf_new_char("LineNumberTable");
301 utf_SourceFile = utf_new_char("SourceFile");
303 #if defined(ENABLE_JAVASE)
304 utf_EnclosingMethod = utf_new_char("EnclosingMethod");
305 utf_Signature = utf_new_char("Signature");
306 utf_RuntimeVisibleAnnotations = utf_new_char("RuntimeVisibleAnnotations");
307 utf_StackMapTable = utf_new_char("StackMapTable");
310 utf_init = utf_new_char("<init>");
311 utf_clinit = utf_new_char("<clinit>");
312 utf_clone = utf_new_char("clone");
313 utf_finalize = utf_new_char("finalize");
314 utf_run = utf_new_char("run");
316 utf_add = utf_new_char("add");
317 utf_remove = utf_new_char("remove");
318 utf_addThread = utf_new_char("addThread");
319 utf_removeThread = utf_new_char("removeThread");
320 utf_put = utf_new_char("put");
321 utf_get = utf_new_char("get");
322 utf_value = utf_new_char("value");
324 utf_printStackTrace = utf_new_char("printStackTrace");
325 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
326 utf_loadClass = utf_new_char("loadClass");
327 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
329 utf_Z = utf_new_char("Z");
330 utf_B = utf_new_char("B");
331 utf_C = utf_new_char("C");
332 utf_S = utf_new_char("S");
333 utf_I = utf_new_char("I");
334 utf_J = utf_new_char("J");
335 utf_F = utf_new_char("F");
336 utf_D = utf_new_char("D");
338 utf_void__void = utf_new_char("()V");
339 utf_boolean__void = utf_new_char("(Z)V");
340 utf_byte__void = utf_new_char("(B)V");
341 utf_char__void = utf_new_char("(C)V");
342 utf_short__void = utf_new_char("(S)V");
343 utf_int__void = utf_new_char("(I)V");
344 utf_long__void = utf_new_char("(J)V");
345 utf_float__void = utf_new_char("(F)V");
346 utf_double__void = utf_new_char("(D)V");
347 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
348 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
350 utf_void__java_lang_ClassLoader =
351 utf_new_char("()Ljava/lang/ClassLoader;");
353 utf_java_lang_Object__java_lang_Object =
354 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
356 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
358 utf_java_lang_String__java_lang_Class =
359 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
361 utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V");
362 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
364 utf_null = utf_new_char("null");
365 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
366 array_packagename = utf_new_char("\t<the array package>");
368 /* everything's ok */
374 /* utf_hashkey *****************************************************************
376 The hashkey is computed from the utf-text by using up to 8
377 characters. For utf-symbols longer than 15 characters 3 characters
378 are taken from the beginning and the end, 2 characters are taken from the middle.
381 *******************************************************************************/
/* Both helper macros operate on a variable named `text` that must exist in
   the scope where they are expanded.  nbs() advances `text` as a side effect
   before it reads the byte; fbs() reads the current byte and leaves `text`
   untouched.  `val` is not parenthesized in the expansion, so only plain
   shift counts should be passed. */
383 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
384 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
/* Computes the hash key for the first `length` bytes at `text`.

   text.....start of the utf text
   length...length of the text in bytes

   For lengths 1..8 every byte of the text contributes to the key; longer
   texts are sampled.

   NOTE(review): expressions such as `fbs(0) ^ nbs(3)` read `text` and
   pre-increment it (possibly several times) inside one expression.  The
   operands of ^ are unsequenced in C, so this looks like undefined
   behaviour -- confirm, and consider splitting into separate statements. */
386 u4 utf_hashkey(const char *text, u4 length)
388 const char *start_pos = text; /* pointer to utf text */
392 case 0: /* empty string */
395 case 1: return fbs(0);
396 case 2: return fbs(0) ^ nbs(3);
397 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
398 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
399 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
400 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
401 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
402 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
409 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
418 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
427 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
439 return a ^ nbs(9) ^ nbs(10);
451 return a ^ nbs(9) ^ nbs(10);
462 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
473 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
475 default: /* 3 characters from beginning */
481 /* 2 characters from middle */
/* reposition relative to the saved start; `text` itself has been advanced */
482 text = start_pos + (length / 2);
487 /* 3 characters from end */
/* presumably one byte before the last three because nbs() increments
   before it reads -- TODO confirm against the complete source */
488 text = start_pos + length - 4;
493 return a ^ nbs(10) ^ nbs(11);
497 /* utf_full_hashkey ************************************************************
499 This function computes a hash value using all bytes in the string.
501 The algorithm is the "One-at-a-time" algorithm as published
502 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
504 *******************************************************************************/
/* text.....start of the text to hash
   length...number of bytes to hash */
506 u4 utf_full_hashkey(const char *text, u4 length)
/* bytes are accessed through an unsigned char pointer, so values >= 0x80
   are not sign-extended */
508 register const unsigned char *p = (const unsigned char *) text;
516 hash += (hash << 10);
/* presumably the final mixing steps of the one-at-a-time algorithm named in
   the header comment -- TODO confirm against the complete source */
520 hash ^= (hash >> 11);
521 hash += (hash << 15);
526 /* unicode_hashkey *************************************************************
528 Compute the hashkey of a unicode string.
530 *******************************************************************************/
/* Hashes the u2 array by handing its raw memory to utf_hashkey.

   NOTE(review): `len` is forwarded unchanged as utf_hashkey's byte length.
   If callers pass the number of u2 elements, only the first `len` bytes
   (half of the characters) can influence the key, and the key depends on
   the in-memory byte order of the u2 values -- confirm this is intended. */
532 u4 unicode_hashkey(u2 *text, u2 len)
534 return utf_hashkey((char *) text, len);
538 /* utf_new *********************************************************************
540 Creates a new utf-symbol, the text of the symbol is passed as a
541 u1-array. The function searches the utf-hashtable for a utf-symbol
542 with this text. On success the element is returned, otherwise a new
543 hashtable element is created.
545 If the number of entries in the hashtable exceeds twice the size of
546 the hashtable slots a reorganization of the hashtable is done and
547 the utf symbols are copied to a new hashtable with doubled size.
549 *******************************************************************************/
/* Looks up the symbol with the given text in hashtable_utf and inserts a new
   element when no match is found.

   text.....bytes of the symbol (copied on insertion, see the memcpy below)
   length...number of bytes; the parameter is a u2, so a symbol can be at
            most 65535 bytes long

   The whole operation runs between LOCK_MONITOR_ENTER and LOCK_MONITOR_EXIT
   on hashtable_utf->header. */
551 utf *utf_new(const char *text, u2 length)
553 u4 key; /* hashkey computed from utf-text */
554 u4 slot; /* slot in hashtable */
555 utf *u; /* hashtable element */
558 LOCK_MONITOR_ENTER(hashtable_utf->header);
560 #if defined(ENABLE_STATISTICS)
565 key = utf_hashkey(text, length);
/* masking with size - 1 relies on the table size being a power of two */
566 slot = key & (hashtable_utf->size - 1);
567 u = hashtable_utf->ptr[slot];
569 /* search external hash chain for utf-symbol */
/* cheap length comparison first, byte-wise comparison only on equal length */
572 if (u->blength == length) {
573 /* compare text of hashtable elements */
575 for (i = 0; i < length; i++)
576 if (text[i] != u->text[i])
579 #if defined(ENABLE_STATISTICS)
581 count_utf_new_found++;
584 /* symbol found in hashtable */
586 LOCK_MONITOR_EXIT(hashtable_utf->header);
592 u = u->hashlink; /* next element in external chain */
595 #if defined(ENABLE_STATISTICS)
597 count_utf_len += sizeof(utf) + length + 1;
600 /* location in hashtable found, create new utf element */
602 u->blength = length; /* length in bytes of utfstring */
/* the new element becomes the head of the slot's chain */
603 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
/* one extra byte for the terminating zero written below */
604 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
606 memcpy(u->text, text, length); /* copy utf-text */
607 u->text[length] = '\0';
609 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
610 hashtable_utf->entries++; /* update number of entries */
612 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
614 /* reorganization of hashtable, average length of the external
615 chains is approx. 2 */
617 hashtable *newhash; /* the new hashtable */
623 /* create new hashtable, double the size */
625 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
627 #if defined(ENABLE_STATISTICS)
629 count_utf_len += sizeof(utf*) * hashtable_utf->size;
632 /* transfer elements to new hashtable */
634 for (i = 0; i < hashtable_utf->size; i++) {
635 u = hashtable_utf->ptr[i];
/* slots are recomputed because the mask of the larger table differs */
639 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
641 u->hashlink = (utf *) newhash->ptr[slot];
642 newhash->ptr[slot] = u;
644 /* follow link in external hash chain */
650 /* dispose old table */
652 hashtable_free(hashtable_utf);
654 hashtable_utf = newhash;
/* NOTE(review): after a resize hashtable_utf already refers to newhash, so
   this exit uses newhash->header while the enter above used the old table's
   header; presumably hashtable_resize carries the lock header over --
   confirm */
657 LOCK_MONITOR_EXIT(hashtable_utf->header);
663 /* utf_new_u2 ******************************************************************
665 Make utf symbol from u2 array, if isclassname is true '.' is replaced by '/'.
668 *******************************************************************************/
/* unicode_pos......first u2 of the input
   unicode_length...number of u2s to convert
   isclassname......when true, '.' characters get the classname treatment
                    (see the check in the 1-byte branch)

   The u2s are encoded into a temporary buffer, the buffer is interned with
   utf_new and then released again. */
670 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
672 char *buffer; /* memory buffer for unicode characters */
673 char *pos; /* pointer to current position in buffer */
674 u4 left; /* unicode characters left */
675 u4 buflength; /* utf length in bytes of the u2 array */
676 utf *result; /* resulting utf-string */
679 /* determine utf length in bytes and allocate memory */
681 buflength = u2_utflength(unicode_pos, unicode_length);
682 buffer = MNEW(char, buflength);
687 for (i = 0; i++ < unicode_length; unicode_pos++) {
688 /* next unicode character */
/* a zero character fails this test and is therefore encoded by the 2-byte
   branch below, never as a single zero byte */
691 if ((c != 0) && (c < 0x80)) {
/* `left` is unsigned; the cast to int detects a wrap below zero (the
   matching subtraction is presumably on a line not shown here) */
694 if ((int) left < 0) break;
695 /* convert classname */
696 if (isclassname && c == '.')
701 } else if (c < 0x800) {
703 unsigned char high = c >> 6;
704 unsigned char low = c & 0x3F;
706 if ((int) left < 0) break;
707 *pos++ = high | 0xC0;
713 char mid = (c >> 6) & 0x3F;
716 if ((int) left < 0) break;
717 *pos++ = high | 0xE0;
723 /* insert utf-string into symbol-table */
/* NOTE(review): buflength is a u4 but utf_new takes a u2 length, so an
   encoded length above 65535 would be narrowed here -- confirm callers
   cannot reach that size */
724 result = utf_new(buffer,buflength);
/* utf_new keeps its own copy of the text, so the scratch buffer can go */
726 MFREE(buffer, char, buflength);
732 /* utf_new_char ****************************************************************
734 Creates a new utf symbol, the text for this symbol is passed as a
735 c-string ( = char* ).
737 *******************************************************************************/
/* text...zero-terminated C string; its strlen() is used as the symbol length

   NOTE(review): strlen() yields a size_t while utf_new takes a u2 length, so
   strings longer than 65535 bytes would be narrowed -- confirm no caller
   passes such a string. */
739 utf *utf_new_char(const char *text)
741 return utf_new(text, strlen(text));
745 /* utf_new_char_classname ******************************************************
747 Creates a new utf symbol, the text for this symbol is passed as a
748 c-string ( = char* ) "." characters are going to be replaced by
749 "/". Since the above function is used often, this is a separate
750 function, instead of an if.
752 *******************************************************************************/
/* text...zero-terminated C string, possibly containing '.' separators */
754 utf *utf_new_char_classname(const char *text)
/* a private copy is only made when there is at least one '.' to rewrite */
756 if (strchr(text, '.')) {
/* NOTE(review): no check of the strdup result is visible before it is used;
   utf_new copies the text, so this copy has to be released afterwards --
   the release is not visible in this listing, confirm it exists */
757 char *txt = strdup(text);
758 char *end = txt + strlen(txt);
762 for (c = txt; c < end; c++)
763 if (*c == '.') *c = '/';
765 tmpRes = utf_new(txt, strlen(txt));
/* nothing to rewrite: intern the caller's string directly */
771 return utf_new(text, strlen(text));
775 /* utf_nextu2 ******************************************************************
777 Read the next unicode character from the utf string and increment
778 the utf-string pointer accordingly.
780 CAUTION: This function is unsafe for input that was not checked
783 *******************************************************************************/
/* utf_ptr...in/out: address of the current read position; it is advanced
             past the bytes that were consumed

   No end-of-text limit is passed in: the continuation bytes utf[1] and
   utf[2] are read whenever the lead byte asks for them, so the caller must
   guarantee that they are readable. */
785 u2 utf_nextu2(char **utf_ptr)
787 /* uncompressed unicode character */
789 /* current position in utf text */
790 unsigned char *utf = (unsigned char *) (*utf_ptr);
791 /* bytes representing the unicode character */
792 unsigned char ch1, ch2, ch3;
793 /* number of bytes used to represent the unicode character */
/* the upper four bits of the first byte select the encoding length */
796 switch ((ch1 = utf[0]) >> 4) {
797 default: /* 1 byte */
801 case 0xD: /* 2 bytes */
/* a continuation byte must have the bit pattern 10xxxxxx */
802 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
803 unsigned char high = ch1 & 0x1F;
804 unsigned char low = ch2 & 0x3F;
805 unicode_char = (high << 6) + low;
810 case 0xE: /* 2 or 3 bytes */
811 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
812 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
813 unsigned char low = ch3 & 0x3f;
814 unsigned char mid = ch2 & 0x3f;
815 unsigned char high = ch1 & 0x0f;
/* 4 + 6 + 6 payload bits give the 16-bit character */
816 unicode_char = (((high << 6) + mid) << 6) + low;
824 /* update position in utf-text */
825 *utf_ptr = (char *) (utf + len);
831 /* utf_bytes *******************************************************************
833 Determine number of bytes (aka. octets) in the utf string.
836 u............utf string
839 The number of octets of this utf string.
840 There is _no_ terminating zero included in this count.
842 *******************************************************************************/
849 /* utf_get_number_of_u2s_for_buffer ********************************************
851 Determine number of UTF-16 u2s in the given UTF-8 buffer
853 CAUTION: This function is unsafe for input that was not checked
856 CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
857 to an array of u2s (UTF-16) and want to know how many of them you will get.
858 All other uses of this function are probably wrong.
861 buffer........points to first char in buffer
862 blength.......number of _bytes_ in the buffer
865 the number of u2s needed to hold this string in UTF-16 encoding.
866 There is _no_ terminating zero included in this count.
868 NOTE: Unlike utf_get_number_of_u2s, this function never throws an exception.
871 *******************************************************************************/
873 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
875 const char *endpos; /* points behind utf string */
876 const char *utf_ptr; /* current position in utf text */
877 u4 len = 0; /* number of unicode characters */
/* (the initialization of utf_ptr is on a line not shown in this listing) */
880 endpos = utf_ptr + blength;
882 while (utf_ptr < endpos) {
884 /* next unicode character */
/* utf_nextu2 takes a char **; the cast only removes the const qualifier */
885 utf_nextu2((char **)&utf_ptr);
/* decoding must stop exactly at the end of the buffer; this is only
   verified in builds where assert() is active */
888 assert(utf_ptr == endpos);
894 /* utf_get_number_of_u2s *******************************************************
896 Determine number of UTF-16 u2s in the utf string.
898 CAUTION: This function is unsafe for input that was not checked
901 CAUTION: Use this function *only* when you want to convert a utf string
902 to an array of u2s and want to know how many of them you will get.
903 All other uses of this function are probably wrong.
906 u............utf string
909 the number of u2s needed to hold this string in UTF-16 encoding.
910 There is _no_ terminating zero included in this count.
911 XXX 0 if a NullPointerException has been thrown (see below)
913 *******************************************************************************/
915 u4 utf_get_number_of_u2s(utf *u)
917 char *endpos; /* points behind utf string */
918 char *utf_ptr; /* current position in utf text */
919 u4 len = 0; /* number of unicode characters */
921 /* XXX this is probably not checked by most callers! Review this after */
922 /* the invalid uses of this function have been eliminated */
924 exceptions_throw_nullpointerexception();
931 while (utf_ptr < endpos) {
933 /* next unicode character */
934 utf_nextu2(&utf_ptr);
/* the last character consumed bytes beyond the end of the string: report
   it through throw_cacao_exception_exit as an InternalError */
937 if (utf_ptr != endpos)
938 /* string ended abruptly */
939 throw_cacao_exception_exit(string_java_lang_InternalError,
940 "Illegal utf8 string");
946 /* utf8_safe_number_of_u2s *****************************************************
948 Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
949 (For invalid UTF-8 the U+fffd replacement character will be counted.)
951 This function is safe even for invalid UTF-8 strings.
954 text..........zero-terminated(!) UTF-8 string (may be invalid)
956 nbytes........strlen(text). (This is needed to completely emulate
960 the number of u2s needed to hold this string in UTF-16 encoding.
961 There is _no_ terminating zero included in this count.
963 *******************************************************************************/
/* Counting pass over the text; it classifies each lead byte the same way as
   utf8_safe_convert_to_u2s so that both agree on the number of u2s.  Where
   a multi-byte form is rejected as invalid before its continuation bytes
   are read, the function returns the count so far plus one. */
965 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
966 register const unsigned char *t;
969 register const unsigned char *tlimit;
980 t = (const unsigned char *) text;
983 /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
989 /* highest bit set, non-ASCII character */
991 if ((byte & 0xe0) == 0xc0) {
992 /* 2-byte: should be 110..... 10...... ? */
/* no limit check is visible for the 2-byte form; presumably the read of the
   following byte relies on the terminating zero of the text -- TODO confirm */
994 if ((*t++ & 0xc0) == 0x80)
999 else if ((byte & 0xf0) == 0xe0) {
1000 /* 3-byte: should be 1110.... 10...... 10...... */
1004 return len + 1; /* invalid, stop here */
1006 if ((*t++ & 0xc0) == 0x80) {
1007 if ((*t++ & 0xc0) == 0x80)
1008 ; /* valid 3-byte */
1015 else if ((byte & 0xf8) == 0xf0) {
1016 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1020 return len + 1; /* invalid, stop here */
1022 if (((byte1 = *t++) & 0xc0) == 0x80) {
1023 if (((byte2 = *t++) & 0xc0) == 0x80) {
1024 if (((byte3 = *t++) & 0xc0) == 0x80) {
1025 /* valid 4-byte UTF-8? */
1026 value = ((byte & 0x07) << 18)
1027 | ((byte1 & 0x3f) << 12)
1028 | ((byte2 & 0x3f) << 6)
1029 | ((byte3 & 0x3f) );
/* values above 0x10FFFF are outside the Unicode range */
1031 if (value > 0x10FFFF)
/* a value above 0xFFFF needs two u2s, hence one extra count */
1033 else if (value > 0xFFFF)
1034 len += 1; /* we need surrogates */
1036 ; /* 16bit suffice */
1047 else if ((byte & 0xfc) == 0xf8) {
1048 /* invalid 5-byte */
1050 return len + 1; /* invalid, stop here */
/* step over at most `skip` following continuation bytes (10xxxxxx) */
1053 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1056 else if ((byte & 0xfe) == 0xfc) {
1057 /* invalid 6-byte */
1059 return len + 1; /* invalid, stop here */
/* step over at most `skip` following continuation bytes (10xxxxxx) */
1062 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1074 /* ASCII character, common case */
1084 /* utf8_safe_convert_to_u2s ****************************************************
1086 Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1087 (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1088 Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1090 This function is safe even for invalid UTF-8 strings.
1093 text..........zero-terminated(!) UTF-8 string (may be invalid)
1095 nbytes........strlen(text). (This is needed to completely emulate
1097 buffer........a preallocated array of u2s to receive the decoded
1098 string. Use utf8_safe_number_of_u2s to get the
1099 required number of u2s for allocating this.
1101 *******************************************************************************/
/* U+FFFD, written to the output wherever the input is not valid UTF-8 */
1103 #define UNICODE_REPLACEMENT 0xfffd
/* Decoding pass; every branch below writes its result through `buffer` and
   advances it.  The function receives no output size, so `buffer` must have
   been sized by the caller (the header comment names
   utf8_safe_number_of_u2s for that). */
1105 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1106 register const unsigned char *t;
1108 register const unsigned char *tlimit;
1116 assert(nbytes >= 0);
1118 t = (const unsigned char *) text;
/* tlimit points just behind the last input byte */
1119 tlimit = t + nbytes;
1121 /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1127 /* highest bit set, non-ASCII character */
1129 if ((byte & 0xe0) == 0xc0) {
1130 /* 2-byte: should be 110..... 10...... */
1132 if (((byte1 = *t++) & 0xc0) == 0x80) {
1133 /* valid 2-byte UTF-8 */
1134 *buffer++ = ((byte & 0x1f) << 6)
1135 | ((byte1 & 0x3f) );
1138 *buffer++ = UNICODE_REPLACEMENT;
1142 else if ((byte & 0xf0) == 0xe0) {
1143 /* 3-byte: should be 1110.... 10...... 10...... */
/* both continuation bytes must lie inside the input */
1145 if (t + 2 > tlimit) {
1146 *buffer++ = UNICODE_REPLACEMENT;
1150 if (((byte1 = *t++) & 0xc0) == 0x80) {
1151 if (((byte2 = *t++) & 0xc0) == 0x80) {
1152 /* valid 3-byte UTF-8 */
1153 *buffer++ = ((byte & 0x0f) << 12)
1154 | ((byte1 & 0x3f) << 6)
1155 | ((byte2 & 0x3f) );
1158 *buffer++ = UNICODE_REPLACEMENT;
1163 *buffer++ = UNICODE_REPLACEMENT;
1167 else if ((byte & 0xf8) == 0xf0) {
1168 /* 4-byte: should be 11110... 10...... 10...... 10...... */
/* all three continuation bytes must lie inside the input */
1170 if (t + 3 > tlimit) {
1171 *buffer++ = UNICODE_REPLACEMENT;
1175 if (((byte1 = *t++) & 0xc0) == 0x80) {
1176 if (((byte2 = *t++) & 0xc0) == 0x80) {
1177 if (((byte3 = *t++) & 0xc0) == 0x80) {
1178 /* valid 4-byte UTF-8? */
1179 value = ((byte & 0x07) << 18)
1180 | ((byte1 & 0x3f) << 12)
1181 | ((byte2 & 0x3f) << 6)
1182 | ((byte3 & 0x3f) );
1184 if (value > 0x10FFFF) {
1185 *buffer++ = UNICODE_REPLACEMENT;
1187 else if (value > 0xFFFF) {
1188 /* we need surrogates */
/* (value >> 10) - 0x40 equals (value - 0x10000) >> 10, i.e. the upper ten
   bits of the offset into the supplementary range; the second u2 carries
   the lower ten bits of value */
1189 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1190 *buffer++ = 0xdc00 | (value & 0x03ff);
1193 *buffer++ = value; /* 16bit suffice */
1196 *buffer++ = UNICODE_REPLACEMENT;
1201 *buffer++ = UNICODE_REPLACEMENT;
1206 *buffer++ = UNICODE_REPLACEMENT;
/* lead byte 111110xx: 5-byte form, always answered with a replacement */
1210 else if ((byte & 0xfc) == 0xf8) {
1211 if (t + 4 > tlimit) {
1212 *buffer++ = UNICODE_REPLACEMENT;
/* step over at most `skip` following continuation bytes (10xxxxxx) */
1217 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1219 *buffer++ = UNICODE_REPLACEMENT;
/* lead byte 1111110x: 6-byte form, always answered with a replacement */
1221 else if ((byte & 0xfe) == 0xfc) {
1222 if (t + 5 > tlimit) {
1223 *buffer++ = UNICODE_REPLACEMENT;
/* step over at most `skip` following continuation bytes (10xxxxxx) */
1228 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1230 *buffer++ = UNICODE_REPLACEMENT;
1233 *buffer++ = UNICODE_REPLACEMENT;
1241 /* ASCII character, common case */
1249 /* u2_utflength ****************************************************************
1251 Returns the utf length in bytes of a u2 array.
1253 *******************************************************************************/
/* text........first u2 of the array
   u2_length...number of u2s to measure */
1255 u4 u2_utflength(u2 *text, u4 u2_length)
1257 u4 result_len = 0; /* utf length in bytes */
1258 u2 ch; /* current unicode character */
1261 for (len = 0; len < u2_length; len++) {
1262 /* next unicode character */
1265 /* determine bytes required to store unicode character as utf */
/* zero is excluded from the first size class, matching the
   `(c != 0) && (c < 0x80)` test in utf_new_u2 */
1266 if (ch && (ch < 0x80))
1268 else if (ch < 0x800)
1278 /* utf_copy ********************************************************************
1280 Copy the given utf string byte-for-byte to a buffer.
1283 buffer.......the buffer
1284 u............the utf string
1286 *******************************************************************************/
/* The copy covers u->blength + 1 chars, i.e. the text plus its terminating
   zero, so `buffer` must provide at least that much room.  (MCOPY is a
   project macro; presumably a typed memory copy -- TODO confirm.) */
1288 void utf_copy(char *buffer, utf *u)
1290 /* our utf strings are zero-terminated (done by utf_new) */
1291 MCOPY(buffer, u->text, char, u->blength + 1);
1295 /* utf_cat *********************************************************************
1297 Append the given utf string byte-for-byte to a buffer.
1300 buffer.......the buffer
1301 u............the utf string
1303 *******************************************************************************/
/* `buffer` must already hold a zero-terminated string (its strlen() gives the
   append position) and must have room for u->blength + 1 further chars. */
1305 void utf_cat(char *buffer, utf *u)
1307 /* our utf strings are zero-terminated (done by utf_new) */
1308 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1312 /* utf_copy_classname **********************************************************
1314 Copy the given utf classname byte-for-byte to a buffer.
1315 '/' is replaced by '.'
1318 buffer.......the buffer
1319 u............the utf string
1321 *******************************************************************************/
1323 void utf_copy_classname(char *buffer, utf *u)
/* the end pointer is placed one past UTF_END(u), presumably so that the loop
   below also handles the terminating zero -- TODO confirm */
1332 endptr = UTF_END(u) + 1; /* utfs are zero-terminated by utf_new */
1334 while (srcptr != endptr) {
1343 /* utf_cat_classname ***********************************************************
1345 Append the given utf classname byte-for-byte to a buffer.
1346 '/' is replaced by '.'
1349 buffer.......the buffer
1350 u............the utf string
1352 *******************************************************************************/
/* `buffer` must already hold a zero-terminated string; the classname copy
   starts at its current end (buffer + strlen(buffer)). */
1354 void utf_cat_classname(char *buffer, utf *u)
1356 utf_copy_classname(buffer + strlen(buffer), u);
1359 /* utf_display_printable_ascii *************************************************
1361 Write utf symbol to stdout (for debugging purposes).
1362 Non-printable and non-ASCII characters are printed as '?'.
1364 *******************************************************************************/
1366 void utf_display_printable_ascii(utf *u)
1368 char *endpos; /* points behind utf string */
1369 char *utf_ptr; /* current position in utf text */
1377 endpos = UTF_END(u);
1380 while (utf_ptr < endpos) {
1381 /* read next unicode character */
1383 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the upper bound lets 127 (DEL, a control character) pass as
   printable; `c < 127` may have been intended -- confirm */
1385 if ((c >= 32) && (c <= 127))
1395 /* utf_display_printable_ascii_classname ***************************************
1397 Write utf symbol to stdout with `/' converted to `.' (for debugging purposes).
1399 Non-printable and non-ASCII characters are printed as '?'.
1401 *******************************************************************************/
1403 void utf_display_printable_ascii_classname(utf *u)
1405 char *endpos; /* points behind utf string */
1406 char *utf_ptr; /* current position in utf text */
1414 endpos = UTF_END(u);
1417 while (utf_ptr < endpos) {
1418 /* read next unicode character */
1420 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the upper bound lets 127 (DEL, a control character) pass as
   printable; `c < 127` may have been intended -- confirm */
1425 if ((c >= 32) && (c <= 127))
1435 /* utf_sprint_convert_to_latin1 ************************************************
1437 Write utf symbol into c-string (for debugging purposes).
1438 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1441 *******************************************************************************/
/* NOTE(review): no buffer size is passed in; nothing visible here bounds the
   number of bytes written to buffer. */
1443 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1445 char *endpos; /* points behind utf string */
1446 char *utf_ptr; /* current position in utf text */
/* NOTE(review): assuming u2 is a 16-bit type, this index wraps for outputs
   longer than 65535 characters. */
1447 u2 pos = 0; /* position in c-string */
/* placeholder text; presumably used when u is NULL (the guarding condition is
   not visible in this excerpt) */
1450 strcpy(buffer, "NULL");
1454 endpos = UTF_END(u);
1457 while (utf_ptr < endpos)
1458 /* copy next unicode character */
/* the u2 result is narrowed to char by the assignment; no explicit range
   check on the character value is visible here */
1459 buffer[pos++] = utf_nextu2(&utf_ptr);
1461 /* terminate string */
1466 /* utf_sprint_convert_to_latin1_classname **************************************
1468 Write utf symbol into c-string with `/' converted to `.' (for debugging
1470 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1473 *******************************************************************************/
/* NOTE(review): no buffer size is passed in; nothing visible here bounds the
   number of bytes written to buffer. */
1475 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1477 char *endpos; /* points behind utf string */
1478 char *utf_ptr; /* current position in utf text */
/* NOTE(review): assuming u2 is a 16-bit type, this index wraps for outputs
   longer than 65535 characters. */
1479 u2 pos = 0; /* position in c-string */
/* placeholder text; presumably used when u is NULL (the guarding condition is
   not visible in this excerpt) */
1482 strcpy(buffer, "NULL");
1486 endpos = UTF_END(u);
1489 while (utf_ptr < endpos) {
1490 /* copy next unicode character */
1491 u2 c = utf_nextu2(&utf_ptr);
/* show the '/' separator as '.', as stated in the header comment */
1492 if (c == '/') c = '.';
1496 /* terminate string */
1501 /* utf_strcat_convert_to_latin1 ************************************************
1503 Like libc strcat, but uses an utf8 string.
1504 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1507 *******************************************************************************/
/* Appends by delegating to utf_sprint_convert_to_latin1. buffer must already
   hold a NUL-terminated string, because strlen() is used to find where to
   start. */
1509 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
/* start writing at the current terminator of buffer */
1511 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1515 /* utf_strcat_convert_to_latin1_classname **************************************
1517 Like libc strcat, but uses an utf8 string.
1518 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1521 *******************************************************************************/
/* Appends by delegating to utf_sprint_convert_to_latin1_classname. buffer
   must already hold a NUL-terminated string, because strlen() is used to find
   where to start. */
1523 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
/* start writing at the current terminator of buffer */
1525 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1529 /* utf_fprint_printable_ascii **************************************************
1531 Write utf symbol into file.
1532 Non-printable and non-ASCII characters are printed as '?'.
1534 *******************************************************************************/
1536 void utf_fprint_printable_ascii(FILE *file, utf *u)
1538 char *endpos; /* points behind utf string */
1539 char *utf_ptr; /* current position in utf text */
1544 endpos = UTF_END(u);
/* visit the text one character at a time until the end is reached */
1547 while (utf_ptr < endpos) {
1548 /* read next unicode character */
1549 u2 c = utf_nextu2(&utf_ptr);
/* characters in 32..127 are written as-is, everything else as '?'.
   NOTE(review): 127 is DEL (0x7f), a control character -- confirm whether
   126 was the intended upper bound. */
1551 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1552 else fprintf(file, "?");
1557 /* utf_fprint_printable_ascii_classname ****************************************
1559 Write utf symbol into file with `/' converted to `.'.
1560 Non-printable and non-ASCII characters are printed as '?'.
1562 *******************************************************************************/
1564 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1566 char *endpos; /* points behind utf string */
1567 char *utf_ptr; /* current position in utf text */
1572 endpos = UTF_END(u);
/* visit the text one character at a time until the end is reached */
1575 while (utf_ptr < endpos) {
1576 /* read next unicode character */
1577 u2 c = utf_nextu2(&utf_ptr);
/* show the '/' separator as '.' before the printable test below */
1578 if (c == '/') c = '.';
/* characters in 32..127 are written as-is, everything else as '?'.
   NOTE(review): 127 is DEL (0x7f), a control character -- confirm whether
   126 was the intended upper bound. */
1580 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1581 else fprintf(file, "?");
1586 /* is_valid_utf ****************************************************************
1588 Return true if the given string is a valid UTF-8 string.
1590 utf_ptr...points to first character
1591 end_pos...points after last character
1593 *******************************************************************************/
1595 /* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */
1597 bool is_valid_utf(char *utf_ptr, char *end_pos)
/* an inverted range is rejected outright */
1604 if (end_pos < utf_ptr) return false;
/* number of bytes still available for the checks below */
1605 bytes = end_pos - utf_ptr;
1609 if (!c) return false; /* 0x00 is not allowed */
1610 if ((c & 0x80) == 0) continue; /* ASCII */
/* len = number of continuation bytes announced by the lead byte */
1612 if ((c & 0xe0) == 0xc0) len = 1; /* 110x xxxx */
1613 else if ((c & 0xf0) == 0xe0) len = 2; /* 1110 xxxx */
1614 else if ((c & 0xf8) == 0xf0) len = 3; /* 1111 0xxx */
1615 else if ((c & 0xfc) == 0xf8) len = 4; /* 1111 10xx */
1616 else if ((c & 0xfe) == 0xfc) len = 5; /* 1111 110x */
1617 else return false; /* invalid leading byte */
/* lead bytes announcing more than two continuation bytes are recognised
   above only so that they can be rejected here */
1619 if (len > 2) return false; /* Java limitation */
/* keep only the payload bits of the lead byte: 0x3f >> len is 0x1f for
   len == 1 and 0x0f for len == 2 */
1621 v = (unsigned long)c & (0x3f >> len);
/* NOTE(review): this test only works if bytes has a signed type; its
   declaration is not visible in this excerpt -- confirm. */
1623 if ((bytes -= len) < 0) return false; /* missing bytes */
/* runs len times; each continuation byte contributes six payload bits */
1625 for (i = len; i--; ) {
1627 if ((c & 0xc0) != 0x80) /* 10xx xxxx */
1629 v = (v << 6) | (c & 0x3f);
/* NOTE(review): the condition guarding this line is not visible here;
   presumably it applies to a decoded value of zero, which would then only be
   accepted in its two-byte form -- confirm. */
1633 if (len != 1) return false; /* Java special */
1636 /* Sun Java seems to allow overlong UTF-8 encodings */
1638 /* if (v < min_codepoint[len]) */
1639 /* XXX throw exception? */
1642 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
1643 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
1645 /* even these seem to be allowed */
1646 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
1653 /* is_valid_name ***************************************************************
1655 Return true if the given string may be used as a class/field/method
1656 name. (Currently this only disallows empty strings and control
1659 NOTE: The string is assumed to have passed is_valid_utf!
1661 utf_ptr...points to first character
1662 end_pos...points after last character
1664 *******************************************************************************/
1666 bool is_valid_name(char *utf_ptr, char *end_pos)
1668 if (end_pos <= utf_ptr) return false; /* disallow empty names */
1670 while (utf_ptr < end_pos) {
/* read as unsigned char so that bytes >= 0x80 do not compare as negative in
   the range test below */
1671 unsigned char c = *utf_ptr++;
1673 if (c < 0x20) return false; /* disallow control characters */
/* the byte pair 0xc0 0x80 is the two-byte encoding of the zero character.
   The second byte is peeked at without checking against end_pos, which
   presumably relies on the input having already been validated (see the NOTE
   in the header comment). */
1674 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80) /* disallow zero */
/* Convenience wrapper around is_valid_name for a utf symbol: checks the bytes
   from u->text up to UTF_END(u). */
1681 bool is_valid_name_utf(utf *u)
1683 return is_valid_name(u->text, UTF_END(u));
1687 /* utf_show ********************************************************************
1689 Writes the utf symbols in the utfhash to stdout and displays the
1690 number of external hash chains grouped according to the chainlength
1691 (for debugging purposes).
1693 *******************************************************************************/
1695 #if !defined(NDEBUG)
/* chain lengths 1..CHAIN_LIMIT-2 are reported individually; longer chains
   share the last bucket, chain_count[CHAIN_LIMIT-1] */
1699 #define CHAIN_LIMIT 20 /* limit for separated enumeration */
1701 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1702 u4 max_chainlength = 0; /* maximum length of the chains */
1703 u4 sum_chainlength = 0; /* sum of the chainlengths */
1704 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1707 printf("UTF-HASH:\n");
1709 /* show element of utf-hashtable */
1711 for (i = 0; i < hashtable_utf->size; i++) {
1712 utf *u = hashtable_utf->ptr[i];
1715 printf("SLOT %d: ", (int) i);
1719 utf_display_printable_ascii(u);
1727 printf("UTF-HASH: %d slots for %d entries\n",
1728 (int) hashtable_utf->size, (int) hashtable_utf->entries );
/* with no entries there are no chains to report; this also keeps the
   divisions by hashtable_utf->entries below away from zero (presumably an
   early return follows -- the statement is not visible in this excerpt) */
1730 if (hashtable_utf->entries == 0)
1733 printf("chains:\n chainlength number of chains %% of utfstrings\n");
1735 for (i=0;i<CHAIN_LIMIT;i++)
1738 /* count numbers of hashchains according to their length */
1739 for (i=0; i<hashtable_utf->size; i++) {
1741 utf *u = (utf*) hashtable_utf->ptr[i];
1742 u4 chain_length = 0;
1744 /* determine chainlength */
1750 /* update sum of all chainlengths */
1751 sum_chainlength+=chain_length;
1753 /* determine the maximum length of the chains */
1754 if (chain_length>max_chainlength)
1755 max_chainlength = chain_length;
1757 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
/* NOTE(review): the comment above says >=CHAIN_LIMIT-1 but the test is
   >=CHAIN_LIMIT. A chain of exactly CHAIN_LIMIT-1 symbols is counted in
   chain_count[CHAIN_LIMIT-1] below without being added to beyond_limit, so
   the percentage printed for the ">=" row leaves those symbols out. Likely
   the test should read chain_length>=CHAIN_LIMIT-1 -- confirm. */
1758 if (chain_length>=CHAIN_LIMIT) {
1759 beyond_limit+=chain_length;
1760 chain_length=CHAIN_LIMIT-1;
1763 /* update number of hashchains of current length */
1764 chain_count[chain_length]++;
1767 /* display results */
/* NOTE(review): i and chain_count[] are u4 but are printed with %d here and
   below; this assumes u4 has the size of int -- confirm, or cast. */
1768 for (i=1;i<CHAIN_LIMIT-1;i++)
1769 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1771 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1774 printf("max. chainlength:%5d\n",max_chainlength);
1776 /* avg. chainlength = sum of chainlengths / number of chains */
/* chain_count[0] counts the empty slots, so the divisor is the number of
   non-empty slots */
1777 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1779 #endif /* !defined(NDEBUG) */
1783 * These are local overrides for various environment variables in Emacs.
1784 * Please do not remove this and leave it at the end of the file, where
1785 * Emacs will automagically detect them.
1786 * ---------------------------------------------------------------------
1789 * indent-tabs-mode: t
1793 * vim:noexpandtab:sw=4:ts=4: