1 /* src/vm/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4 C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5 E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6 J. Wenninger, Institut f. Computersprachen - TU Wien
8 This file is part of CACAO.
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License as
12 published by the Free Software Foundation; either version 2, or (at
13 your option) any later version.
15 This program is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25 Contact: cacao@cacaojvm.org
27 Authors: Reinhard Grafl
33 $Id: utf8.c 6286 2007-01-10 10:03:38Z twisti $
45 #include "mm/memory.h"
47 #if defined(ENABLE_THREADS)
48 # include "threads/native/lock.h"
50 # include "threads/none/lock.h"
53 #include "vm/builtin.h"
54 #include "vm/exceptions.h"
55 #include "vm/hashtable.h"
56 #include "vm/options.h"
57 #include "vm/statistics.h"
58 #include "vm/stringlocal.h"
62 /* global variables ***********************************************************/
64 /* hashsize must be power of 2 */
66 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
68 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
71 /* utf-symbols for pointer comparison of frequently used strings **************/
73 utf *utf_java_lang_Object;
75 utf *utf_java_lang_Class;
76 utf *utf_java_lang_ClassLoader;
77 utf *utf_java_lang_Cloneable;
78 utf *utf_java_lang_SecurityManager;
79 utf *utf_java_lang_String;
80 utf *utf_java_lang_System;
81 utf *utf_java_lang_ThreadGroup;
82 utf *utf_java_io_Serializable;
84 utf *utf_java_lang_Throwable;
85 utf *utf_java_lang_Error;
86 utf *utf_java_lang_LinkageError;
87 utf *utf_java_lang_NoClassDefFoundError;
88 utf *utf_java_lang_OutOfMemoryError;
89 utf *utf_java_lang_VirtualMachineError;
91 #if defined(ENABLE_JAVASE)
92 utf *utf_java_lang_AbstractMethodError;
93 utf *utf_java_lang_NoSuchMethodError;
96 #if defined(WITH_CLASSPATH_GNU)
97 utf *utf_java_lang_VMThrowable;
100 utf *utf_java_lang_Exception;
101 utf *utf_java_lang_ClassCastException;
102 utf *utf_java_lang_ClassNotFoundException;
103 utf *utf_java_lang_IllegalArgumentException;
104 utf *utf_java_lang_IllegalMonitorStateException;
106 utf *utf_java_lang_NullPointerException;
108 #if defined(ENABLE_JAVASE)
109 utf* utf_java_lang_Void;
112 utf* utf_java_lang_Boolean;
113 utf* utf_java_lang_Byte;
114 utf* utf_java_lang_Character;
115 utf* utf_java_lang_Short;
116 utf* utf_java_lang_Integer;
117 utf* utf_java_lang_Long;
118 utf* utf_java_lang_Float;
119 utf* utf_java_lang_Double;
121 #if defined(ENABLE_JAVASE)
122 utf *utf_java_lang_StackTraceElement;
123 utf *utf_java_lang_reflect_Constructor;
124 utf *utf_java_lang_reflect_Field;
125 utf *utf_java_lang_reflect_Method;
126 utf *utf_java_util_Vector;
129 utf *utf_InnerClasses; /* InnerClasses */
130 utf *utf_ConstantValue; /* ConstantValue */
131 utf *utf_Code; /* Code */
132 utf *utf_Exceptions; /* Exceptions */
133 utf *utf_LineNumberTable; /* LineNumberTable */
134 utf *utf_SourceFile; /* SourceFile */
136 #if defined(ENABLE_JAVASE)
137 utf *utf_EnclosingMethod;
139 utf *utf_RuntimeVisibleAnnotations;
140 utf *utf_StackMapTable;
143 utf *utf_init; /* <init> */
144 utf *utf_clinit; /* <clinit> */
145 utf *utf_clone; /* clone */
146 utf *utf_finalize; /* finalize */
147 utf *utf_run; /* run */
152 utf *utf_removeThread;
157 utf *utf_fillInStackTrace;
158 utf *utf_getSystemClassLoader;
160 utf *utf_printStackTrace;
171 utf *utf_void__void; /* ()V */
172 utf *utf_boolean__void; /* (Z)V */
173 utf *utf_byte__void; /* (B)V */
174 utf *utf_char__void; /* (C)V */
175 utf *utf_short__void; /* (S)V */
176 utf *utf_int__void; /* (I)V */
177 utf *utf_long__void; /* (J)V */
178 utf *utf_float__void; /* (F)V */
179 utf *utf_double__void; /* (D)V */
181 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
182 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
183 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
184 utf *utf_java_lang_Object__java_lang_Object;
185 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
186 utf *utf_java_lang_String__java_lang_Class;
187 utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */
188 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
190 utf *utf_not_named_yet; /* special name for unnamed classes */
192 utf *array_packagename;
195 /* utf_init ********************************************************************
197 Initializes the utf8 subsystem.
199 *******************************************************************************/
203 /* create utf8 hashtable */
205 hashtable_utf = NEW(hashtable);
207 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
209 #if defined(ENABLE_STATISTICS)
211 count_utf_len += sizeof(utf*) * hashtable_utf->size;
214 /* create utf-symbols for pointer comparison of frequently used strings */
216 utf_java_lang_Object = utf_new_char("java/lang/Object");
218 utf_java_lang_Class = utf_new_char("java/lang/Class");
219 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
220 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
221 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
222 utf_java_lang_String = utf_new_char("java/lang/String");
223 utf_java_lang_System = utf_new_char("java/lang/System");
224 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
225 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
227 utf_java_lang_Throwable = utf_new_char(string_java_lang_Throwable);
228 utf_java_lang_Error = utf_new_char(string_java_lang_Error);
230 utf_java_lang_LinkageError =
231 utf_new_char(string_java_lang_LinkageError);
233 utf_java_lang_NoClassDefFoundError =
234 utf_new_char(string_java_lang_NoClassDefFoundError);
236 utf_java_lang_OutOfMemoryError =
237 utf_new_char(string_java_lang_OutOfMemoryError);
239 utf_java_lang_VirtualMachineError =
240 utf_new_char(string_java_lang_VirtualMachineError);
242 #if defined(ENABLE_JAVASE)
243 utf_java_lang_AbstractMethodError =
244 utf_new_char(string_java_lang_AbstractMethodError);
246 utf_java_lang_NoSuchMethodError =
247 utf_new_char(string_java_lang_NoSuchMethodError);
250 #if defined(WITH_CLASSPATH_GNU)
251 utf_java_lang_VMThrowable = utf_new_char(string_java_lang_VMThrowable);
254 utf_java_lang_Exception = utf_new_char(string_java_lang_Exception);
256 utf_java_lang_ClassCastException =
257 utf_new_char(string_java_lang_ClassCastException);
259 utf_java_lang_ClassNotFoundException =
260 utf_new_char(string_java_lang_ClassNotFoundException);
262 utf_java_lang_IllegalArgumentException =
263 utf_new_char(string_java_lang_IllegalArgumentException);
265 utf_java_lang_IllegalMonitorStateException =
266 utf_new_char(string_java_lang_IllegalMonitorStateException);
268 utf_java_lang_NullPointerException =
269 utf_new_char(string_java_lang_NullPointerException);
271 #if defined(ENABLE_JAVASE)
272 utf_java_lang_Void = utf_new_char("java/lang/Void");
275 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
276 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
277 utf_java_lang_Character = utf_new_char("java/lang/Character");
278 utf_java_lang_Short = utf_new_char("java/lang/Short");
279 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
280 utf_java_lang_Long = utf_new_char("java/lang/Long");
281 utf_java_lang_Float = utf_new_char("java/lang/Float");
282 utf_java_lang_Double = utf_new_char("java/lang/Double");
284 #if defined(ENABLE_JAVASE)
285 utf_java_lang_StackTraceElement =
286 utf_new_char("java/lang/StackTraceElement");
288 utf_java_lang_reflect_Constructor =
289 utf_new_char("java/lang/reflect/Constructor");
291 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
292 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
293 utf_java_util_Vector = utf_new_char("java/util/Vector");
296 utf_InnerClasses = utf_new_char("InnerClasses");
297 utf_ConstantValue = utf_new_char("ConstantValue");
298 utf_Code = utf_new_char("Code");
299 utf_Exceptions = utf_new_char("Exceptions");
300 utf_LineNumberTable = utf_new_char("LineNumberTable");
301 utf_SourceFile = utf_new_char("SourceFile");
303 #if defined(ENABLE_JAVASE)
304 utf_EnclosingMethod = utf_new_char("EnclosingMethod");
305 utf_Signature = utf_new_char("Signature");
306 utf_RuntimeVisibleAnnotations = utf_new_char("RuntimeVisibleAnnotations");
307 utf_StackMapTable = utf_new_char("StackMapTable");
310 utf_init = utf_new_char("<init>");
311 utf_clinit = utf_new_char("<clinit>");
312 utf_clone = utf_new_char("clone");
313 utf_finalize = utf_new_char("finalize");
314 utf_run = utf_new_char("run");
316 utf_add = utf_new_char("add");
317 utf_remove = utf_new_char("remove");
318 utf_addThread = utf_new_char("addThread");
319 utf_removeThread = utf_new_char("removeThread");
320 utf_put = utf_new_char("put");
321 utf_get = utf_new_char("get");
322 utf_value = utf_new_char("value");
324 utf_printStackTrace = utf_new_char("printStackTrace");
325 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
326 utf_loadClass = utf_new_char("loadClass");
327 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
329 utf_Z = utf_new_char("Z");
330 utf_B = utf_new_char("B");
331 utf_C = utf_new_char("C");
332 utf_S = utf_new_char("S");
333 utf_I = utf_new_char("I");
334 utf_J = utf_new_char("J");
335 utf_F = utf_new_char("F");
336 utf_D = utf_new_char("D");
338 utf_void__void = utf_new_char("()V");
339 utf_boolean__void = utf_new_char("(Z)V");
340 utf_byte__void = utf_new_char("(B)V");
341 utf_char__void = utf_new_char("(C)V");
342 utf_short__void = utf_new_char("(S)V");
343 utf_int__void = utf_new_char("(I)V");
344 utf_long__void = utf_new_char("(J)V");
345 utf_float__void = utf_new_char("(F)V");
346 utf_double__void = utf_new_char("(D)V");
347 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
348 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
350 utf_void__java_lang_ClassLoader =
351 utf_new_char("()Ljava/lang/ClassLoader;");
353 utf_java_lang_Object__java_lang_Object =
354 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
356 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
358 utf_java_lang_String__java_lang_Class =
359 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
361 utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V");
362 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
364 utf_null = utf_new_char("null");
365 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
366 array_packagename = utf_new_char("\t<the array package>");
368 /* everything's ok */
374 /* utf_hashkey *****************************************************************
376 The hashkey is computed from the utf-text by using up to 8
377 characters. For utf-symbols longer than 15 characters 3 characters
378 are taken from the beginning and the end, 2 characters are taken from the middle.
381 *******************************************************************************/
/* Byte-fetch helpers used when building a hashkey.

   Both macros read through a pointer variable named `text` that must be in
   scope where they are expanded:

     fbs(val)  reads the byte `text` currently points at,
     nbs(val)  first advances `text` by one and then reads that byte.

   The byte is converted to u4 and shifted left by `val`. The shift amount is
   parenthesised so that an argument containing an operator of lower
   precedence than `<<` (e.g. `a | b` or `c ? 1 : 2`) is applied as a whole.
   Because nbs modifies `text`, the order in which the macros appear in an
   expression determines which byte each one sees. */

#define nbs(val) ((u4) *(++text) << (val)) /* get next byte, left shift by val */
#define fbs(val) ((u4) *(  text) << (val)) /* get first byte, left shift by val */
386 u4 utf_hashkey(const char *text, u4 length)
388 const char *start_pos = text; /* pointer to utf text */
392 case 0: /* empty string */
395 case 1: return fbs(0);
396 case 2: return fbs(0) ^ nbs(3);
397 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
398 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
399 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
400 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
401 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
402 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
409 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
418 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
427 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
439 return a ^ nbs(9) ^ nbs(10);
451 return a ^ nbs(9) ^ nbs(10);
462 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
473 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
475 default: /* 3 characters from beginning */
481 /* 2 characters from middle */
482 text = start_pos + (length / 2);
487 /* 3 characters from end */
488 text = start_pos + length - 4;
493 return a ^ nbs(10) ^ nbs(11);
497 /* utf_full_hashkey ************************************************************
499 This function computes a hash value using all bytes in the string.
501 The algorithm is the "One-at-a-time" algorithm as published
502 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
504 *******************************************************************************/
506 u4 utf_full_hashkey(const char *text, u4 length)
508 register const unsigned char *p = (const unsigned char *) text;
516 hash += (hash << 10);
520 hash ^= (hash >> 11);
521 hash += (hash << 15);
526 /* unicode_hashkey *************************************************************
528 Compute the hashkey of a unicode string.
530 *******************************************************************************/
532 u4 unicode_hashkey(u2 *text, u2 len)
534 return utf_hashkey((char *) text, len);
538 /* utf_new *********************************************************************
540 Creates a new utf-symbol, the text of the symbol is passed as a
541 u1-array. The function searches the utf-hashtable for a utf-symbol
542 with this text. On success the element is returned, otherwise a new
543 hashtable element is created.
545 If the number of entries in the hashtable exceeds twice the size of
546 the hashtable slots a reorganization of the hashtable is done and
547 the utf symbols are copied to a new hashtable with doubled size.
549 *******************************************************************************/
551 utf *utf_new(const char *text, u2 length)
553 u4 key; /* hashkey computed from utf-text */
554 u4 slot; /* slot in hashtable */
555 utf *u; /* hashtable element */
558 LOCK_MONITOR_ENTER(hashtable_utf->header);
560 #if defined(ENABLE_STATISTICS)
565 key = utf_hashkey(text, length);
566 slot = key & (hashtable_utf->size - 1);
567 u = hashtable_utf->ptr[slot];
569 /* search external hash chain for utf-symbol */
572 if (u->blength == length) {
573 /* compare text of hashtable elements */
575 for (i = 0; i < length; i++)
576 if (text[i] != u->text[i])
579 #if defined(ENABLE_STATISTICS)
581 count_utf_new_found++;
584 /* symbol found in hashtable */
586 LOCK_MONITOR_EXIT(hashtable_utf->header);
592 u = u->hashlink; /* next element in external chain */
595 #if defined(ENABLE_STATISTICS)
597 count_utf_len += sizeof(utf) + length + 1;
600 /* location in hashtable found, create new utf element */
602 u->blength = length; /* length in bytes of utfstring */
603 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
604 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
606 memcpy(u->text, text, length); /* copy utf-text */
607 u->text[length] = '\0';
609 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
610 hashtable_utf->entries++; /* update number of entries */
612 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
614 /* reorganization of hashtable, average length of the external
615 chains is approx. 2 */
617 hashtable *newhash; /* the new hashtable */
623 /* create new hashtable, double the size */
625 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
627 #if defined(ENABLE_STATISTICS)
629 count_utf_len += sizeof(utf*) * hashtable_utf->size;
632 /* transfer elements to new hashtable */
634 for (i = 0; i < hashtable_utf->size; i++) {
635 u = hashtable_utf->ptr[i];
639 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
641 u->hashlink = (utf *) newhash->ptr[slot];
642 newhash->ptr[slot] = u;
644 /* follow link in external hash chain */
650 /* dispose old table */
652 hashtable_free(hashtable_utf);
654 hashtable_utf = newhash;
657 LOCK_MONITOR_EXIT(hashtable_utf->header);
663 /* utf_new_u2 ******************************************************************
665 Make utf symbol from u2 array, if isclassname is true '.' is
668 *******************************************************************************/
670 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
672 char *buffer; /* memory buffer for unicode characters */
673 char *pos; /* pointer to current position in buffer */
674 u4 left; /* unicode characters left */
675 u4 buflength; /* utf length in bytes of the u2 array */
676 utf *result; /* resulting utf-string */
679 /* determine utf length in bytes and allocate memory */
681 buflength = u2_utflength(unicode_pos, unicode_length);
682 buffer = MNEW(char, buflength);
687 for (i = 0; i++ < unicode_length; unicode_pos++) {
688 /* next unicode character */
691 if ((c != 0) && (c < 0x80)) {
694 if ((int) left < 0) break;
695 /* convert classname */
696 if (isclassname && c == '.')
701 } else if (c < 0x800) {
703 unsigned char high = c >> 6;
704 unsigned char low = c & 0x3F;
706 if ((int) left < 0) break;
707 *pos++ = high | 0xC0;
713 char mid = (c >> 6) & 0x3F;
716 if ((int) left < 0) break;
717 *pos++ = high | 0xE0;
723 /* insert utf-string into symbol-table */
724 result = utf_new(buffer,buflength);
726 MFREE(buffer, char, buflength);
732 /* utf_new_char ****************************************************************
734 Creates a new utf symbol, the text for this symbol is passed as a
735 c-string ( = char* ).
737 *******************************************************************************/
739 utf *utf_new_char(const char *text)
741 return utf_new(text, strlen(text));
745 /* utf_new_char_classname ******************************************************
747 Creates a new utf symbol, the text for this symbol is passed as a
748 c-string ( = char* ) "." characters are going to be replaced by
749 "/". Since the above function is used often, this is a separate
750 function, instead of an if.
752 *******************************************************************************/
754 utf *utf_new_char_classname(const char *text)
756 if (strchr(text, '.')) {
757 char *txt = strdup(text);
758 char *end = txt + strlen(txt);
762 for (c = txt; c < end; c++)
763 if (*c == '.') *c = '/';
765 tmpRes = utf_new(txt, strlen(txt));
771 return utf_new(text, strlen(text));
775 /* utf_nextu2 ******************************************************************
777 Read the next unicode character from the utf string and increment
778 the utf-string pointer accordingly.
780 CAUTION: This function is unsafe for input that was not checked
783 *******************************************************************************/
785 u2 utf_nextu2(char **utf_ptr)
787 /* uncompressed unicode character */
789 /* current position in utf text */
790 unsigned char *utf = (unsigned char *) (*utf_ptr);
791 /* bytes representing the unicode character */
792 unsigned char ch1, ch2, ch3;
793 /* number of bytes used to represent the unicode character */
796 switch ((ch1 = utf[0]) >> 4) {
797 default: /* 1 byte */
801 case 0xD: /* 2 bytes */
802 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
803 unsigned char high = ch1 & 0x1F;
804 unsigned char low = ch2 & 0x3F;
805 unicode_char = (high << 6) + low;
810 case 0xE: /* 2 or 3 bytes */
811 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
812 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
813 unsigned char low = ch3 & 0x3f;
814 unsigned char mid = ch2 & 0x3f;
815 unsigned char high = ch1 & 0x0f;
816 unicode_char = (((high << 6) + mid) << 6) + low;
824 /* update position in utf-text */
825 *utf_ptr = (char *) (utf + len);
831 /* utf_bytes *******************************************************************
833 Determine number of bytes (aka. octets) in the utf string.
836 u............utf string
839 The number of octets of this utf string.
840 There is _no_ terminating zero included in this count.
842 *******************************************************************************/
849 /* utf_get_number_of_u2s_for_buffer ********************************************
851 Determine number of UTF-16 u2s in the given UTF-8 buffer
853 CAUTION: This function is unsafe for input that was not checked
856 CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
857 to an array of u2s (UTF-16) and want to know how many of them you will get.
858 All other uses of this function are probably wrong.
861 buffer........points to first char in buffer
862 blength.......number of _bytes_ in the buffer
865 the number of u2s needed to hold this string in UTF-16 encoding.
866 There is _no_ terminating zero included in this count.
868 NOTE: Unlike utf_get_number_of_u2s, this function never throws an
871 *******************************************************************************/
873 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
875 const char *endpos; /* points behind utf string */
876 const char *utf_ptr; /* current position in utf text */
877 u4 len = 0; /* number of unicode characters */
880 endpos = utf_ptr + blength;
882 while (utf_ptr < endpos) {
884 /* next unicode character */
885 utf_nextu2((char **)&utf_ptr);
888 assert(utf_ptr == endpos);
894 /* utf_get_number_of_u2s *******************************************************
896 Determine number of UTF-16 u2s in the utf string.
898 CAUTION: This function is unsafe for input that was not checked
901 CAUTION: Use this function *only* when you want to convert a utf string
902 to an array of u2s and want to know how many of them you will get.
903 All other uses of this function are probably wrong.
906 u............utf string
909 the number of u2s needed to hold this string in UTF-16 encoding.
910 There is _no_ terminating zero included in this count.
911 XXX 0 if a NullPointerException has been thrown (see below)
913 *******************************************************************************/
915 u4 utf_get_number_of_u2s(utf *u)
917 char *endpos; /* points behind utf string */
918 char *utf_ptr; /* current position in utf text */
919 u4 len = 0; /* number of unicode characters */
921 /* XXX this is probably not checked by most callers! Review this after */
922 /* the invalid uses of this function have been eliminated */
924 exceptions_throw_nullpointerexception();
931 while (utf_ptr < endpos) {
933 /* next unicode character */
934 utf_nextu2(&utf_ptr);
937 if (utf_ptr != endpos) {
938 /* string ended abruptly */
939 exceptions_throw_internalerror("Illegal utf8 string");
947 /* utf8_safe_number_of_u2s *****************************************************
949 Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
950 (For invalid UTF-8 the U+fffd replacement character will be counted.)
952 This function is safe even for invalid UTF-8 strings.
955 text..........zero-terminated(!) UTF-8 string (may be invalid)
957 nbytes........strlen(text). (This is needed to completely emulate
961 the number of u2s needed to hold this string in UTF-16 encoding.
962 There is _no_ terminating zero included in this count.
964 *******************************************************************************/
966 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
967 register const unsigned char *t;
970 register const unsigned char *tlimit;
981 t = (const unsigned char *) text;
984 /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
990 /* highest bit set, non-ASCII character */
992 if ((byte & 0xe0) == 0xc0) {
993 /* 2-byte: should be 110..... 10...... ? */
995 if ((*t++ & 0xc0) == 0x80)
1000 else if ((byte & 0xf0) == 0xe0) {
1001 /* 3-byte: should be 1110.... 10...... 10...... */
1005 return len + 1; /* invalid, stop here */
1007 if ((*t++ & 0xc0) == 0x80) {
1008 if ((*t++ & 0xc0) == 0x80)
1009 ; /* valid 3-byte */
1016 else if ((byte & 0xf8) == 0xf0) {
1017 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1021 return len + 1; /* invalid, stop here */
1023 if (((byte1 = *t++) & 0xc0) == 0x80) {
1024 if (((byte2 = *t++) & 0xc0) == 0x80) {
1025 if (((byte3 = *t++) & 0xc0) == 0x80) {
1026 /* valid 4-byte UTF-8? */
1027 value = ((byte & 0x07) << 18)
1028 | ((byte1 & 0x3f) << 12)
1029 | ((byte2 & 0x3f) << 6)
1030 | ((byte3 & 0x3f) );
1032 if (value > 0x10FFFF)
1034 else if (value > 0xFFFF)
1035 len += 1; /* we need surrogates */
1037 ; /* 16bit suffice */
1048 else if ((byte & 0xfc) == 0xf8) {
1049 /* invalid 5-byte */
1051 return len + 1; /* invalid, stop here */
1054 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1057 else if ((byte & 0xfe) == 0xfc) {
1058 /* invalid 6-byte */
1060 return len + 1; /* invalid, stop here */
1063 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1075 /* ASCII character, common case */
1085 /* utf8_safe_convert_to_u2s ****************************************************
1087 Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1088 (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1089 Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1091 This function is safe even for invalid UTF-8 strings.
1094 text..........zero-terminated(!) UTF-8 string (may be invalid)
1096 nbytes........strlen(text). (This is needed to completely emulate
1098 buffer........a preallocated array of u2s to receive the decoded
1099 string. Use utf8_safe_number_of_u2s to get the
1100 required number of u2s for allocating this.
1102 *******************************************************************************/
1104 #define UNICODE_REPLACEMENT 0xfffd
1106 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1107 register const unsigned char *t;
1109 register const unsigned char *tlimit;
1117 assert(nbytes >= 0);
1119 t = (const unsigned char *) text;
1120 tlimit = t + nbytes;
1122 /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1128 /* highest bit set, non-ASCII character */
1130 if ((byte & 0xe0) == 0xc0) {
1131 /* 2-byte: should be 110..... 10...... */
1133 if (((byte1 = *t++) & 0xc0) == 0x80) {
1134 /* valid 2-byte UTF-8 */
1135 *buffer++ = ((byte & 0x1f) << 6)
1136 | ((byte1 & 0x3f) );
1139 *buffer++ = UNICODE_REPLACEMENT;
1143 else if ((byte & 0xf0) == 0xe0) {
1144 /* 3-byte: should be 1110.... 10...... 10...... */
1146 if (t + 2 > tlimit) {
1147 *buffer++ = UNICODE_REPLACEMENT;
1151 if (((byte1 = *t++) & 0xc0) == 0x80) {
1152 if (((byte2 = *t++) & 0xc0) == 0x80) {
1153 /* valid 3-byte UTF-8 */
1154 *buffer++ = ((byte & 0x0f) << 12)
1155 | ((byte1 & 0x3f) << 6)
1156 | ((byte2 & 0x3f) );
1159 *buffer++ = UNICODE_REPLACEMENT;
1164 *buffer++ = UNICODE_REPLACEMENT;
1168 else if ((byte & 0xf8) == 0xf0) {
1169 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1171 if (t + 3 > tlimit) {
1172 *buffer++ = UNICODE_REPLACEMENT;
1176 if (((byte1 = *t++) & 0xc0) == 0x80) {
1177 if (((byte2 = *t++) & 0xc0) == 0x80) {
1178 if (((byte3 = *t++) & 0xc0) == 0x80) {
1179 /* valid 4-byte UTF-8? */
1180 value = ((byte & 0x07) << 18)
1181 | ((byte1 & 0x3f) << 12)
1182 | ((byte2 & 0x3f) << 6)
1183 | ((byte3 & 0x3f) );
1185 if (value > 0x10FFFF) {
1186 *buffer++ = UNICODE_REPLACEMENT;
1188 else if (value > 0xFFFF) {
1189 /* we need surrogates */
1190 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1191 *buffer++ = 0xdc00 | (value & 0x03ff);
1194 *buffer++ = value; /* 16bit suffice */
1197 *buffer++ = UNICODE_REPLACEMENT;
1202 *buffer++ = UNICODE_REPLACEMENT;
1207 *buffer++ = UNICODE_REPLACEMENT;
1211 else if ((byte & 0xfc) == 0xf8) {
1212 if (t + 4 > tlimit) {
1213 *buffer++ = UNICODE_REPLACEMENT;
1218 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1220 *buffer++ = UNICODE_REPLACEMENT;
1222 else if ((byte & 0xfe) == 0xfc) {
1223 if (t + 5 > tlimit) {
1224 *buffer++ = UNICODE_REPLACEMENT;
1229 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1231 *buffer++ = UNICODE_REPLACEMENT;
1234 *buffer++ = UNICODE_REPLACEMENT;
1242 /* ASCII character, common case */
1250 /* u2_utflength ****************************************************************
1252 Returns the utf length in bytes of a u2 array.
1254 *******************************************************************************/
1256 u4 u2_utflength(u2 *text, u4 u2_length)
1258 u4 result_len = 0; /* utf length in bytes */
1259 u2 ch; /* current unicode character */
1262 for (len = 0; len < u2_length; len++) {
1263 /* next unicode character */
1266 /* determine bytes required to store unicode character as utf */
1267 if (ch && (ch < 0x80))
1269 else if (ch < 0x800)
1279 /* utf_copy ********************************************************************
1281 Copy the given utf string byte-for-byte to a buffer.
1284 buffer.......the buffer
1285 u............the utf string
1287 *******************************************************************************/
1289 void utf_copy(char *buffer, utf *u)
1291 /* our utf strings are zero-terminated (done by utf_new) */
1292 MCOPY(buffer, u->text, char, u->blength + 1);
1296 /* utf_cat *********************************************************************
1298 Append the given utf string byte-for-byte to a buffer.
1301 buffer.......the buffer
1302 u............the utf string
1304 *******************************************************************************/
1306 void utf_cat(char *buffer, utf *u)
1308 /* our utf strings are zero-terminated (done by utf_new) */
1309 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1313 /* utf_copy_classname **********************************************************
1315 Copy the given utf classname byte-for-byte to a buffer.
1316 '/' is replaced by '.'
1319 buffer.......the buffer
1320 u............the utf string
1322 *******************************************************************************/
1324 void utf_copy_classname(char *buffer, utf *u)
/* the end pointer is placed one byte past UTF_END(u); the loop below runs
   until the source pointer reaches it */
1333 endptr = UTF_END(u) + 1; /* utfs are zero-terminated by utf_new */
1335 while (srcptr != endptr) {
1344 /* utf_cat *********************************************************************
1346 Append the given utf classname byte-for-byte to a buffer.
1347 '/' is replaced by '.'
1350 buffer.......the buffer
1351 u............the utf string
1353 *******************************************************************************/
1355 void utf_cat_classname(char *buffer, utf *u)
1357 utf_copy_classname(buffer + strlen(buffer), u);
1360 /* utf_display_printable_ascii *************************************************
1362 Write utf symbol to stdout (for debugging purposes).
1363 Non-printable and non-ASCII characters are printed as '?'.
1365 *******************************************************************************/
1367 void utf_display_printable_ascii(utf *u)
1369 char *endpos; /* points behind utf string */
1370 char *utf_ptr; /* current position in utf text */
1378 endpos = UTF_END(u);
1381 while (utf_ptr < endpos) {
1382 /* read next unicode character */
1384 u2 c = utf_nextu2(&utf_ptr);
1386 if ((c >= 32) && (c <= 127))
1396 /* utf_display_printable_ascii_classname ***************************************
1398 Write utf symbol to stdout with `/' converted to `.' (for debugging
1400 Non-printable and non-ASCII characters are printed as '?'.
1402 *******************************************************************************/
/* u is the utf string to display; its text is decoded one character at a
   time with utf_nextu2.
   NOTE(review): the range test below accepts 32..127 inclusive, so 127
   (DEL, an ASCII control character) is treated as printable -- confirm
   whether that is intended. */
1404 void utf_display_printable_ascii_classname(utf *u)
1406 char *endpos; /* points behind utf string */
1407 char *utf_ptr; /* current position in utf text */
/* the loop below stops once the read position reaches this end marker */
1415 endpos = UTF_END(u);
1418 while (utf_ptr < endpos) {
1419 /* read next unicode character */
/* utf_nextu2 receives the address of utf_ptr; presumably it advances the
   pointer past the character it decoded */
1421 u2 c = utf_nextu2(&utf_ptr);
/* values in 32..127 count as printable here */
1426 if ((c >= 32) && (c <= 127))
1436 /* utf_sprint_convert_to_latin1 ************************************************
1438 Write utf symbol into c-string (for debugging purposes).
1439 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1442 *******************************************************************************/
/* buffer is the destination c-string and u the utf string to convert.
   No buffer size is passed in, so nothing here bounds the write.
   NOTE(review): presumably the caller sizes buffer for every character of
   u plus the terminator -- confirm.
   NOTE(review): the write index is a u2; presumably that is a 16-bit type,
   in which case it would wrap for very long strings -- confirm. */
1444 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1446 char *endpos; /* points behind utf string */
1447 char *utf_ptr; /* current position in utf text */
1448 u2 pos = 0; /* position in c-string */
/* stores the literal text "NULL" in buffer.
   NOTE(review): presumably this is the path for a null u -- the guarding
   condition is not visible here; confirm. */
1451 strcpy(buffer, "NULL");
1455 endpos = UTF_END(u);
1458 while (utf_ptr < endpos)
1459 /* copy next unicode character */
/* the decoded character is assigned straight into a char element, so a
   value that does not fit is narrowed by the implicit conversion rather
   than replaced by a substitute character.
   NOTE(review): check this against the header's description of
   non-Latin-1 characters. */
1460 buffer[pos++] = utf_nextu2(&utf_ptr);
1462 /* terminate string */
1467 /* utf_sprint_convert_to_latin1_classname **************************************
1469 Write utf symbol into c-string with `/' converted to `.' (for debugging
1471 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1474 *******************************************************************************/
/* buffer is the destination c-string and u the utf string to convert.
   No buffer size is passed in, so nothing here bounds the write.
   NOTE(review): presumably the caller sizes buffer for every character of
   u plus the terminator -- confirm.
   NOTE(review): the write index is a u2; presumably that is a 16-bit type,
   in which case it would wrap for very long strings -- confirm. */
1476 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1478 char *endpos; /* points behind utf string */
1479 char *utf_ptr; /* current position in utf text */
1480 u2 pos = 0; /* position in c-string */
/* stores the literal text "NULL" in buffer.
   NOTE(review): presumably this is the path for a null u -- the guarding
   condition is not visible here; confirm. */
1483 strcpy(buffer, "NULL");
1487 endpos = UTF_END(u);
1490 while (utf_ptr < endpos) {
1491 /* copy next unicode character */
1492 u2 c = utf_nextu2(&utf_ptr);
/* class names use '/' as separator internally; show it as '.' */
1493 if (c == '/') c = '.';
1497 /* terminate string */
1502 /* utf_strcat_convert_to_latin1 ************************************************
1504 Like libc strcat, but uses an utf8 string.
1505 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1508 *******************************************************************************/
/* buffer must already hold a NUL-terminated string, because strlen() is used
   to find the position where appending starts; u is the utf string to
   append. No buffer size is passed in.
   NOTE(review): presumably the caller guarantees enough room -- confirm. */
1510 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
/* write the converted text starting at the current terminator of buffer;
   the conversion itself is done by utf_sprint_convert_to_latin1 */
1512 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1516 /* utf_strcat_convert_to_latin1_classname **************************************
1518 Like libc strcat, but uses an utf8 string.
1519 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1522 *******************************************************************************/
/* buffer must already hold a NUL-terminated string, because strlen() is used
   to find the position where appending starts; u is the utf string to
   append. No buffer size is passed in.
   NOTE(review): presumably the caller guarantees enough room -- confirm. */
1524 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
/* write the converted text starting at the current terminator of buffer;
   the conversion itself is done by utf_sprint_convert_to_latin1_classname */
1526 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1530 /* utf_fprint_printable_ascii **************************************************
1532 Write utf symbol into file.
1533 Non-printable and non-ASCII characters are printed as '?'.
1535 *******************************************************************************/
/* file is the stream written to with fprintf; u is the utf string to print.
   The return values of fprintf are not examined.
   NOTE(review): the range test below accepts 32..127 inclusive, so 127
   (DEL, an ASCII control character) is written unchanged instead of
   as '?' -- confirm whether that is intended. */
1537 void utf_fprint_printable_ascii(FILE *file, utf *u)
1539 char *endpos; /* points behind utf string */
1540 char *utf_ptr; /* current position in utf text */
/* the loop below stops once the read position reaches this end marker */
1545 endpos = UTF_END(u);
1548 while (utf_ptr < endpos) {
1549 /* read next unicode character */
1550 u2 c = utf_nextu2(&utf_ptr);
/* characters in 32..127 are written as they are, everything else as '?' */
1552 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1553 else fprintf(file, "?");
1558 /* utf_fprint_printable_ascii_classname ****************************************
1560 Write utf symbol into file with `/' converted to `.'.
1561 Non-printable and non-ASCII characters are printed as '?'.
1563 *******************************************************************************/
/* file is the stream written to with fprintf; u is the utf string to print.
   The return values of fprintf are not examined.
   NOTE(review): the range test below accepts 32..127 inclusive, so 127
   (DEL, an ASCII control character) is written unchanged instead of
   as '?' -- confirm whether that is intended. */
1565 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1567 char *endpos; /* points behind utf string */
1568 char *utf_ptr; /* current position in utf text */
/* the loop below stops once the read position reaches this end marker */
1573 endpos = UTF_END(u);
1576 while (utf_ptr < endpos) {
1577 /* read next unicode character */
1578 u2 c = utf_nextu2(&utf_ptr);
/* the separator is replaced before the printable test, so the '.' passes
   through the same range check as every other character */
1579 if (c == '/') c = '.';
/* characters in 32..127 are written as they are, everything else as '?' */
1581 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1582 else fprintf(file, "?");
1587 /* is_valid_utf ****************************************************************
1589 Return true if the given string is a valid UTF-8 string.
1591 utf_ptr...points to first character
1592 end_pos...points after last character
1594 *******************************************************************************/
/* Rules enforced by the checks below:
     - end_pos lying before utf_ptr is rejected
     - a 0x00 byte is rejected
     - a byte with the top bit clear is accepted as ASCII
     - a lead byte announcing more than two continuation bytes is rejected
       ("Java limitation"), so only one-, two- and three-byte forms pass
     - every continuation byte must have the form 10xx xxxx
     - a sequence that would run past end_pos is rejected
   Overlong encodings, surrogates and the values 0xfffe/0xffff are
   deliberately not rejected; the corresponding checks are kept only as
   commented-out code. */
1596 /* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */
1598 bool is_valid_utf(char *utf_ptr, char *end_pos)
1605 if (end_pos < utf_ptr) return false;
/* number of bytes that still have to be examined */
1606 bytes = end_pos - utf_ptr;
1610 if (!c) return false; /* 0x00 is not allowed */
1611 if ((c & 0x80) == 0) continue; /* ASCII */
/* classify the lead byte; len is the number of continuation bytes that
   have to follow it */
1613 if ((c & 0xe0) == 0xc0) len = 1; /* 110x xxxx */
1614 else if ((c & 0xf0) == 0xe0) len = 2; /* 1110 xxxx */
1615 else if ((c & 0xf8) == 0xf0) len = 3; /* 1111 0xxx */
1616 else if ((c & 0xfc) == 0xf8) len = 4; /* 1111 10xx */
1617 else if ((c & 0xfe) == 0xfc) len = 5; /* 1111 110x */
1618 else return false; /* invalid leading byte */
1620 if (len > 2) return false; /* Java limitation */
/* keep only the payload bits of the lead byte: the mask is 0x1f for
   len 1 and 0x0f for len 2 */
1622 v = (unsigned long)c & (0x3f >> len);
/* the continuation bytes are subtracted from the remaining count up front;
   a negative result means the sequence extends past end_pos */
1624 if ((bytes -= len) < 0) return false; /* missing bytes */
/* executes len times, folding six payload bits per continuation byte
   into v */
1626 for (i = len; i--; ) {
1628 if ((c & 0xc0) != 0x80) /* 10xx xxxx */
1630 v = (v << 6) | (c & 0x3f);
/* NOTE(review): presumably this applies only when the decoded value is
   zero, i.e. zero has to be written in the two-byte form -- the guarding
   condition is not visible here; confirm. */
1634 if (len != 1) return false; /* Java special */
1637 /* Sun Java seems to allow overlong UTF-8 encodings */
1639 /* if (v < min_codepoint[len]) */
1640 /* XXX throw exception? */
1643 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
1644 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
1646 /* even these seem to be allowed */
1647 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
1654 /* is_valid_name ***************************************************************
1656 Return true if the given string may be used as a class/field/method
1657 name. (Currently this only disallows empty strings and control
1660 NOTE: The string is assumed to have passed is_valid_utf!
1662 utf_ptr...points to first character
1663 end_pos...points after last character
1665 *******************************************************************************/
/* Checks visible below: an empty range (end_pos <= utf_ptr) is rejected,
   any byte below 0x20 is rejected, and the byte pair 0xc0 0x80 is tested
   for (commented as "disallow zero").
   The look-ahead at *utf_ptr after a 0xc0 byte is not compared against
   end_pos here. It relies on the precondition stated in the header: input
   that passed is_valid_utf always has a continuation byte after such a
   lead byte. */
1667 bool is_valid_name(char *utf_ptr, char *end_pos)
1669 if (end_pos <= utf_ptr) return false; /* disallow empty names */
1671 while (utf_ptr < end_pos) {
/* read through unsigned char so bytes >= 0x80 compare as 128..255 */
1672 unsigned char c = *utf_ptr++;
1674 if (c < 0x20) return false; /* disallow control characters */
/* utf_ptr already points at the byte following c at this point */
1675 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80) /* disallow zero */
/* Convenience form of is_valid_name for a utf symbol: checks the bytes from
   u->text up to UTF_END(u) and returns that result unchanged. u is
   dereferenced without a null check in the line below. */
1682 bool is_valid_name_utf(utf *u)
1684 return is_valid_name(u->text, UTF_END(u));
1688 /* utf_show ********************************************************************
1690 Writes the utf symbols in the utfhash to stdout and displays the
1691 number of external hash chains grouped according to the chainlength
1692 (for debugging purposes).
1694 *******************************************************************************/
/* Only compiled when NDEBUG is not defined. Reads the global hashtable_utf
   (size, entries, ptr[]) and reports through printf. Output has two parts:
   a dump of the table contents, then a histogram of chain lengths with the
   maximum and average chain length. */
1696 #if !defined(NDEBUG)
1700 #define CHAIN_LIMIT 20 /* limit for separated enumeration */
/* chain_count[n] holds the number of slots whose chain has length n; the
   last element collects every chain that is too long for its own bucket */
1702 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1703 u4 max_chainlength = 0; /* maximum length of the chains */
1704 u4 sum_chainlength = 0; /* sum of the chainlengths */
1705 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1708 printf("UTF-HASH:\n");
1710 /* show element of utf-hashtable */
1712 for (i = 0; i < hashtable_utf->size; i++) {
1713 utf *u = hashtable_utf->ptr[i];
1716 printf("SLOT %d: ", (int) i);
1720 utf_display_printable_ascii(u);
1728 printf("UTF-HASH: %d slots for %d entries\n",
1729 (int) hashtable_utf->size, (int) hashtable_utf->entries );
/* the percentages printed further down divide by entries.
   NOTE(review): presumably this check leaves the function early for an
   empty table -- the guarded statement is not visible here; confirm. */
1731 if (hashtable_utf->entries == 0)
1734 printf("chains:\n chainlength number of chains %% of utfstrings\n");
/* NOTE(review): presumably this loop clears chain_count, which has no
   initializer at its declaration -- the loop body is not visible here;
   confirm. */
1736 for (i=0;i<CHAIN_LIMIT;i++)
1739 /* count numbers of hashchains according to their length */
1740 for (i=0; i<hashtable_utf->size; i++) {
1742 utf *u = (utf*) hashtable_utf->ptr[i];
1743 u4 chain_length = 0;
1745 /* determine chainlength */
1751 /* update sum of all chainlengths */
1752 sum_chainlength+=chain_length;
1754 /* determine the maximum length of the chains */
1755 if (chain_length>max_chainlength)
1756 max_chainlength = chain_length;
1758 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
/* NOTE(review): the comments say ">=CHAIN_LIMIT-1" but the test below is
   ">=CHAIN_LIMIT". A chain of exactly CHAIN_LIMIT-1 entries is counted in
   chain_count[CHAIN_LIMIT-1] but never added to beyond_limit, so the
   percentage printed for the last bucket leaves those entries out --
   confirm which of the two is intended. */
1759 if (chain_length>=CHAIN_LIMIT) {
1760 beyond_limit+=chain_length;
/* clamp so the length can index the last bucket */
1761 chain_length=CHAIN_LIMIT-1;
1764 /* update number of hashchains of current length */
1765 chain_count[chain_length]++;
1768 /* display results */
/* one row per exact chain length; chain_count[i]*i is the number of
   entries stored in chains of length i */
1769 for (i=1;i<CHAIN_LIMIT-1;i++)
1770 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
/* final row: the collecting bucket, with its share taken from beyond_limit */
1772 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1775 printf("max. chainlength:%5d\n",max_chainlength);
1777 /* avg. chainlength = sum of chainlengths / number of chains */
/* the divisor is the slot count minus the slots recorded with chain
   length 0, i.e. only slots that actually hold a chain */
1778 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1780 #endif /* !defined(NDEBUG) */
1784 * These are local overrides for various environment variables in Emacs.
1785 * Please do not remove this and leave it at the end of the file, where
1786 * Emacs will automagically detect them.
1787 * ---------------------------------------------------------------------
1790 * indent-tabs-mode: t
1794 * vim:noexpandtab:sw=4:ts=4: