1 /* src/vm/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4 C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5 E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6 J. Wenninger, Institut f. Computersprachen - TU Wien
8 This file is part of CACAO.
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License as
12 published by the Free Software Foundation; either version 2, or (at
13 your option) any later version.
15 This program is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25 Contact: cacao@cacaojvm.org
27 Authors: Reinhard Grafl
34 $Id: utf8.c 5823 2006-10-24 23:24:19Z edwin $
46 #include "mm/memory.h"
48 #if defined(ENABLE_THREADS)
49 # include "threads/native/lock.h"
51 # include "threads/none/lock.h"
54 #include "vm/builtin.h"
55 #include "vm/exceptions.h"
56 #include "vm/hashtable.h"
57 #include "vm/options.h"
58 #include "vm/statistics.h"
59 #include "vm/stringlocal.h"
63 /* global variables ***********************************************************/
65 /* hashsize must be power of 2 */
67 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
69 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
72 /* utf-symbols for pointer comparison of frequently used strings **************/
74 utf *utf_java_lang_Object;
76 utf *utf_java_lang_Class;
77 utf *utf_java_lang_ClassLoader;
78 utf *utf_java_lang_Cloneable;
79 utf *utf_java_lang_SecurityManager;
80 utf *utf_java_lang_String;
81 utf *utf_java_lang_System;
82 utf *utf_java_lang_ThreadGroup;
83 utf *utf_java_io_Serializable;
85 utf *utf_java_lang_Throwable;
86 utf *utf_java_lang_VMThrowable;
87 utf *utf_java_lang_Error;
88 utf *utf_java_lang_AbstractMethodError;
89 utf *utf_java_lang_LinkageError;
90 utf *utf_java_lang_NoClassDefFoundError;
91 utf *utf_java_lang_NoSuchMethodError;
92 utf *utf_java_lang_OutOfMemoryError;
94 utf *utf_java_lang_Exception;
95 utf *utf_java_lang_ClassCastException;
96 utf *utf_java_lang_ClassNotFoundException;
97 utf *utf_java_lang_IllegalArgumentException;
98 utf *utf_java_lang_IllegalMonitorStateException;
100 utf *utf_java_lang_NullPointerException;
102 utf* utf_java_lang_Void;
103 utf* utf_java_lang_Boolean;
104 utf* utf_java_lang_Byte;
105 utf* utf_java_lang_Character;
106 utf* utf_java_lang_Short;
107 utf* utf_java_lang_Integer;
108 utf* utf_java_lang_Long;
109 utf* utf_java_lang_Float;
110 utf* utf_java_lang_Double;
112 utf *utf_java_lang_StackTraceElement;
113 utf *utf_java_lang_reflect_Constructor;
114 utf *utf_java_lang_reflect_Field;
115 utf *utf_java_lang_reflect_Method;
116 utf *utf_java_util_Vector;
118 utf *utf_InnerClasses; /* InnerClasses */
119 utf *utf_ConstantValue; /* ConstantValue */
120 utf *utf_Code; /* Code */
121 utf *utf_Exceptions; /* Exceptions */
122 utf *utf_LineNumberTable; /* LineNumberTable */
123 utf *utf_SourceFile; /* SourceFile */
125 utf *utf_init; /* <init> */
126 utf *utf_clinit; /* <clinit> */
127 utf *utf_clone; /* clone */
128 utf *utf_finalize; /* finalize */
129 utf *utf_run; /* run */
133 utf *utf_removeThread;
138 utf *utf_fillInStackTrace;
139 utf *utf_getSystemClassLoader;
141 utf *utf_printStackTrace;
152 utf *utf_void__void; /* ()V */
153 utf *utf_boolean__void; /* (Z)V */
154 utf *utf_byte__void; /* (B)V */
155 utf *utf_char__void; /* (C)V */
156 utf *utf_short__void; /* (S)V */
157 utf *utf_int__void; /* (I)V */
158 utf *utf_long__void; /* (J)V */
159 utf *utf_float__void; /* (F)V */
160 utf *utf_double__void; /* (D)V */
162 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
163 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
164 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
165 utf *utf_java_lang_Object__java_lang_Object;
166 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
167 utf *utf_java_lang_String__java_lang_Class;
168 utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */
169 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
171 utf *utf_not_named_yet; /* special name for unnamed classes */
173 utf *array_packagename;
176 /* utf_init ********************************************************************
178 Initializes the utf8 subsystem.
180 *******************************************************************************/
184 /* create utf8 hashtable */
186 hashtable_utf = NEW(hashtable);
188 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
190 #if defined(ENABLE_STATISTICS)
192 count_utf_len += sizeof(utf*) * hashtable_utf->size;
195 /* create utf-symbols for pointer comparison of frequently used strings */
197 utf_java_lang_Object = utf_new_char("java/lang/Object");
199 utf_java_lang_Class = utf_new_char("java/lang/Class");
200 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
201 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
202 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
203 utf_java_lang_String = utf_new_char("java/lang/String");
204 utf_java_lang_System = utf_new_char("java/lang/System");
205 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
206 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
208 utf_java_lang_Throwable = utf_new_char(string_java_lang_Throwable);
209 utf_java_lang_VMThrowable = utf_new_char(string_java_lang_VMThrowable);
210 utf_java_lang_Error = utf_new_char(string_java_lang_Error);
212 utf_java_lang_AbstractMethodError =
213 utf_new_char(string_java_lang_AbstractMethodError);
215 utf_java_lang_LinkageError =
216 utf_new_char(string_java_lang_LinkageError);
218 utf_java_lang_NoClassDefFoundError =
219 utf_new_char(string_java_lang_NoClassDefFoundError);
221 utf_java_lang_NoSuchMethodError =
222 utf_new_char(string_java_lang_NoSuchMethodError);
224 utf_java_lang_OutOfMemoryError =
225 utf_new_char(string_java_lang_OutOfMemoryError);
227 utf_java_lang_Exception = utf_new_char(string_java_lang_Exception);
229 utf_java_lang_ClassCastException =
230 utf_new_char(string_java_lang_ClassCastException);
232 utf_java_lang_ClassNotFoundException =
233 utf_new_char(string_java_lang_ClassNotFoundException);
235 utf_java_lang_IllegalArgumentException =
236 utf_new_char(string_java_lang_IllegalArgumentException);
238 utf_java_lang_IllegalMonitorStateException =
239 utf_new_char(string_java_lang_IllegalMonitorStateException);
241 utf_java_lang_NullPointerException =
242 utf_new_char(string_java_lang_NullPointerException);
244 utf_java_lang_Void = utf_new_char("java/lang/Void");
245 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
246 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
247 utf_java_lang_Character = utf_new_char("java/lang/Character");
248 utf_java_lang_Short = utf_new_char("java/lang/Short");
249 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
250 utf_java_lang_Long = utf_new_char("java/lang/Long");
251 utf_java_lang_Float = utf_new_char("java/lang/Float");
252 utf_java_lang_Double = utf_new_char("java/lang/Double");
254 utf_java_lang_StackTraceElement =
255 utf_new_char("java/lang/StackTraceElement");
257 utf_java_lang_reflect_Constructor =
258 utf_new_char("java/lang/reflect/Constructor");
260 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
261 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
262 utf_java_util_Vector = utf_new_char("java/util/Vector");
264 utf_InnerClasses = utf_new_char("InnerClasses");
265 utf_ConstantValue = utf_new_char("ConstantValue");
266 utf_Code = utf_new_char("Code");
267 utf_Exceptions = utf_new_char("Exceptions");
268 utf_LineNumberTable = utf_new_char("LineNumberTable");
269 utf_SourceFile = utf_new_char("SourceFile");
271 utf_init = utf_new_char("<init>");
272 utf_clinit = utf_new_char("<clinit>");
273 utf_clone = utf_new_char("clone");
274 utf_finalize = utf_new_char("finalize");
275 utf_run = utf_new_char("run");
277 utf_add = utf_new_char("add");
278 utf_remove = utf_new_char("remove");
279 utf_removeThread = utf_new_char("removeThread");
280 utf_put = utf_new_char("put");
281 utf_get = utf_new_char("get");
282 utf_value = utf_new_char("value");
284 utf_printStackTrace = utf_new_char("printStackTrace");
285 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
286 utf_loadClass = utf_new_char("loadClass");
287 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
289 utf_Z = utf_new_char("Z");
290 utf_B = utf_new_char("B");
291 utf_C = utf_new_char("C");
292 utf_S = utf_new_char("S");
293 utf_I = utf_new_char("I");
294 utf_J = utf_new_char("J");
295 utf_F = utf_new_char("F");
296 utf_D = utf_new_char("D");
298 utf_void__void = utf_new_char("()V");
299 utf_boolean__void = utf_new_char("(Z)V");
300 utf_byte__void = utf_new_char("(B)V");
301 utf_char__void = utf_new_char("(C)V");
302 utf_short__void = utf_new_char("(S)V");
303 utf_int__void = utf_new_char("(I)V");
304 utf_long__void = utf_new_char("(J)V");
305 utf_float__void = utf_new_char("(F)V");
306 utf_double__void = utf_new_char("(D)V");
307 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
308 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
310 utf_void__java_lang_ClassLoader =
311 utf_new_char("()Ljava/lang/ClassLoader;");
313 utf_java_lang_Object__java_lang_Object =
314 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
316 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
318 utf_java_lang_String__java_lang_Class =
319 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
321 utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V");
322 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
324 utf_null = utf_new_char("null");
325 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
326 array_packagename = utf_new_char("\t<the array package>");
328 /* everything's ok */
334 /* utf_hashkey *****************************************************************
336 The hashkey is computed from the utf-text by using up to 8
337 characters. For utf-symbols longer than 15 characters 3 characters
338 are taken from the beginning and the end, 2 characters are taken
341 *******************************************************************************/
343 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
344 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
346 u4 utf_hashkey(const char *text, u4 length)
348 const char *start_pos = text; /* pointer to utf text */
352 case 0: /* empty string */
355 case 1: return fbs(0);
356 case 2: return fbs(0) ^ nbs(3);
357 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
358 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
359 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
360 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
361 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
362 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
369 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
378 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
387 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
399 return a ^ nbs(9) ^ nbs(10);
411 return a ^ nbs(9) ^ nbs(10);
422 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
433 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
435 default: /* 3 characters from beginning */
441 /* 2 characters from middle */
442 text = start_pos + (length / 2);
447 /* 3 characters from end */
448 text = start_pos + length - 4;
453 return a ^ nbs(10) ^ nbs(11);
457 /* utf_full_hashkey ************************************************************
459 This function computes a hash value using all bytes in the string.
461 The algorithm is the "One-at-a-time" algorithm as published
462 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
464 *******************************************************************************/
466 u4 utf_full_hashkey(const char *text, u4 length)
468 register const unsigned char *p = (const unsigned char *) text;
476 hash += (hash << 10);
480 hash ^= (hash >> 11);
481 hash += (hash << 15);
486 /* unicode_hashkey *************************************************************
488 Compute the hashkey of a unicode string.
490 *******************************************************************************/
492 u4 unicode_hashkey(u2 *text, u2 len)
494 return utf_hashkey((char *) text, len);
498 /* utf_new *********************************************************************
500 Creates a new utf-symbol, the text of the symbol is passed as a
501 u1-array. The function searches the utf-hashtable for a utf-symbol
502 with this text. On success the element is returned, otherwise a new
503 hashtable element is created.
505 If the number of entries in the hashtable exceeds twice the size of
506 the hashtable slots a reorganization of the hashtable is done and
507 the utf symbols are copied to a new hashtable with doubled size.
509 *******************************************************************************/
511 utf *utf_new(const char *text, u2 length)
513 u4 key; /* hashkey computed from utf-text */
514 u4 slot; /* slot in hashtable */
515 utf *u; /* hashtable element */
518 LOCK_MONITOR_ENTER(hashtable_utf->header);
520 #if defined(ENABLE_STATISTICS)
525 key = utf_hashkey(text, length);
526 slot = key & (hashtable_utf->size - 1);
527 u = hashtable_utf->ptr[slot];
529 /* search external hash chain for utf-symbol */
532 if (u->blength == length) {
533 /* compare text of hashtable elements */
535 for (i = 0; i < length; i++)
536 if (text[i] != u->text[i])
539 #if defined(ENABLE_STATISTICS)
541 count_utf_new_found++;
544 /* symbol found in hashtable */
546 LOCK_MONITOR_EXIT(hashtable_utf->header);
552 u = u->hashlink; /* next element in external chain */
555 #if defined(ENABLE_STATISTICS)
557 count_utf_len += sizeof(utf) + length + 1;
560 /* location in hashtable found, create new utf element */
562 u->blength = length; /* length in bytes of utfstring */
563 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
564 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
566 memcpy(u->text, text, length); /* copy utf-text */
567 u->text[length] = '\0';
569 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
570 hashtable_utf->entries++; /* update number of entries */
572 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
574 /* reorganization of hashtable, average length of the external
575 chains is approx. 2 */
577 hashtable *newhash; /* the new hashtable */
583 /* create new hashtable, double the size */
585 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
587 #if defined(ENABLE_STATISTICS)
589 count_utf_len += sizeof(utf*) * hashtable_utf->size;
592 /* transfer elements to new hashtable */
594 for (i = 0; i < hashtable_utf->size; i++) {
595 u = hashtable_utf->ptr[i];
599 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
601 u->hashlink = (utf *) newhash->ptr[slot];
602 newhash->ptr[slot] = u;
604 /* follow link in external hash chain */
610 /* dispose old table */
612 hashtable_free(hashtable_utf);
614 hashtable_utf = newhash;
617 LOCK_MONITOR_EXIT(hashtable_utf->header);
623 /* utf_new_u2 ******************************************************************
625 Make utf symbol from u2 array, if isclassname is true '.' is
628 *******************************************************************************/
630 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
632 char *buffer; /* memory buffer for unicode characters */
633 char *pos; /* pointer to current position in buffer */
634 u4 left; /* unicode characters left */
635 u4 buflength; /* utf length in bytes of the u2 array */
636 utf *result; /* resulting utf-string */
639 /* determine utf length in bytes and allocate memory */
641 buflength = u2_utflength(unicode_pos, unicode_length);
642 buffer = MNEW(char, buflength);
647 for (i = 0; i++ < unicode_length; unicode_pos++) {
648 /* next unicode character */
651 if ((c != 0) && (c < 0x80)) {
654 if ((int) left < 0) break;
655 /* convert classname */
656 if (isclassname && c == '.')
661 } else if (c < 0x800) {
663 unsigned char high = c >> 6;
664 unsigned char low = c & 0x3F;
666 if ((int) left < 0) break;
667 *pos++ = high | 0xC0;
673 char mid = (c >> 6) & 0x3F;
676 if ((int) left < 0) break;
677 *pos++ = high | 0xE0;
683 /* insert utf-string into symbol-table */
684 result = utf_new(buffer,buflength);
686 MFREE(buffer, char, buflength);
692 /* utf_new_char ****************************************************************
694 Creates a new utf symbol, the text for this symbol is passed as a
695 c-string ( = char* ).
697 *******************************************************************************/
699 utf *utf_new_char(const char *text)
701 return utf_new(text, strlen(text));
705 /* utf_new_char_classname ******************************************************
707 Creates a new utf symbol, the text for this symbol is passed as a
708 c-string ( = char* ). "." characters are going to be replaced by
709 "/". Since the above function is used often, this is a separate
710 function, instead of an if.
712 *******************************************************************************/
714 utf *utf_new_char_classname(const char *text)
716 if (strchr(text, '.')) {
717 char *txt = strdup(text);
718 char *end = txt + strlen(txt);
722 for (c = txt; c < end; c++)
723 if (*c == '.') *c = '/';
725 tmpRes = utf_new(txt, strlen(txt));
731 return utf_new(text, strlen(text));
735 /* utf_nextu2 ******************************************************************
737 Read the next unicode character from the utf string and increment
738 the utf-string pointer accordingly.
740 CAUTION: This function is unsafe for input that was not checked
743 *******************************************************************************/
745 u2 utf_nextu2(char **utf_ptr)
747 /* uncompressed unicode character */
749 /* current position in utf text */
750 unsigned char *utf = (unsigned char *) (*utf_ptr);
751 /* bytes representing the unicode character */
752 unsigned char ch1, ch2, ch3;
753 /* number of bytes used to represent the unicode character */
756 switch ((ch1 = utf[0]) >> 4) {
757 default: /* 1 byte */
761 case 0xD: /* 2 bytes */
762 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
763 unsigned char high = ch1 & 0x1F;
764 unsigned char low = ch2 & 0x3F;
765 unicode_char = (high << 6) + low;
770 case 0xE: /* 2 or 3 bytes */
771 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
772 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
773 unsigned char low = ch3 & 0x3f;
774 unsigned char mid = ch2 & 0x3f;
775 unsigned char high = ch1 & 0x0f;
776 unicode_char = (((high << 6) + mid) << 6) + low;
784 /* update position in utf-text */
785 *utf_ptr = (char *) (utf + len);
791 /* utf_bytes *******************************************************************
793 Determine number of bytes (aka. octets) in the utf string.
796 u............utf string
799 The number of octets of this utf string.
800 There is _no_ terminating zero included in this count.
802 *******************************************************************************/
809 /* utf_get_number_of_u2s_for_buffer ********************************************
811 Determine number of UTF-16 u2s in the given UTF-8 buffer
813 CAUTION: This function is unsafe for input that was not checked
816 CAUTION: Use this function *only* when you want to convert a UTF-8 buffer
817 to an array of u2s (UTF-16) and want to know how many of them you will get.
818 All other uses of this function are probably wrong.
821 buffer........points to first char in buffer
822 blength.......number of _bytes_ in the buffer
825 the number of u2s needed to hold this string in UTF-16 encoding.
826 There is _no_ terminating zero included in this count.
828 NOTE: Unlike utf_get_number_of_u2s, this function never throws an
831 *******************************************************************************/
833 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
835 const char *endpos; /* points behind utf string */
836 const char *utf_ptr; /* current position in utf text */
837 u4 len = 0; /* number of unicode characters */
840 endpos = utf_ptr + blength;
842 while (utf_ptr < endpos) {
844 /* next unicode character */
845 utf_nextu2((char **)&utf_ptr);
848 assert(utf_ptr == endpos);
854 /* utf_get_number_of_u2s *******************************************************
856 Determine number of UTF-16 u2s in the utf string.
858 CAUTION: This function is unsafe for input that was not checked
861 CAUTION: Use this function *only* when you want to convert a utf string
862 to an array of u2s and want to know how many of them you will get.
863 All other uses of this function are probably wrong.
866 u............utf string
869 the number of u2s needed to hold this string in UTF-16 encoding.
870 There is _no_ terminating zero included in this count.
871 XXX 0 if a NullPointerException has been thrown (see below)
873 *******************************************************************************/
875 u4 utf_get_number_of_u2s(utf *u)
877 char *endpos; /* points behind utf string */
878 char *utf_ptr; /* current position in utf text */
879 u4 len = 0; /* number of unicode characters */
881 /* XXX this is probably not checked by most callers! Review this after */
882 /* the invalid uses of this function have been eliminated */
884 exceptions_throw_nullpointerexception();
891 while (utf_ptr < endpos) {
893 /* next unicode character */
894 utf_nextu2(&utf_ptr);
897 if (utf_ptr != endpos)
898 /* string ended abruptly */
899 throw_cacao_exception_exit(string_java_lang_InternalError,
900 "Illegal utf8 string");
906 /* utf8_safe_number_of_u2s *****************************************************
908 Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
909 (For invalid UTF-8 the U+fffd replacement character will be counted.)
911 This function is safe even for invalid UTF-8 strings.
914 text..........zero-terminated(!) UTF-8 string (may be invalid)
916 nbytes........strlen(text). (This is needed to completely emulate
920 the number of u2s needed to hold this string in UTF-16 encoding.
921 There is _no_ terminating zero included in this count.
923 *******************************************************************************/
925 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
926 register const unsigned char *t;
929 register const unsigned char *tlimit;
940 t = (const unsigned char *) text;
943 /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
949 /* highest bit set, non-ASCII character */
951 if ((byte & 0xe0) == 0xc0) {
952 /* 2-byte: should be 110..... 10...... ? */
954 if ((*t++ & 0xc0) == 0x80)
959 else if ((byte & 0xf0) == 0xe0) {
960 /* 3-byte: should be 1110.... 10...... 10...... */
964 return len + 1; /* invalid, stop here */
966 if ((*t++ & 0xc0) == 0x80) {
967 if ((*t++ & 0xc0) == 0x80)
975 else if ((byte & 0xf8) == 0xf0) {
976 /* 4-byte: should be 11110... 10...... 10...... 10...... */
980 return len + 1; /* invalid, stop here */
982 if (((byte1 = *t++) & 0xc0) == 0x80) {
983 if (((byte2 = *t++) & 0xc0) == 0x80) {
984 if (((byte3 = *t++) & 0xc0) == 0x80) {
985 /* valid 4-byte UTF-8? */
986 value = ((byte & 0x07) << 18)
987 | ((byte1 & 0x3f) << 12)
988 | ((byte2 & 0x3f) << 6)
991 if (value > 0x10FFFF)
993 else if (value > 0xFFFF)
994 len += 1; /* we need surrogates */
996 ; /* 16bit suffice */
1007 else if ((byte & 0xfc) == 0xf8) {
1008 /* invalid 5-byte */
1010 return len + 1; /* invalid, stop here */
1013 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1016 else if ((byte & 0xfe) == 0xfc) {
1017 /* invalid 6-byte */
1019 return len + 1; /* invalid, stop here */
1022 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1034 /* ASCII character, common case */
1044 /* utf8_safe_convert_to_u2s ****************************************************
1046 Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1047 (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1048 Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1050 This function is safe even for invalid UTF-8 strings.
1053 text..........zero-terminated(!) UTF-8 string (may be invalid)
1055 nbytes........strlen(text). (This is needed to completely emulate
1057 buffer........a preallocated array of u2s to receive the decoded
1058 string. Use utf8_safe_number_of_u2s to get the
1059 required number of u2s for allocating this.
1061 *******************************************************************************/
1063 #define UNICODE_REPLACEMENT 0xfffd
1065 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1066 register const unsigned char *t;
1068 register const unsigned char *tlimit;
1076 assert(nbytes >= 0);
1078 t = (const unsigned char *) text;
1079 tlimit = t + nbytes;
1081 /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1087 /* highest bit set, non-ASCII character */
1089 if ((byte & 0xe0) == 0xc0) {
1090 /* 2-byte: should be 110..... 10...... */
1092 if (((byte1 = *t++) & 0xc0) == 0x80) {
1093 /* valid 2-byte UTF-8 */
1094 *buffer++ = ((byte & 0x1f) << 6)
1095 | ((byte1 & 0x3f) );
1098 *buffer++ = UNICODE_REPLACEMENT;
1102 else if ((byte & 0xf0) == 0xe0) {
1103 /* 3-byte: should be 1110.... 10...... 10...... */
1105 if (t + 2 > tlimit) {
1106 *buffer++ = UNICODE_REPLACEMENT;
1110 if (((byte1 = *t++) & 0xc0) == 0x80) {
1111 if (((byte2 = *t++) & 0xc0) == 0x80) {
1112 /* valid 3-byte UTF-8 */
1113 *buffer++ = ((byte & 0x0f) << 12)
1114 | ((byte1 & 0x3f) << 6)
1115 | ((byte2 & 0x3f) );
1118 *buffer++ = UNICODE_REPLACEMENT;
1123 *buffer++ = UNICODE_REPLACEMENT;
1127 else if ((byte & 0xf8) == 0xf0) {
1128 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1130 if (t + 3 > tlimit) {
1131 *buffer++ = UNICODE_REPLACEMENT;
1135 if (((byte1 = *t++) & 0xc0) == 0x80) {
1136 if (((byte2 = *t++) & 0xc0) == 0x80) {
1137 if (((byte3 = *t++) & 0xc0) == 0x80) {
1138 /* valid 4-byte UTF-8? */
1139 value = ((byte & 0x07) << 18)
1140 | ((byte1 & 0x3f) << 12)
1141 | ((byte2 & 0x3f) << 6)
1142 | ((byte3 & 0x3f) );
1144 if (value > 0x10FFFF) {
1145 *buffer++ = UNICODE_REPLACEMENT;
1147 else if (value > 0xFFFF) {
1148 /* we need surrogates */
1149 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1150 *buffer++ = 0xdc00 | (value & 0x03ff);
1153 *buffer++ = value; /* 16bit suffice */
1156 *buffer++ = UNICODE_REPLACEMENT;
1161 *buffer++ = UNICODE_REPLACEMENT;
1166 *buffer++ = UNICODE_REPLACEMENT;
1170 else if ((byte & 0xfc) == 0xf8) {
1171 if (t + 4 > tlimit) {
1172 *buffer++ = UNICODE_REPLACEMENT;
1177 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1179 *buffer++ = UNICODE_REPLACEMENT;
1181 else if ((byte & 0xfe) == 0xfc) {
1182 if (t + 5 > tlimit) {
1183 *buffer++ = UNICODE_REPLACEMENT;
1188 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1190 *buffer++ = UNICODE_REPLACEMENT;
1193 *buffer++ = UNICODE_REPLACEMENT;
1201 /* ASCII character, common case */
1209 /* u2_utflength ****************************************************************
1211 Returns the utf length in bytes of a u2 array.
1213 *******************************************************************************/
1215 u4 u2_utflength(u2 *text, u4 u2_length)
1217 u4 result_len = 0; /* utf length in bytes */
1218 u2 ch; /* current unicode character */
1221 for (len = 0; len < u2_length; len++) {
1222 /* next unicode character */
1225 /* determine bytes required to store unicode character as utf */
1226 if (ch && (ch < 0x80))
1228 else if (ch < 0x800)
1238 /* utf_copy ********************************************************************
1240 Copy the given utf string byte-for-byte to a buffer.
1243 buffer.......the buffer
1244 u............the utf string
1246 *******************************************************************************/
1248 void utf_copy(char *buffer, utf *u)
1250 /* our utf strings are zero-terminated (done by utf_new) */
1251 MCOPY(buffer, u->text, char, u->blength + 1);
1255 /* utf_cat *********************************************************************
1257 Append the given utf string byte-for-byte to a buffer.
1260 buffer.......the buffer
1261 u............the utf string
1263 *******************************************************************************/
1265 void utf_cat(char *buffer, utf *u)
1267 /* our utf strings are zero-terminated (done by utf_new) */
1268 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1272 /* utf_copy_classname **********************************************************
1274 Copy the given utf classname byte-for-byte to a buffer.
1275 '/' is replaced by '.'
1278 buffer.......the buffer
1279 u............the utf string
1281 *******************************************************************************/
1283 void utf_copy_classname(char *buffer, utf *u)
1292 endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1294 while (srcptr != endptr) {
1303 /* utf_cat *********************************************************************
1305 Append the given utf classname byte-for-byte to a buffer.
1306 '/' is replaced by '.'
1309 buffer.......the buffer
1310 u............the utf string
1312 *******************************************************************************/
1314 void utf_cat_classname(char *buffer, utf *u)
1316 utf_copy_classname(buffer + strlen(buffer), u);
1319 /* utf_display_printable_ascii *************************************************
1321 Write utf symbol to stdout (for debugging purposes).
1322 Non-printable and non-ASCII characters are printed as '?'.
1324 *******************************************************************************/
1326 void utf_display_printable_ascii(utf *u)
1328 char *endpos; /* points behind utf string */
1329 char *utf_ptr; /* current position in utf text */
1337 endpos = UTF_END(u);
1340 while (utf_ptr < endpos) {
1341 /* read next unicode character */
1343 u2 c = utf_nextu2(&utf_ptr);
1345 if ((c >= 32) && (c <= 127))
1355 /* utf_display_printable_ascii_classname ***************************************
1357 Write utf symbol to stdout with `/' converted to `.' (for debugging
1359 Non-printable and non-ASCII characters are printed as '?'.
1361 *******************************************************************************/
1363 void utf_display_printable_ascii_classname(utf *u)
1365 char *endpos; /* points behind utf string */
1366 char *utf_ptr; /* current position in utf text */
1374 endpos = UTF_END(u);
1377 while (utf_ptr < endpos) {
1378 /* read next unicode character */
1380 u2 c = utf_nextu2(&utf_ptr);
1385 if ((c >= 32) && (c <= 127))
1395 /* utf_sprint_convert_to_latin1 ************************************************
1397 Write utf symbol into c-string (for debugging purposes).
1398 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1401 *******************************************************************************/
1403 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1405 char *endpos; /* points behind utf string */
1406 char *utf_ptr; /* current position in utf text */
1407 u2 pos = 0; /* position in c-string */
1410 strcpy(buffer, "NULL");
1414 endpos = UTF_END(u);
1417 while (utf_ptr < endpos)
1418 /* copy next unicode character */
1419 buffer[pos++] = utf_nextu2(&utf_ptr);
1421 /* terminate string */
1426 /* utf_sprint_convert_to_latin1_classname **************************************
1428 Write utf symbol into c-string with `/' converted to `.' (for debugging
1430 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1433 *******************************************************************************/
1435 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1437 char *endpos; /* points behind utf string */
1438 char *utf_ptr; /* current position in utf text */
1439 u2 pos = 0; /* position in c-string */
1442 strcpy(buffer, "NULL");
1446 endpos = UTF_END(u);
1449 while (utf_ptr < endpos) {
1450 /* copy next unicode character */
1451 u2 c = utf_nextu2(&utf_ptr);
1452 if (c == '/') c = '.';
1456 /* terminate string */
1461 /* utf_strcat_convert_to_latin1 ************************************************
1463 Like libc strcat, but uses an utf8 string.
1464 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1467 *******************************************************************************/
1469 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1471 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1475 /* utf_strcat_convert_to_latin1_classname **************************************
1477 Like libc strcat, but uses an utf8 string.
1478 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1481 *******************************************************************************/
1483 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1485 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1489 /* utf_fprint_printable_ascii **************************************************
1491 Write utf symbol into file.
1492 Non-printable and non-ASCII characters are printed as '?'.
1494 *******************************************************************************/
1496 void utf_fprint_printable_ascii(FILE *file, utf *u)
1498 char *endpos; /* points behind utf string */
1499 char *utf_ptr; /* current position in utf text */
1504 endpos = UTF_END(u);
1507 while (utf_ptr < endpos) {
1508 /* read next unicode character */
1509 u2 c = utf_nextu2(&utf_ptr);
1511 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1512 else fprintf(file, "?");
1517 /* utf_fprint_printable_ascii_classname ****************************************
1519 Write utf symbol into file with `/' converted to `.'.
1520 Non-printable and non-ASCII characters are printed as '?'.
1522 *******************************************************************************/
1524 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1526 char *endpos; /* points behind utf string */
1527 char *utf_ptr; /* current position in utf text */
1532 endpos = UTF_END(u);
1535 while (utf_ptr < endpos) {
1536 /* read next unicode character */
1537 u2 c = utf_nextu2(&utf_ptr);
1538 if (c == '/') c = '.';
1540 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1541 else fprintf(file, "?");
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string.

   utf_ptr...points to first character
   end_pos...points after last character

   Rules that are checked:
   - a 0x00 byte is never allowed
   - only sequences of up to three bytes are accepted (Java limitation)
   - every continuation byte must have the form 10xx xxxx
   - the code point zero must be encoded as a two-byte sequence

   Overlong encodings, surrogates and the code points 0xfffe/0xffff are
   deliberately accepted. An empty string is valid; end_pos < utf_ptr is
   not.

*******************************************************************************/

/* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int           bytes;                /* bytes left to be examined          */
	int           len;                  /* number of continuation bytes       */
	int           i;
	unsigned char c;                    /* current byte                       */
	unsigned long v;                    /* decoded code point                 */

	if (end_pos < utf_ptr) return false;
	bytes = end_pos - utf_ptr;

	while (bytes--) {
		/* bytes are examined as unsigned values so that the bit masks
		   below do not depend on the signedness of plain char */

		c = (unsigned char) *utf_ptr++;

		if (!c) return false;                     /* 0x00 is not allowed */
		if ((c & 0x80) == 0) continue;            /* ASCII */

		if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
		else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
		else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
		else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
		else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
		else return false;                        /* invalid leading byte */

		if (len > 2) return false;                /* Java limitation */

		/* payload bits of the leading byte */

		v = (unsigned long)c & (0x3f >> len);

		if ((bytes -= len) < 0) return false;     /* missing bytes */

		for (i = len; i--; ) {
			c = (unsigned char) *utf_ptr++;

			if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
				return false;

			v = (v << 6) | (c & 0x3f);
		}

		if (v == 0) {
			if (len != 1) return false;           /* Java special */
		}
		else {
			/* Sun Java seems to allow overlong UTF-8 encodings */

			/* if (v < min_codepoint[len]) */
				/* XXX throw exception? */
		}

		/* surrogates in UTF-8 seem to be allowed in Java classfiles */
		/* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */

		/* even these seem to be allowed */
		/* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
	}

	return true;
}
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings and control
   characters, including an encoded zero character.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr) return false; /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20) return false; /* disallow control characters */

		/* The zero character is encoded as 0xc0 0x80. The second byte is
		   only inspected if it still belongs to the string, so nothing
		   at or behind end_pos is ever read. */

		if (c == 0xc0 && utf_ptr < end_pos
			&& (unsigned char) *utf_ptr == 0x80)  /* disallow zero */
			return false;
	}

	return true;
}
1641 bool is_valid_name_utf(utf *u)
1643 return is_valid_name(u->text, UTF_END(u));
1647 /* utf_show ********************************************************************
1649 Writes the utf symbols in the utfhash to stdout and displays the
1650 number of external hash chains grouped according to the chainlength
1651 (for debugging purposes).
1653 *******************************************************************************/
1655 #if !defined(NDEBUG)
1659 #define CHAIN_LIMIT 20 /* limit for seperated enumeration */
1661 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1662 u4 max_chainlength = 0; /* maximum length of the chains */
1663 u4 sum_chainlength = 0; /* sum of the chainlengths */
1664 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1667 printf("UTF-HASH:\n");
1669 /* show element of utf-hashtable */
1671 for (i = 0; i < hashtable_utf->size; i++) {
1672 utf *u = hashtable_utf->ptr[i];
1675 printf("SLOT %d: ", (int) i);
1679 utf_display_printable_ascii(u);
1687 printf("UTF-HASH: %d slots for %d entries\n",
1688 (int) hashtable_utf->size, (int) hashtable_utf->entries );
1690 if (hashtable_utf->entries == 0)
1693 printf("chains:\n chainlength number of chains %% of utfstrings\n");
1695 for (i=0;i<CHAIN_LIMIT;i++)
1698 /* count numbers of hashchains according to their length */
1699 for (i=0; i<hashtable_utf->size; i++) {
1701 utf *u = (utf*) hashtable_utf->ptr[i];
1702 u4 chain_length = 0;
1704 /* determine chainlength */
1710 /* update sum of all chainlengths */
1711 sum_chainlength+=chain_length;
1713 /* determine the maximum length of the chains */
1714 if (chain_length>max_chainlength)
1715 max_chainlength = chain_length;
1717 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1718 if (chain_length>=CHAIN_LIMIT) {
1719 beyond_limit+=chain_length;
1720 chain_length=CHAIN_LIMIT-1;
1723 /* update number of hashchains of current length */
1724 chain_count[chain_length]++;
1727 /* display results */
1728 for (i=1;i<CHAIN_LIMIT-1;i++)
1729 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1731 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1734 printf("max. chainlength:%5d\n",max_chainlength);
1736 /* avg. chainlength = sum of chainlengths / number of chains */
1737 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1739 #endif /* !defined(NDEBUG) */
1743 * These are local overrides for various environment variables in Emacs.
1744 * Please do not remove this and leave it at the end of the file, where
1745 * Emacs will automagically detect them.
1746 * ---------------------------------------------------------------------
1749 * indent-tabs-mode: t
1753 * vim:noexpandtab:sw=4:ts=4: