1 /* src/vm/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4 C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5 E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6 J. Wenninger, Institut f. Computersprachen - TU Wien
8 This file is part of CACAO.
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License as
12 published by the Free Software Foundation; either version 2, or (at
13 your option) any later version.
15 This program is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25 Contact: cacao@cacaojvm.org
27 Authors: Reinhard Grafl
34 $Id: utf8.c 5821 2006-10-24 16:41:54Z edwin $
46 #include "mm/memory.h"
48 #if defined(ENABLE_THREADS)
49 # include "threads/native/lock.h"
51 # include "threads/none/lock.h"
54 #include "vm/builtin.h"
55 #include "vm/exceptions.h"
56 #include "vm/hashtable.h"
57 #include "vm/options.h"
58 #include "vm/statistics.h"
59 #include "vm/stringlocal.h"
63 /* global variables ***********************************************************/
65 /* hashsize must be a power of 2 */
67 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
69 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
72 /* utf-symbols for pointer comparison of frequently used strings **************/
74 utf *utf_java_lang_Object;
76 utf *utf_java_lang_Class;
77 utf *utf_java_lang_ClassLoader;
78 utf *utf_java_lang_Cloneable;
79 utf *utf_java_lang_SecurityManager;
80 utf *utf_java_lang_String;
81 utf *utf_java_lang_System;
82 utf *utf_java_lang_ThreadGroup;
83 utf *utf_java_io_Serializable;
85 utf *utf_java_lang_Throwable;
86 utf *utf_java_lang_VMThrowable;
87 utf *utf_java_lang_Error;
88 utf *utf_java_lang_AbstractMethodError;
89 utf *utf_java_lang_LinkageError;
90 utf *utf_java_lang_NoClassDefFoundError;
91 utf *utf_java_lang_NoSuchMethodError;
92 utf *utf_java_lang_OutOfMemoryError;
94 utf *utf_java_lang_Exception;
95 utf *utf_java_lang_ClassCastException;
96 utf *utf_java_lang_ClassNotFoundException;
97 utf *utf_java_lang_IllegalArgumentException;
98 utf *utf_java_lang_IllegalMonitorStateException;
100 utf *utf_java_lang_NullPointerException;
102 utf* utf_java_lang_Void;
103 utf* utf_java_lang_Boolean;
104 utf* utf_java_lang_Byte;
105 utf* utf_java_lang_Character;
106 utf* utf_java_lang_Short;
107 utf* utf_java_lang_Integer;
108 utf* utf_java_lang_Long;
109 utf* utf_java_lang_Float;
110 utf* utf_java_lang_Double;
112 utf *utf_java_lang_StackTraceElement;
113 utf *utf_java_lang_reflect_Constructor;
114 utf *utf_java_lang_reflect_Field;
115 utf *utf_java_lang_reflect_Method;
116 utf *utf_java_util_Vector;
118 utf *utf_InnerClasses; /* InnerClasses */
119 utf *utf_ConstantValue; /* ConstantValue */
120 utf *utf_Code; /* Code */
121 utf *utf_Exceptions; /* Exceptions */
122 utf *utf_LineNumberTable; /* LineNumberTable */
123 utf *utf_SourceFile; /* SourceFile */
125 utf *utf_init; /* <init> */
126 utf *utf_clinit; /* <clinit> */
127 utf *utf_clone; /* clone */
128 utf *utf_finalize; /* finalize */
129 utf *utf_run; /* run */
133 utf *utf_removeThread; /* removeThread */
138 utf *utf_fillInStackTrace; /* fillInStackTrace */
139 utf *utf_getSystemClassLoader; /* getSystemClassLoader */
141 utf *utf_printStackTrace; /* printStackTrace */
152 utf *utf_void__void; /* ()V */
153 utf *utf_boolean__void; /* (Z)V */
154 utf *utf_byte__void; /* (B)V */
155 utf *utf_char__void; /* (C)V */
156 utf *utf_short__void; /* (S)V */
157 utf *utf_int__void; /* (I)V */
158 utf *utf_long__void; /* (J)V */
159 utf *utf_float__void; /* (F)V */
160 utf *utf_double__void; /* (D)V */
162 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
163 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
164 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
165 utf *utf_java_lang_Object__java_lang_Object; /* (Ljava/lang/Object;)Ljava/lang/Object; */
166 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
167 utf *utf_java_lang_String__java_lang_Class; /* (Ljava/lang/String;)Ljava/lang/Class; */
168 utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */
169 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
171 utf *utf_not_named_yet; /* special name for unnamed classes */
173 utf *array_packagename;
176 /* utf_init ********************************************************************
178 Initializes the utf8 subsystem.
180 *******************************************************************************/
184 /* create utf8 hashtable */
186 hashtable_utf = NEW(hashtable);
188 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
190 #if defined(ENABLE_STATISTICS)
192 count_utf_len += sizeof(utf*) * hashtable_utf->size;
195 /* create utf-symbols for pointer comparison of frequently used strings */
197 utf_java_lang_Object = utf_new_char("java/lang/Object");
199 utf_java_lang_Class = utf_new_char("java/lang/Class");
200 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
201 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
202 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
203 utf_java_lang_String = utf_new_char("java/lang/String");
204 utf_java_lang_System = utf_new_char("java/lang/System");
205 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
206 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
208 utf_java_lang_Throwable = utf_new_char(string_java_lang_Throwable);
209 utf_java_lang_VMThrowable = utf_new_char(string_java_lang_VMThrowable);
210 utf_java_lang_Error = utf_new_char(string_java_lang_Error);
212 utf_java_lang_AbstractMethodError =
213 utf_new_char(string_java_lang_AbstractMethodError);
215 utf_java_lang_LinkageError =
216 utf_new_char(string_java_lang_LinkageError);
218 utf_java_lang_NoClassDefFoundError =
219 utf_new_char(string_java_lang_NoClassDefFoundError);
221 utf_java_lang_NoSuchMethodError =
222 utf_new_char(string_java_lang_NoSuchMethodError);
224 utf_java_lang_OutOfMemoryError =
225 utf_new_char(string_java_lang_OutOfMemoryError);
227 utf_java_lang_Exception = utf_new_char(string_java_lang_Exception);
229 utf_java_lang_ClassCastException =
230 utf_new_char(string_java_lang_ClassCastException);
232 utf_java_lang_ClassNotFoundException =
233 utf_new_char(string_java_lang_ClassNotFoundException);
235 utf_java_lang_IllegalArgumentException =
236 utf_new_char(string_java_lang_IllegalArgumentException);
238 utf_java_lang_IllegalMonitorStateException =
239 utf_new_char(string_java_lang_IllegalMonitorStateException);
241 utf_java_lang_NullPointerException =
242 utf_new_char(string_java_lang_NullPointerException);
244 utf_java_lang_Void = utf_new_char("java/lang/Void");
245 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
246 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
247 utf_java_lang_Character = utf_new_char("java/lang/Character");
248 utf_java_lang_Short = utf_new_char("java/lang/Short");
249 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
250 utf_java_lang_Long = utf_new_char("java/lang/Long");
251 utf_java_lang_Float = utf_new_char("java/lang/Float");
252 utf_java_lang_Double = utf_new_char("java/lang/Double");
254 utf_java_lang_StackTraceElement =
255 utf_new_char("java/lang/StackTraceElement");
257 utf_java_lang_reflect_Constructor =
258 utf_new_char("java/lang/reflect/Constructor");
260 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
261 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
262 utf_java_util_Vector = utf_new_char("java/util/Vector");
264 utf_InnerClasses = utf_new_char("InnerClasses");
265 utf_ConstantValue = utf_new_char("ConstantValue");
266 utf_Code = utf_new_char("Code");
267 utf_Exceptions = utf_new_char("Exceptions");
268 utf_LineNumberTable = utf_new_char("LineNumberTable");
269 utf_SourceFile = utf_new_char("SourceFile");
271 utf_init = utf_new_char("<init>");
272 utf_clinit = utf_new_char("<clinit>");
273 utf_clone = utf_new_char("clone");
274 utf_finalize = utf_new_char("finalize");
275 utf_run = utf_new_char("run");
277 utf_add = utf_new_char("add");
278 utf_remove = utf_new_char("remove");
279 utf_removeThread = utf_new_char("removeThread");
280 utf_put = utf_new_char("put");
281 utf_get = utf_new_char("get");
282 utf_value = utf_new_char("value");
284 utf_printStackTrace = utf_new_char("printStackTrace");
285 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
286 utf_loadClass = utf_new_char("loadClass");
287 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
289 utf_Z = utf_new_char("Z");
290 utf_B = utf_new_char("B");
291 utf_C = utf_new_char("C");
292 utf_S = utf_new_char("S");
293 utf_I = utf_new_char("I");
294 utf_J = utf_new_char("J");
295 utf_F = utf_new_char("F");
296 utf_D = utf_new_char("D");
298 utf_void__void = utf_new_char("()V");
299 utf_boolean__void = utf_new_char("(Z)V");
300 utf_byte__void = utf_new_char("(B)V");
301 utf_char__void = utf_new_char("(C)V");
302 utf_short__void = utf_new_char("(S)V");
303 utf_int__void = utf_new_char("(I)V");
304 utf_long__void = utf_new_char("(J)V");
305 utf_float__void = utf_new_char("(F)V");
306 utf_double__void = utf_new_char("(D)V");
307 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
308 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
310 utf_void__java_lang_ClassLoader =
311 utf_new_char("()Ljava/lang/ClassLoader;");
313 utf_java_lang_Object__java_lang_Object =
314 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
316 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
318 utf_java_lang_String__java_lang_Class =
319 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
321 utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V");
322 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
324 utf_null = utf_new_char("null");
325 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
326 array_packagename = utf_new_char("\t<the array package>");
328 /* everything's ok */
334 /* utf_hashkey *****************************************************************
336 The hashkey is computed from the utf-text by using up to 8
337 characters. For utf-symbols longer than 15 characters, 3 characters
338 are taken from the beginning and the end, 2 characters are taken
341 *******************************************************************************/
343 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
344 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
346 u4 utf_hashkey(const char *text, u4 length)
348 const char *start_pos = text; /* pointer to utf text */
352 case 0: /* empty string */
355 case 1: return fbs(0);
356 case 2: return fbs(0) ^ nbs(3);
357 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
358 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
359 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
360 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
361 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
362 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
369 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
378 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
387 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
399 return a ^ nbs(9) ^ nbs(10);
411 return a ^ nbs(9) ^ nbs(10);
422 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
433 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
435 default: /* 3 characters from beginning */
441 /* 2 characters from middle */
442 text = start_pos + (length / 2);
447 /* 3 characters from end */
448 text = start_pos + length - 4;
453 return a ^ nbs(10) ^ nbs(11);
457 /* utf_full_hashkey ************************************************************
459 This function computes a hash value using all bytes in the string.
461 The algorithm is the "One-at-a-time" algorithm as published
462 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
464 *******************************************************************************/
466 u4 utf_full_hashkey(const char *text, u4 length)
468 register const unsigned char *p = (const unsigned char *) text;
476 hash += (hash << 10);
480 hash ^= (hash >> 11);
481 hash += (hash << 15);
486 /* unicode_hashkey *************************************************************
488 Compute the hashkey of a unicode string.
490 *******************************************************************************/
492 u4 unicode_hashkey(u2 *text, u2 len)
494 return utf_hashkey((char *) text, len);
498 /* utf_new *********************************************************************
500 Creates a new utf-symbol, the text of the symbol is passed as a
501 u1-array. The function searches the utf-hashtable for a utf-symbol
502 with this text. On success the element is returned, otherwise a new
503 hashtable element is created.
505 If the number of entries in the hashtable exceeds twice the number
506 of hashtable slots, the hashtable is reorganized: the utf symbols
507 are relinked into a new hashtable with doubled size.
509 *******************************************************************************/
511 utf *utf_new(const char *text, u2 length)
513 u4 key; /* hashkey computed from utf-text */
514 u4 slot; /* slot in hashtable */
515 utf *u; /* hashtable element */
518 LOCK_MONITOR_ENTER(hashtable_utf->header);
520 #if defined(ENABLE_STATISTICS)
525 key = utf_hashkey(text, length);
526 slot = key & (hashtable_utf->size - 1);
527 u = hashtable_utf->ptr[slot];
529 /* search external hash chain for utf-symbol */
532 if (u->blength == length) {
533 /* compare text of hashtable elements */
535 for (i = 0; i < length; i++)
536 if (text[i] != u->text[i])
539 #if defined(ENABLE_STATISTICS)
541 count_utf_new_found++;
544 /* symbol found in hashtable */
546 LOCK_MONITOR_EXIT(hashtable_utf->header);
552 u = u->hashlink; /* next element in external chain */
555 #if defined(ENABLE_STATISTICS)
557 count_utf_len += sizeof(utf) + length + 1;
560 /* location in hashtable found, create new utf element */
562 u->blength = length; /* length in bytes of utfstring */
563 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
564 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
566 memcpy(u->text, text, length); /* copy utf-text */
567 u->text[length] = '\0';
569 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
570 hashtable_utf->entries++; /* update number of entries */
572 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
574 /* reorganization of hashtable, average length of the external
575 chains is approx. 2 */
577 hashtable *newhash; /* the new hashtable */
583 /* create new hashtable, double the size */
585 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
587 #if defined(ENABLE_STATISTICS)
589 count_utf_len += sizeof(utf*) * hashtable_utf->size;
592 /* transfer elements to new hashtable */
594 for (i = 0; i < hashtable_utf->size; i++) {
595 u = hashtable_utf->ptr[i];
599 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
601 u->hashlink = (utf *) newhash->ptr[slot];
602 newhash->ptr[slot] = u;
604 /* follow link in external hash chain */
610 /* dispose old table */
612 hashtable_free(hashtable_utf);
614 hashtable_utf = newhash;
617 LOCK_MONITOR_EXIT(hashtable_utf->header);
623 /* utf_new_u2 ******************************************************************
625 Make utf symbol from u2 array, if isclassname is true '.' is
628 *******************************************************************************/
630 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
632 char *buffer; /* memory buffer for unicode characters */
633 char *pos; /* pointer to current position in buffer */
634 u4 left; /* unicode characters left */
635 u4 buflength; /* utf length in bytes of the u2 array */
636 utf *result; /* resulting utf-string */
639 /* determine utf length in bytes and allocate memory */
641 buflength = u2_utflength(unicode_pos, unicode_length);
642 buffer = MNEW(char, buflength);
647 for (i = 0; i++ < unicode_length; unicode_pos++) {
648 /* next unicode character */
651 if ((c != 0) && (c < 0x80)) {
654 if ((int) left < 0) break;
655 /* convert classname */
656 if (isclassname && c == '.')
661 } else if (c < 0x800) {
663 unsigned char high = c >> 6;
664 unsigned char low = c & 0x3F;
666 if ((int) left < 0) break;
667 *pos++ = high | 0xC0;
673 char mid = (c >> 6) & 0x3F;
676 if ((int) left < 0) break;
677 *pos++ = high | 0xE0;
683 /* insert utf-string into symbol-table */
684 result = utf_new(buffer,buflength);
686 MFREE(buffer, char, buflength);
692 /* utf_new_char ****************************************************************
694 Creates a new utf symbol, the text for this symbol is passed as a
695 c-string ( = char* ).
697 *******************************************************************************/
699 utf *utf_new_char(const char *text)
701 return utf_new(text, strlen(text));
705 /* utf_new_char_classname ******************************************************
707 Creates a new utf symbol, the text for this symbol is passed as a
708 c-string ( = char* ) "." characters are going to be replaced by
709 "/". Since the above function is used often, this is a separate
710 function, instead of an if.
712 *******************************************************************************/
714 utf *utf_new_char_classname(const char *text)
716 if (strchr(text, '.')) {
717 char *txt = strdup(text);
718 char *end = txt + strlen(txt);
722 for (c = txt; c < end; c++)
723 if (*c == '.') *c = '/';
725 tmpRes = utf_new(txt, strlen(txt));
731 return utf_new(text, strlen(text));
735 /* utf_nextu2 ******************************************************************
737 Read the next unicode character from the utf string and increment
738 the utf-string pointer accordingly.
740 CAUTION: This function is unsafe for input that was not checked
743 *******************************************************************************/
745 u2 utf_nextu2(char **utf_ptr)
747 /* uncompressed unicode character */
749 /* current position in utf text */
750 unsigned char *utf = (unsigned char *) (*utf_ptr);
751 /* bytes representing the unicode character */
752 unsigned char ch1, ch2, ch3;
753 /* number of bytes used to represent the unicode character */
756 switch ((ch1 = utf[0]) >> 4) {
757 default: /* 1 byte */
761 case 0xD: /* 2 bytes */
762 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
763 unsigned char high = ch1 & 0x1F;
764 unsigned char low = ch2 & 0x3F;
765 unicode_char = (high << 6) + low;
770 case 0xE: /* 2 or 3 bytes */
771 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
772 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
773 unsigned char low = ch3 & 0x3f;
774 unsigned char mid = ch2 & 0x3f;
775 unsigned char high = ch1 & 0x0f;
776 unicode_char = (((high << 6) + mid) << 6) + low;
784 /* update position in utf-text */
785 *utf_ptr = (char *) (utf + len);
791 /* utf_bytes *******************************************************************
793 Determine number of bytes (aka. octets) in the utf string.
796 u............utf string
799 The number of octets of this utf string.
800 There is _no_ terminating zero included in this count.
802 *******************************************************************************/
809 /* utf_get_number_of_u2s_for_buffer ********************************************
811 Determine number of UTF-16 u2s in the given UTF-8 buffer
813 CAUTION: This function is unsafe for input that was not checked
816 CAUTION: Use this function *only* when you want to convert a UTF-8 buffer
817 to an array of u2s (UTF-16) and want to know how many of them you will get.
818 All other uses of this function are probably wrong.
821 buffer........points to first char in buffer
822 blength.......number of _bytes_ in the buffer
825 the number of u2s needed to hold this string in UTF-16 encoding.
826 There is _no_ terminating zero included in this count.
828 NOTE: Unlike utf_get_number_of_u2s, this function never throws an
831 *******************************************************************************/
833 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
835 const char *endpos; /* points behind utf string */
836 const char *utf_ptr; /* current position in utf text */
837 u4 len = 0; /* number of unicode characters */
840 endpos = utf_ptr + blength;
842 while (utf_ptr < endpos) {
844 /* next unicode character */
845 utf_nextu2((char **)&utf_ptr);
848 assert(utf_ptr == endpos);
854 /* utf_get_number_of_u2s *******************************************************
856 Determine number of UTF-16 u2s in the utf string.
858 CAUTION: This function is unsafe for input that was not checked
861 CAUTION: Use this function *only* when you want to convert a utf string
862 to an array of u2s and want to know how many of them you will get.
863 All other uses of this function are probably wrong.
866 u............utf string
869 the number of u2s needed to hold this string in UTF-16 encoding.
870 There is _no_ terminating zero included in this count.
871 XXX 0 if a NullPointerException has been thrown (see below)
873 *******************************************************************************/
875 u4 utf_get_number_of_u2s(utf *u)
877 char *endpos; /* points behind utf string */
878 char *utf_ptr; /* current position in utf text */
879 u4 len = 0; /* number of unicode characters */
881 /* XXX this is probably not checked by most callers! Review this after */
882 /* the invalid uses of this function have been eliminated */
884 exceptions_throw_nullpointerexception();
891 while (utf_ptr < endpos) {
893 /* next unicode character */
894 utf_nextu2(&utf_ptr);
897 if (utf_ptr != endpos)
898 /* string ended abruptly */
899 throw_cacao_exception_exit(string_java_lang_InternalError,
900 "Illegal utf8 string");
906 /* utf8_safe_number_of_u2s *****************************************************
908 Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
909 (For invalid UTF-8 the U+fffd replacement character will be counted.)
911 This function is safe even for invalid UTF-8 strings.
914 text..........zero-terminated UTF-8 string (may be invalid)
918 the number of u2s needed to hold this string in UTF-16 encoding.
919 There is _no_ terminating zero included in this count.
921 *******************************************************************************/
923 s4 utf8_safe_number_of_u2s(const char *text) {
924 register const unsigned char *t;
936 t = (const unsigned char *) text;
938 /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
944 /* highest bit set, non-ASCII character */
946 if ((byte & 0xe0) == 0xc0) {
947 /* 2-byte: should be 110..... 10...... ? */
949 if ((*t++ & 0xc0) == 0x80)
954 else if ((byte & 0xf0) == 0xe0) {
955 /* 3-byte: should be 1110.... 10...... 10...... */
957 if ((*t++ & 0xc0) == 0x80) {
958 if ((*t++ & 0xc0) == 0x80)
966 else if ((byte & 0xf8) == 0xf0) {
967 /* 4-byte: should be 11110... 10...... 10...... 10...... */
969 if (((byte1 = *t++) & 0xc0) == 0x80) {
970 if (((byte2 = *t++) & 0xc0) == 0x80) {
971 if (((byte3 = *t++) & 0xc0) == 0x80) {
972 /* valid 4-byte UTF-8? */
973 value = ((byte & 0x07) << 18)
974 | ((byte1 & 0x3f) << 12)
975 | ((byte2 & 0x3f) << 6)
978 if (value > 0x10FFFF)
980 else if (value > 0xFFFF)
981 len += 1; /* we need surrogates */
983 ; /* 16bit suffice */
994 else if ((byte & 0xfc) == 0xf8) {
997 for (; skip && (*t & 0x80); --skip)
1000 else if ((byte & 0xfe) == 0xfc) {
1001 /* invalid 6-byte */
1003 for (; skip && (*t & 0x80); --skip)
1015 /* ASCII character, common case */
1025 /* utf8_safe_convert_to_u2s ****************************************************
1027 Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1028 (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1029 Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1031 This function is safe even for invalid UTF-8 strings.
1034 text..........zero-terminated UTF-8 string (may be invalid)
1037 *******************************************************************************/
1039 #define UNICODE_REPLACEMENT 0xfffd
1041 void utf8_safe_convert_to_u2s(const char *text, u2 *buffer) {
1042 register const unsigned char *t;
1052 t = (const unsigned char *) text;
1054 /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1060 /* highest bit set, non-ASCII character */
1062 if ((byte & 0xe0) == 0xc0) {
1063 /* 2-byte: should be 110..... 10...... */
1065 if (((byte1 = *t++) & 0xc0) == 0x80) {
1066 /* valid 2-byte UTF-8 */
1067 *buffer++ = ((byte & 0x1f) << 6)
1068 | ((byte1 & 0x3f) );
1071 *buffer++ = UNICODE_REPLACEMENT;
1075 else if ((byte & 0xf0) == 0xe0) {
1076 /* 3-byte: should be 1110.... 10...... 10...... */
1078 if (((byte1 = *t++) & 0xc0) == 0x80) {
1079 if (((byte2 = *t++) & 0xc0) == 0x80) {
1080 /* valid 3-byte UTF-8 */
1081 *buffer++ = ((byte & 0x0f) << 12)
1082 | ((byte1 & 0x3f) << 6)
1083 | ((byte2 & 0x3f) );
1086 *buffer++ = UNICODE_REPLACEMENT;
1091 *buffer++ = UNICODE_REPLACEMENT;
1095 else if ((byte & 0xf8) == 0xf0) {
1096 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1098 if (((byte1 = *t++) & 0xc0) == 0x80) {
1099 if (((byte2 = *t++) & 0xc0) == 0x80) {
1100 if (((byte3 = *t++) & 0xc0) == 0x80) {
1101 /* valid 4-byte UTF-8? */
1102 value = ((byte & 0x07) << 18)
1103 | ((byte1 & 0x3f) << 12)
1104 | ((byte2 & 0x3f) << 6)
1105 | ((byte3 & 0x3f) );
1107 if (value > 0x10FFFF) {
1108 *buffer++ = UNICODE_REPLACEMENT;
1110 else if (value > 0xFFFF) {
1111 /* we need surrogates */
1112 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1113 *buffer++ = 0xdc00 | (value & 0x03ff);
1116 *buffer++ = value; /* 16bit suffice */
1119 *buffer++ = UNICODE_REPLACEMENT;
1124 *buffer++ = UNICODE_REPLACEMENT;
1129 *buffer++ = UNICODE_REPLACEMENT;
1133 else if ((byte & 0xfc) == 0xf8) {
1135 for (; skip && (*t & 0x80); --skip)
1137 *buffer++ = UNICODE_REPLACEMENT;
1139 else if ((byte & 0xfe) == 0xfc) {
1141 for (; skip && (*t & 0x80); --skip)
1143 *buffer++ = UNICODE_REPLACEMENT;
1146 *buffer++ = UNICODE_REPLACEMENT;
1154 /* ASCII character, common case */
1162 /* u2_utflength ****************************************************************
1164 Returns the utf length in bytes of a u2 array.
1166 *******************************************************************************/
1168 u4 u2_utflength(u2 *text, u4 u2_length)
1170 u4 result_len = 0; /* utf length in bytes */
1171 u2 ch; /* current unicode character */
1174 for (len = 0; len < u2_length; len++) {
1175 /* next unicode character */
1178 /* determine bytes required to store unicode character as utf */
1179 if (ch && (ch < 0x80))
1181 else if (ch < 0x800)
1191 /* utf_copy ********************************************************************
1193 Copy the given utf string byte-for-byte to a buffer.
1196 buffer.......the buffer
1197 u............the utf string
1199 *******************************************************************************/
1201 void utf_copy(char *buffer, utf *u)
1203 /* our utf strings are zero-terminated (done by utf_new) */
1204 MCOPY(buffer, u->text, char, u->blength + 1);
1208 /* utf_cat *********************************************************************
1210 Append the given utf string byte-for-byte to a buffer.
1213 buffer.......the buffer
1214 u............the utf string
1216 *******************************************************************************/
1218 void utf_cat(char *buffer, utf *u)
1220 /* our utf strings are zero-terminated (done by utf_new) */
1221 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1225 /* utf_copy_classname **********************************************************
1227 Copy the given utf classname byte-for-byte to a buffer.
1228 '/' is replaced by '.'
1231 buffer.......the buffer
1232 u............the utf string
1234 *******************************************************************************/
1236 void utf_copy_classname(char *buffer, utf *u)
1245 endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1247 while (srcptr != endptr) {
1256 /* utf_cat_classname ***********************************************************
1258 Append the given utf classname byte-for-byte to a buffer.
1259 '/' is replaced by '.'
1262 buffer.......the buffer
1263 u............the utf string
1265 *******************************************************************************/
1267 void utf_cat_classname(char *buffer, utf *u)
1269 utf_copy_classname(buffer + strlen(buffer), u);
1272 /* utf_display_printable_ascii *************************************************
1274 Write utf symbol to stdout (for debugging purposes).
1275 Non-printable and non-ASCII characters are printed as '?'.
1277 *******************************************************************************/
1279 void utf_display_printable_ascii(utf *u)
1281 char *endpos; /* points behind utf string */
1282 char *utf_ptr; /* current position in utf text */
1290 endpos = UTF_END(u);
1293 while (utf_ptr < endpos) {
1294 /* read next unicode character */
1296 u2 c = utf_nextu2(&utf_ptr);
1298 if ((c >= 32) && (c <= 127))
1308 /* utf_display_printable_ascii_classname ***************************************
1310 Write utf symbol to stdout with `/' converted to `.' (for debugging
1312 Non-printable and non-ASCII characters are printed as '?'.
1314 *******************************************************************************/
1316 void utf_display_printable_ascii_classname(utf *u)
1318 char *endpos; /* points behind utf string */
1319 char *utf_ptr; /* current position in utf text */
1327 endpos = UTF_END(u);
1330 while (utf_ptr < endpos) {
1331 /* read next unicode character */
1333 u2 c = utf_nextu2(&utf_ptr);
1338 if ((c >= 32) && (c <= 127))
1348 /* utf_sprint_convert_to_latin1 ************************************************
1350 Write utf symbol into c-string (for debugging purposes).
1351 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1354 *******************************************************************************/
1356 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1358 char *endpos; /* points behind utf string */
1359 char *utf_ptr; /* current position in utf text */
1360 u2 pos = 0; /* position in c-string */
1363 strcpy(buffer, "NULL");
1367 endpos = UTF_END(u);
1370 while (utf_ptr < endpos)
1371 /* copy next unicode character */
1372 buffer[pos++] = utf_nextu2(&utf_ptr);
1374 /* terminate string */
1379 /* utf_sprint_convert_to_latin1_classname **************************************
1381 Write utf symbol into c-string with `/' converted to `.' (for debugging
1383 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1386 *******************************************************************************/
1388 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1390 char *endpos; /* points behind utf string */
1391 char *utf_ptr; /* current position in utf text */
1392 u2 pos = 0; /* position in c-string */
1395 strcpy(buffer, "NULL");
1399 endpos = UTF_END(u);
1402 while (utf_ptr < endpos) {
1403 /* copy next unicode character */
1404 u2 c = utf_nextu2(&utf_ptr);
1405 if (c == '/') c = '.';
1409 /* terminate string */
1414 /* utf_strcat_convert_to_latin1 ************************************************
1416 Like libc strcat, but uses a utf8 string.
1417 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1420 *******************************************************************************/
1422 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1424 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1428 /* utf_strcat_convert_to_latin1_classname **************************************
1430 Like libc strcat, but uses an utf8 string.
1431 Characters are converted to 8-bit Latin-1; non-Latin-1 characters yield invalid results.
1434 *******************************************************************************/
/* Appends `u` to the C string already held in `buffer`, using the classname
   variant of the conversion (which maps '/' to '.').
   `buffer` must be NUL-terminated on entry, because strlen is used to find
   the append position; no size is passed, so capacity is the caller's job.
   NOTE(review): the enclosing braces are not visible in this listing. */
1436 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
/* delegate: write the converted text starting at the current terminator */
1438 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1442 /* utf_fprint_printable_ascii **************************************************
1444 Write utf symbol into file.
1445 Non-printable and non-ASCII characters are printed as '?'.
1447 *******************************************************************************/
/* Prints the characters of `u` to `file`, one fprintf call per character.
   NOTE(review): this listing shows only part of the function -- the braces,
   the initialisation of utf_ptr and any handling of a NULL `u` are not
   visible here. */
1449 void utf_fprint_printable_ascii(FILE *file, utf *u)
1451 char *endpos; /* points behind utf string */
1452 char *utf_ptr; /* current position in utf text */
1457 endpos = UTF_END(u);
1460 while (utf_ptr < endpos) {
1461 /* read next unicode character */
1462 u2 c = utf_nextu2(&utf_ptr);
/* characters in 32..127 are written unchanged, everything else as '?'.
   NOTE(review): the upper bound admits 127 (DEL), which is not a printable
   character although the header promises '?' for non-printables -- confirm
   whether `c < 127` was intended. */
1464 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1465 else fprintf(file, "?");
1470 /* utf_fprint_printable_ascii_classname ****************************************
1472 Write utf symbol into file with `/' converted to `.'.
1473 Non-printable and non-ASCII characters are printed as '?'.
1475 *******************************************************************************/
/* Prints the characters of `u` to `file` with every '/' shown as '.', one
   fprintf call per character.
   NOTE(review): this listing shows only part of the function -- the braces,
   the initialisation of utf_ptr and any handling of a NULL `u` are not
   visible here. */
1477 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1479 char *endpos; /* points behind utf string */
1480 char *utf_ptr; /* current position in utf text */
1485 endpos = UTF_END(u);
1488 while (utf_ptr < endpos) {
1489 /* read next unicode character */
1490 u2 c = utf_nextu2(&utf_ptr);
/* the separator is replaced before the printable-range test below */
1491 if (c == '/') c = '.';
/* characters in 32..127 are written unchanged, everything else as '?'.
   NOTE(review): the upper bound admits 127 (DEL), which is not a printable
   character although the header promises '?' for non-printables -- confirm
   whether `c < 127` was intended. */
1493 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1494 else fprintf(file, "?");
/* is_valid_utf ****************************************************************

   Return true if the given byte range is a valid UTF-8 string in the
   flavour accepted for Java class files.

   utf_ptr...points to first character
   end_pos...points after last character

   Rules enforced:
     - end_pos must not lie before utf_ptr (an empty range is valid)
     - a raw 0x00 byte is rejected
     - sequences with more than two continuation bytes are rejected
     - every continuation byte must have the form 10xx xxxx
     - a sequence is never allowed to run past end_pos
     - the code point zero must be encoded with exactly two bytes

   Overlong encodings (other than of zero), surrogates and the code
   points 0xfffe/0xffff are deliberately accepted.

*******************************************************************************/

/* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	long          bytes;                /* bytes not yet consumed             */
	int           len;                  /* continuation bytes still expected  */
	int           i;
	unsigned char c;                    /* current byte, sign-independent     */
	unsigned long v;                    /* decoded code point                 */

	if (end_pos < utf_ptr) return false;
	bytes = end_pos - utf_ptr;

	while (bytes--) {
		c = (unsigned char) *utf_ptr++;

		if (!c) return false;                     /* 0x00 is not allowed */
		if ((c & 0x80) == 0) continue;            /* ASCII */

		if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
		else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
		else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
		else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
		else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
		else return false;                        /* invalid leading byte */

		if (len > 2) return false;                /* Java limitation */

		/* payload bits of the leading byte: 5 bits for len 1, 4 for len 2 */
		v = (unsigned long)c & (0x3f >> len);

		if ((bytes -= len) < 0) return false;     /* missing bytes */

		for (i = len; i--; ) {
			c = (unsigned char) *utf_ptr++;

			if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
				return false;

			v = (v << 6) | (c & 0x3f);
		}

		/* zero may only appear in its two-byte form 0xc0 0x80 */
		if (v == 0 && len != 1) return false;     /* Java special */

		/* Sun Java seems to allow overlong UTF-8 encodings, so there is */
		/* no check of v against min_codepoint[len] here.                */

		/* surrogates in UTF-8 seem to be allowed in Java classfiles */
		/* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */

		/* even these seem to be allowed */
		/* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
	}

	return true;
}
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the two-byte encoding of zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

   Only bytes inside [utf_ptr, end_pos) are examined; in particular the
   look-ahead for an encoded zero never reads the byte at end_pos.

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr) return false; /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20) return false; /* disallow control characters */

		/* utf_ptr already points at the following byte; make sure it is */
		/* still inside the range before looking at it                   */
		if (c == 0xc0 && utf_ptr < end_pos
			&& (unsigned char) *utf_ptr == 0x80)  /* disallow zero */
			return false;
	}

	return true;
}
/* Convenience wrapper: runs is_valid_name over the bytes of `u`, from
   u->text up to UTF_END(u), and returns its result.
   `u` is dereferenced without a NULL check.
   NOTE(review): the enclosing braces are not visible in this listing. */
1594 bool is_valid_name_utf(utf *u)
1596 return is_valid_name(u->text, UTF_END(u));
1600 /* utf_show ********************************************************************
1602 Writes the utf symbols in the utfhash to stdout and displays the
1603 number of external hash chains grouped according to the chainlength
1604 (for debugging purposes).
1606 *******************************************************************************/
1608 #if !defined(NDEBUG)
/* Debug-only dump of the utf hashtable followed by chain-length statistics.
   NOTE(review): this listing shows only part of the function -- its
   signature, the braces, the declaration of `i`, the loops that walk each
   hash chain and a few single statements are not visible here. */
1612 #define CHAIN_LIMIT 20 /* limit for separate enumeration */
1614 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1615 u4 max_chainlength = 0; /* maximum length of the chains */
1616 u4 sum_chainlength = 0; /* sum of the chainlengths */
1617 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1620 printf("UTF-HASH:\n");
1622 /* show element of utf-hashtable */
1624 for (i = 0; i < hashtable_utf->size; i++) {
1625 utf *u = hashtable_utf->ptr[i];
1628 printf("SLOT %d: ", (int) i);
1632 utf_display_printable_ascii(u);
1640 printf("UTF-HASH: %d slots for %d entries\n",
1641 (int) hashtable_utf->size, (int) hashtable_utf->entries );
/* the statistics below divide by the entry count, so an empty table is
   singled out first (the statement controlled by this test is not shown) */
1643 if (hashtable_utf->entries == 0)
1646 printf("chains:\n chainlength number of chains %% of utfstrings\n");
/* loop over all histogram buckets; the loop body is not shown */
1648 for (i=0;i<CHAIN_LIMIT;i++)
1651 /* count numbers of hashchains according to their length */
1652 for (i=0; i<hashtable_utf->size; i++) {
1654 utf *u = (utf*) hashtable_utf->ptr[i];
1655 u4 chain_length = 0;
1657 /* determine chainlength */
1663 /* update sum of all chainlengths */
1664 sum_chainlength+=chain_length;
1666 /* determine the maximum length of the chains */
1667 if (chain_length>max_chainlength)
1668 max_chainlength = chain_length;
1670 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
/* NOTE(review): the test is >=CHAIN_LIMIT, not >=CHAIN_LIMIT-1 as the
   comments say.  Chains of exactly CHAIN_LIMIT-1 elements therefore land in
   the last bucket without their symbols being added to beyond_limit, so the
   percentage printed for the ">=" row leaves them out -- confirm intent. */
1671 if (chain_length>=CHAIN_LIMIT) {
1672 beyond_limit+=chain_length;
/* clamp so that all long chains share the last bucket */
1673 chain_length=CHAIN_LIMIT-1;
1676 /* update number of hashchains of current length */
1677 chain_count[chain_length]++;
1680 /* display results */
/* buckets 1..CHAIN_LIMIT-2 get one row each; the last bucket is printed
   separately below as the ">=" row.
   NOTE(review): i and chain_count[] are u4 but are printed with %d --
   presumably u4 is an unsigned 32-bit type; verify the format specifiers. */
1681 for (i=1;i<CHAIN_LIMIT-1;i++)
1682 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1684 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1687 printf("max. chainlength:%5d\n",max_chainlength);
1689 /* avg. chainlength = sum of chainlengths / number of chains */
/* chain_count[0] is the number of empty slots, so the divisor is the
   number of non-empty chains */
1690 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1692 #endif /* !defined(NDEBUG) */
1696 * These are local overrides for various environment variables in Emacs.
1697 * Please do not remove this and leave it at the end of the file, where
1698 * Emacs will automagically detect them.
1699 * ---------------------------------------------------------------------
1702 * indent-tabs-mode: t
1706 * vim:noexpandtab:sw=4:ts=4: