1 /* src/vm/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4 C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5 E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6 J. Wenninger, Institut f. Computersprachen - TU Wien
8 This file is part of CACAO.
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License as
12 published by the Free Software Foundation; either version 2, or (at
13 your option) any later version.
15 This program is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25 Contact: cacao@cacaojvm.org
27 Authors: Reinhard Grafl
34 $Id: utf8.c 5920 2006-11-05 21:23:09Z twisti $
46 #include "mm/memory.h"
48 #if defined(ENABLE_THREADS)
49 # include "threads/native/lock.h"
51 # include "threads/none/lock.h"
54 #include "vm/builtin.h"
55 #include "vm/exceptions.h"
56 #include "vm/hashtable.h"
57 #include "vm/options.h"
58 #include "vm/statistics.h"
59 #include "vm/stringlocal.h"
63 /* global variables ***********************************************************/
65 /* hashsize must be power of 2 */
67 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
69 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
72 /* utf-symbols for pointer comparison of frequently used strings **************/
74 utf *utf_java_lang_Object;
76 utf *utf_java_lang_Class;
77 utf *utf_java_lang_ClassLoader;
78 utf *utf_java_lang_Cloneable;
79 utf *utf_java_lang_SecurityManager;
80 utf *utf_java_lang_String;
81 utf *utf_java_lang_System;
82 utf *utf_java_lang_ThreadGroup;
83 utf *utf_java_io_Serializable;
85 utf *utf_java_lang_Throwable;
86 utf *utf_java_lang_VMThrowable;
87 utf *utf_java_lang_Error;
88 utf *utf_java_lang_AbstractMethodError;
89 utf *utf_java_lang_LinkageError;
90 utf *utf_java_lang_NoClassDefFoundError;
91 utf *utf_java_lang_NoSuchMethodError;
92 utf *utf_java_lang_OutOfMemoryError;
94 utf *utf_java_lang_Exception;
95 utf *utf_java_lang_ClassCastException;
96 utf *utf_java_lang_ClassNotFoundException;
97 utf *utf_java_lang_IllegalArgumentException;
98 utf *utf_java_lang_IllegalMonitorStateException;
100 utf *utf_java_lang_NullPointerException;
102 utf* utf_java_lang_Void;
103 utf* utf_java_lang_Boolean;
104 utf* utf_java_lang_Byte;
105 utf* utf_java_lang_Character;
106 utf* utf_java_lang_Short;
107 utf* utf_java_lang_Integer;
108 utf* utf_java_lang_Long;
109 utf* utf_java_lang_Float;
110 utf* utf_java_lang_Double;
112 utf *utf_java_lang_StackTraceElement;
113 utf *utf_java_lang_reflect_Constructor;
114 utf *utf_java_lang_reflect_Field;
115 utf *utf_java_lang_reflect_Method;
116 utf *utf_java_util_Vector;
118 utf *utf_InnerClasses; /* InnerClasses */
119 utf *utf_ConstantValue; /* ConstantValue */
120 utf *utf_Code; /* Code */
121 utf *utf_Exceptions; /* Exceptions */
122 utf *utf_LineNumberTable; /* LineNumberTable */
123 utf *utf_SourceFile; /* SourceFile */
126 utf *utf_init; /* <init> */
127 utf *utf_clinit; /* <clinit> */
128 utf *utf_clone; /* clone */
129 utf *utf_finalize; /* finalize */
130 utf *utf_run; /* run */
134 utf *utf_removeThread;
139 utf *utf_fillInStackTrace;
140 utf *utf_getSystemClassLoader;
142 utf *utf_printStackTrace;
153 utf *utf_void__void; /* ()V */
154 utf *utf_boolean__void; /* (Z)V */
155 utf *utf_byte__void; /* (B)V */
156 utf *utf_char__void; /* (C)V */
157 utf *utf_short__void; /* (S)V */
158 utf *utf_int__void; /* (I)V */
159 utf *utf_long__void; /* (J)V */
160 utf *utf_float__void; /* (F)V */
161 utf *utf_double__void; /* (D)V */
163 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
164 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
165 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
166 utf *utf_java_lang_Object__java_lang_Object;
167 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
168 utf *utf_java_lang_String__java_lang_Class;
169 utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */
170 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
172 utf *utf_not_named_yet; /* special name for unnamed classes */
174 utf *array_packagename;
177 /* utf_init ********************************************************************
179 Initializes the utf8 subsystem.
181 *******************************************************************************/
185 /* create utf8 hashtable */
187 hashtable_utf = NEW(hashtable);
189 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
191 #if defined(ENABLE_STATISTICS)
193 count_utf_len += sizeof(utf*) * hashtable_utf->size;
196 /* create utf-symbols for pointer comparison of frequently used strings */
198 utf_java_lang_Object = utf_new_char("java/lang/Object");
200 utf_java_lang_Class = utf_new_char("java/lang/Class");
201 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
202 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
203 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
204 utf_java_lang_String = utf_new_char("java/lang/String");
205 utf_java_lang_System = utf_new_char("java/lang/System");
206 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
207 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
209 utf_java_lang_Throwable = utf_new_char(string_java_lang_Throwable);
210 utf_java_lang_VMThrowable = utf_new_char(string_java_lang_VMThrowable);
211 utf_java_lang_Error = utf_new_char(string_java_lang_Error);
213 utf_java_lang_AbstractMethodError =
214 utf_new_char(string_java_lang_AbstractMethodError);
216 utf_java_lang_LinkageError =
217 utf_new_char(string_java_lang_LinkageError);
219 utf_java_lang_NoClassDefFoundError =
220 utf_new_char(string_java_lang_NoClassDefFoundError);
222 utf_java_lang_NoSuchMethodError =
223 utf_new_char(string_java_lang_NoSuchMethodError);
225 utf_java_lang_OutOfMemoryError =
226 utf_new_char(string_java_lang_OutOfMemoryError);
228 utf_java_lang_Exception = utf_new_char(string_java_lang_Exception);
230 utf_java_lang_ClassCastException =
231 utf_new_char(string_java_lang_ClassCastException);
233 utf_java_lang_ClassNotFoundException =
234 utf_new_char(string_java_lang_ClassNotFoundException);
236 utf_java_lang_IllegalArgumentException =
237 utf_new_char(string_java_lang_IllegalArgumentException);
239 utf_java_lang_IllegalMonitorStateException =
240 utf_new_char(string_java_lang_IllegalMonitorStateException);
242 utf_java_lang_NullPointerException =
243 utf_new_char(string_java_lang_NullPointerException);
245 utf_java_lang_Void = utf_new_char("java/lang/Void");
246 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
247 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
248 utf_java_lang_Character = utf_new_char("java/lang/Character");
249 utf_java_lang_Short = utf_new_char("java/lang/Short");
250 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
251 utf_java_lang_Long = utf_new_char("java/lang/Long");
252 utf_java_lang_Float = utf_new_char("java/lang/Float");
253 utf_java_lang_Double = utf_new_char("java/lang/Double");
255 utf_java_lang_StackTraceElement =
256 utf_new_char("java/lang/StackTraceElement");
258 utf_java_lang_reflect_Constructor =
259 utf_new_char("java/lang/reflect/Constructor");
261 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
262 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
263 utf_java_util_Vector = utf_new_char("java/util/Vector");
265 utf_InnerClasses = utf_new_char("InnerClasses");
266 utf_ConstantValue = utf_new_char("ConstantValue");
267 utf_Code = utf_new_char("Code");
268 utf_Exceptions = utf_new_char("Exceptions");
269 utf_LineNumberTable = utf_new_char("LineNumberTable");
270 utf_SourceFile = utf_new_char("SourceFile");
271 utf_Signature = utf_new_char("Signature");
273 utf_init = utf_new_char("<init>");
274 utf_clinit = utf_new_char("<clinit>");
275 utf_clone = utf_new_char("clone");
276 utf_finalize = utf_new_char("finalize");
277 utf_run = utf_new_char("run");
279 utf_add = utf_new_char("add");
280 utf_remove = utf_new_char("remove");
281 utf_removeThread = utf_new_char("removeThread");
282 utf_put = utf_new_char("put");
283 utf_get = utf_new_char("get");
284 utf_value = utf_new_char("value");
286 utf_printStackTrace = utf_new_char("printStackTrace");
287 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
288 utf_loadClass = utf_new_char("loadClass");
289 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
291 utf_Z = utf_new_char("Z");
292 utf_B = utf_new_char("B");
293 utf_C = utf_new_char("C");
294 utf_S = utf_new_char("S");
295 utf_I = utf_new_char("I");
296 utf_J = utf_new_char("J");
297 utf_F = utf_new_char("F");
298 utf_D = utf_new_char("D");
300 utf_void__void = utf_new_char("()V");
301 utf_boolean__void = utf_new_char("(Z)V");
302 utf_byte__void = utf_new_char("(B)V");
303 utf_char__void = utf_new_char("(C)V");
304 utf_short__void = utf_new_char("(S)V");
305 utf_int__void = utf_new_char("(I)V");
306 utf_long__void = utf_new_char("(J)V");
307 utf_float__void = utf_new_char("(F)V");
308 utf_double__void = utf_new_char("(D)V");
309 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
310 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
312 utf_void__java_lang_ClassLoader =
313 utf_new_char("()Ljava/lang/ClassLoader;");
315 utf_java_lang_Object__java_lang_Object =
316 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
318 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
320 utf_java_lang_String__java_lang_Class =
321 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
323 utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V");
324 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
326 utf_null = utf_new_char("null");
327 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
328 array_packagename = utf_new_char("\t<the array package>");
330 /* everything's ok */
336 /* utf_hashkey *****************************************************************
338 The hashkey is computed from the utf-text by using up to 8
339 characters. For utf-symbols longer than 15 characters 3 characters
340 are taken from the beginning and the end, 2 characters are taken
343 *******************************************************************************/
/* Byte-fetch helpers for utf_hashkey. Both expand to an expression that
   uses a pointer variable named `text' from the enclosing scope.

   nbs(val): pre-increments `text', then yields the byte it now points at,
             converted to u4 and shifted left by val. Every use therefore
             modifies `text'.
   fbs(val): yields the byte `text' currently points at, converted to u4
             and shifted left by val; `text' is left untouched.

   val is parenthesized in the expansion so that an arbitrary expression
   (not just an integer literal) can be passed as the shift count. */

#define nbs(val) ((u4) *(++text) << (val)) /* get next byte, left shift by val */
#define fbs(val) ((u4) *(text) << (val)) /* get first byte, left shift by val */
348 u4 utf_hashkey(const char *text, u4 length)
350 const char *start_pos = text; /* pointer to utf text */
354 case 0: /* empty string */
357 case 1: return fbs(0);
358 case 2: return fbs(0) ^ nbs(3);
359 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
360 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
361 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
362 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
363 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
364 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
371 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
380 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
389 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
401 return a ^ nbs(9) ^ nbs(10);
413 return a ^ nbs(9) ^ nbs(10);
424 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
435 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
437 default: /* 3 characters from beginning */
443 /* 2 characters from middle */
444 text = start_pos + (length / 2);
449 /* 3 characters from end */
450 text = start_pos + length - 4;
455 return a ^ nbs(10) ^ nbs(11);
459 /* utf_full_hashkey ************************************************************
461 This function computes a hash value using all bytes in the string.
463 The algorithm is the "One-at-a-time" algorithm as published
464 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
466 *******************************************************************************/
468 u4 utf_full_hashkey(const char *text, u4 length)
470 register const unsigned char *p = (const unsigned char *) text;
478 hash += (hash << 10);
482 hash ^= (hash >> 11);
483 hash += (hash << 15);
488 /* unicode_hashkey *************************************************************
490 Compute the hashkey of a unicode string.
492 *******************************************************************************/
494 u4 unicode_hashkey(u2 *text, u2 len) /* hash a u2 array by delegating to utf_hashkey */
496 return utf_hashkey((char *) text, len); /* NOTE(review): the u2 array is reinterpreted as a char array while len is forwarded unchanged as utf_hashkey's length -- confirm this is intended */
500 /* utf_new *********************************************************************
502 Creates a new utf-symbol, the text of the symbol is passed as a
503 u1-array. The function searches the utf-hashtable for a utf-symbol
504 with this text. On success the element is returned, otherwise a new
505 hashtable element is created.
507 If the number of entries in the hashtable exceeds twice the size of
508 the hashtable slots a reorganization of the hashtable is done and
509 the utf symbols are copied to a new hashtable with doubled size.
511 *******************************************************************************/
513 utf *utf_new(const char *text, u2 length) /* look up the utf symbol for the first `length' bytes of text in hashtable_utf, creating it if absent */
515 u4 key; /* hashkey computed from utf-text */
516 u4 slot; /* slot in hashtable */
517 utf *u; /* hashtable element */
520 LOCK_MONITOR_ENTER(hashtable_utf->header); /* paired with the LOCK_MONITOR_EXIT calls further down */
522 #if defined(ENABLE_STATISTICS)
527 key = utf_hashkey(text, length);
528 slot = key & (hashtable_utf->size - 1); /* masking selects a slot because the table size is a power of 2 */
529 u = hashtable_utf->ptr[slot];
531 /* search external hash chain for utf-symbol */
534 if (u->blength == length) { /* only candidates of equal byte length are compared bytewise */
535 /* compare text of hashtable elements */
537 for (i = 0; i < length; i++)
538 if (text[i] != u->text[i])
541 #if defined(ENABLE_STATISTICS)
543 count_utf_new_found++;
546 /* symbol found in hashtable */
548 LOCK_MONITOR_EXIT(hashtable_utf->header);
554 u = u->hashlink; /* next element in external chain */
557 #if defined(ENABLE_STATISTICS)
559 count_utf_len += sizeof(utf) + length + 1;
562 /* location in hashtable found, create new utf element */
564 u->blength = length; /* length in bytes of utfstring */
565 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
566 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
568 memcpy(u->text, text, length); /* copy utf-text */
569 u->text[length] = '\0'; /* stored text is always zero-terminated; the extra byte was allocated above */
571 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
572 hashtable_utf->entries++; /* update number of entries */
574 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
576 /* reorganization of hashtable, average length of the external
577 chains is approx. 2 */
579 hashtable *newhash; /* the new hashtable */
585 /* create new hashtable, double the size */
587 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
589 #if defined(ENABLE_STATISTICS)
591 count_utf_len += sizeof(utf*) * hashtable_utf->size;
594 /* transfer elements to new hashtable */
596 for (i = 0; i < hashtable_utf->size; i++) {
597 u = hashtable_utf->ptr[i];
601 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1); /* slot is recomputed against the new table size */
603 u->hashlink = (utf *) newhash->ptr[slot]; /* element becomes the new head of its chain in newhash */
604 newhash->ptr[slot] = u;
606 /* follow link in external hash chain */
612 /* dispose old table */
614 hashtable_free(hashtable_utf);
616 hashtable_utf = newhash;
619 LOCK_MONITOR_EXIT(hashtable_utf->header); /* NOTE(review): after a resize hashtable_utf refers to newhash, so this exits via the new table's header while the monitor was entered via the old one; presumably hashtable_resize carries the lock object over -- confirm */
625 /* utf_new_u2 ******************************************************************
627 Make utf symbol from u2 array, if isclassname is true '.' is
630 *******************************************************************************/
632 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
634 char *buffer; /* memory buffer for unicode characters */
635 char *pos; /* pointer to current position in buffer */
636 u4 left; /* unicode characters left */
637 u4 buflength; /* utf length in bytes of the u2 array */
638 utf *result; /* resulting utf-string */
641 /* determine utf length in bytes and allocate memory */
643 buflength = u2_utflength(unicode_pos, unicode_length);
644 buffer = MNEW(char, buflength);
649 for (i = 0; i++ < unicode_length; unicode_pos++) {
650 /* next unicode character */
653 if ((c != 0) && (c < 0x80)) {
656 if ((int) left < 0) break;
657 /* convert classname */
658 if (isclassname && c == '.')
663 } else if (c < 0x800) {
665 unsigned char high = c >> 6;
666 unsigned char low = c & 0x3F;
668 if ((int) left < 0) break;
669 *pos++ = high | 0xC0;
675 char mid = (c >> 6) & 0x3F;
678 if ((int) left < 0) break;
679 *pos++ = high | 0xE0;
685 /* insert utf-string into symbol-table */
686 result = utf_new(buffer,buflength);
688 MFREE(buffer, char, buflength);
694 /* utf_new_char ****************************************************************
696 Creates a new utf symbol, the text for this symbol is passed as a
697 c-string ( = char* ).
699 *******************************************************************************/
701 utf *utf_new_char(const char *text) /* look up or create the utf symbol for a zero-terminated C string */
703 return utf_new(text, strlen(text)); /* NOTE(review): utf_new takes a u2 length, so the strlen result is narrowed to 16 bits -- confirm callers never pass longer strings */
707 /* utf_new_char_classname ******************************************************
709 Creates a new utf symbol, the text for this symbol is passed as a
710 c-string ( = char* ) "." characters are going to be replaced by
711 "/". Since the above function is used often, this is a separate
712 function, instead of an if.
714 *******************************************************************************/
716 utf *utf_new_char_classname(const char *text) /* like utf_new_char, but '.' characters in text are replaced by '/' before the lookup */
718 if (strchr(text, '.')) { /* a modifiable copy is only made when text actually contains a '.' */
719 char *txt = strdup(text); /* NOTE(review): the result is used on the next line without a NULL check */
720 char *end = txt + strlen(txt);
724 for (c = txt; c < end; c++)
725 if (*c == '.') *c = '/';
727 tmpRes = utf_new(txt, strlen(txt)); /* NOTE(review): txt comes from strdup -- confirm it is released after this call */
733 return utf_new(text, strlen(text)); /* text is passed through unchanged here */
737 /* utf_nextu2 ******************************************************************
739 Read the next unicode character from the utf string and increment
740 the utf-string pointer accordingly.
742 CAUTION: This function is unsafe for input that was not checked
745 *******************************************************************************/
747 u2 utf_nextu2(char **utf_ptr)
749 /* uncompressed unicode character */
751 /* current position in utf text */
752 unsigned char *utf = (unsigned char *) (*utf_ptr);
753 /* bytes representing the unicode character */
754 unsigned char ch1, ch2, ch3;
755 /* number of bytes used to represent the unicode character */
758 switch ((ch1 = utf[0]) >> 4) {
759 default: /* 1 byte */
763 case 0xD: /* 2 bytes */
764 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
765 unsigned char high = ch1 & 0x1F;
766 unsigned char low = ch2 & 0x3F;
767 unicode_char = (high << 6) + low;
772 case 0xE: /* 2 or 3 bytes */
773 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
774 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
775 unsigned char low = ch3 & 0x3f;
776 unsigned char mid = ch2 & 0x3f;
777 unsigned char high = ch1 & 0x0f;
778 unicode_char = (((high << 6) + mid) << 6) + low;
786 /* update position in utf-text */
787 *utf_ptr = (char *) (utf + len);
793 /* utf_bytes *******************************************************************
795 Determine number of bytes (aka. octets) in the utf string.
798 u............utf string
801 The number of octets of this utf string.
802 There is _no_ terminating zero included in this count.
804 *******************************************************************************/
811 /* utf_get_number_of_u2s_for_buffer ********************************************
813 Determine number of UTF-16 u2s in the given UTF-8 buffer
815 CAUTION: This function is unsafe for input that was not checked
818 CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
819 to an array of u2s (UTF-16) and want to know how many of them you will get.
820 All other uses of this function are probably wrong.
823 buffer........points to first char in buffer
824 blength.......number of _bytes_ in the buffer
827 the number of u2s needed to hold this string in UTF-16 encoding.
828 There is _no_ terminating zero included in this count.
830 NOTE: Unlike utf_get_number_of_u2s, this function never throws an
833 *******************************************************************************/
835 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
837 const char *endpos; /* points behind utf string */
838 const char *utf_ptr; /* current position in utf text */
839 u4 len = 0; /* number of unicode characters */
842 endpos = utf_ptr + blength;
844 while (utf_ptr < endpos) {
846 /* next unicode character */
847 utf_nextu2((char **)&utf_ptr);
850 assert(utf_ptr == endpos);
856 /* utf_get_number_of_u2s *******************************************************
858 Determine number of UTF-16 u2s in the utf string.
860 CAUTION: This function is unsafe for input that was not checked
863 CAUTION: Use this function *only* when you want to convert a utf string
864 to an array of u2s and want to know how many of them you will get.
865 All other uses of this function are probably wrong.
868 u............utf string
871 the number of u2s needed to hold this string in UTF-16 encoding.
872 There is _no_ terminating zero included in this count.
873 XXX 0 if a NullPointerException has been thrown (see below)
875 *******************************************************************************/
877 u4 utf_get_number_of_u2s(utf *u)
879 char *endpos; /* points behind utf string */
880 char *utf_ptr; /* current position in utf text */
881 u4 len = 0; /* number of unicode characters */
883 /* XXX this is probably not checked by most callers! Review this after */
884 /* the invalid uses of this function have been eliminated */
886 exceptions_throw_nullpointerexception();
893 while (utf_ptr < endpos) {
895 /* next unicode character */
896 utf_nextu2(&utf_ptr);
899 if (utf_ptr != endpos)
900 /* string ended abruptly */
901 throw_cacao_exception_exit(string_java_lang_InternalError,
902 "Illegal utf8 string");
908 /* utf8_safe_number_of_u2s *****************************************************
910 Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
911 (For invalid UTF-8 the U+fffd replacement character will be counted.)
913 This function is safe even for invalid UTF-8 strings.
916 text..........zero-terminated(!) UTF-8 string (may be invalid)
918 nbytes........strlen(text). (This is needed to completely emulate
922 the number of u2s needed to hold this string in UTF-16 encoding.
923 There is _no_ terminating zero included in this count.
925 *******************************************************************************/
927 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
928 register const unsigned char *t;
931 register const unsigned char *tlimit;
942 t = (const unsigned char *) text;
945 /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
951 /* highest bit set, non-ASCII character */
953 if ((byte & 0xe0) == 0xc0) {
954 /* 2-byte: should be 110..... 10...... ? */
956 if ((*t++ & 0xc0) == 0x80)
961 else if ((byte & 0xf0) == 0xe0) {
962 /* 3-byte: should be 1110.... 10...... 10...... */
966 return len + 1; /* invalid, stop here */
968 if ((*t++ & 0xc0) == 0x80) {
969 if ((*t++ & 0xc0) == 0x80)
977 else if ((byte & 0xf8) == 0xf0) {
978 /* 4-byte: should be 11110... 10...... 10...... 10...... */
982 return len + 1; /* invalid, stop here */
984 if (((byte1 = *t++) & 0xc0) == 0x80) {
985 if (((byte2 = *t++) & 0xc0) == 0x80) {
986 if (((byte3 = *t++) & 0xc0) == 0x80) {
987 /* valid 4-byte UTF-8? */
988 value = ((byte & 0x07) << 18)
989 | ((byte1 & 0x3f) << 12)
990 | ((byte2 & 0x3f) << 6)
993 if (value > 0x10FFFF)
995 else if (value > 0xFFFF)
996 len += 1; /* we need surrogates */
998 ; /* 16bit suffice */
1009 else if ((byte & 0xfc) == 0xf8) {
1010 /* invalid 5-byte */
1012 return len + 1; /* invalid, stop here */
1015 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1018 else if ((byte & 0xfe) == 0xfc) {
1019 /* invalid 6-byte */
1021 return len + 1; /* invalid, stop here */
1024 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1036 /* ASCII character, common case */
1046 /* utf8_safe_convert_to_u2s ****************************************************
1048 Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1049 (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1050 Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1052 This function is safe even for invalid UTF-8 strings.
1055 text..........zero-terminated(!) UTF-8 string (may be invalid)
1057 nbytes........strlen(text). (This is needed to completely emulate
1059 buffer........a preallocated array of u2s to receive the decoded
1060 string. Use utf8_safe_number_of_u2s to get the
1061 required number of u2s for allocating this.
1063 *******************************************************************************/
1065 #define UNICODE_REPLACEMENT 0xfffd
1067 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1068 register const unsigned char *t;
1070 register const unsigned char *tlimit;
1078 assert(nbytes >= 0);
1080 t = (const unsigned char *) text;
1081 tlimit = t + nbytes;
1083 /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1089 /* highest bit set, non-ASCII character */
1091 if ((byte & 0xe0) == 0xc0) {
1092 /* 2-byte: should be 110..... 10...... */
1094 if (((byte1 = *t++) & 0xc0) == 0x80) {
1095 /* valid 2-byte UTF-8 */
1096 *buffer++ = ((byte & 0x1f) << 6)
1097 | ((byte1 & 0x3f) );
1100 *buffer++ = UNICODE_REPLACEMENT;
1104 else if ((byte & 0xf0) == 0xe0) {
1105 /* 3-byte: should be 1110.... 10...... 10...... */
1107 if (t + 2 > tlimit) {
1108 *buffer++ = UNICODE_REPLACEMENT;
1112 if (((byte1 = *t++) & 0xc0) == 0x80) {
1113 if (((byte2 = *t++) & 0xc0) == 0x80) {
1114 /* valid 3-byte UTF-8 */
1115 *buffer++ = ((byte & 0x0f) << 12)
1116 | ((byte1 & 0x3f) << 6)
1117 | ((byte2 & 0x3f) );
1120 *buffer++ = UNICODE_REPLACEMENT;
1125 *buffer++ = UNICODE_REPLACEMENT;
1129 else if ((byte & 0xf8) == 0xf0) {
1130 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1132 if (t + 3 > tlimit) {
1133 *buffer++ = UNICODE_REPLACEMENT;
1137 if (((byte1 = *t++) & 0xc0) == 0x80) {
1138 if (((byte2 = *t++) & 0xc0) == 0x80) {
1139 if (((byte3 = *t++) & 0xc0) == 0x80) {
1140 /* valid 4-byte UTF-8? */
1141 value = ((byte & 0x07) << 18)
1142 | ((byte1 & 0x3f) << 12)
1143 | ((byte2 & 0x3f) << 6)
1144 | ((byte3 & 0x3f) );
1146 if (value > 0x10FFFF) {
1147 *buffer++ = UNICODE_REPLACEMENT;
1149 else if (value > 0xFFFF) {
1150 /* we need surrogates */
1151 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1152 *buffer++ = 0xdc00 | (value & 0x03ff);
1155 *buffer++ = value; /* 16bit suffice */
1158 *buffer++ = UNICODE_REPLACEMENT;
1163 *buffer++ = UNICODE_REPLACEMENT;
1168 *buffer++ = UNICODE_REPLACEMENT;
1172 else if ((byte & 0xfc) == 0xf8) {
1173 if (t + 4 > tlimit) {
1174 *buffer++ = UNICODE_REPLACEMENT;
1179 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1181 *buffer++ = UNICODE_REPLACEMENT;
1183 else if ((byte & 0xfe) == 0xfc) {
1184 if (t + 5 > tlimit) {
1185 *buffer++ = UNICODE_REPLACEMENT;
1190 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1192 *buffer++ = UNICODE_REPLACEMENT;
1195 *buffer++ = UNICODE_REPLACEMENT;
1203 /* ASCII character, common case */
1211 /* u2_utflength ****************************************************************
1213 Returns the utf length in bytes of a u2 array.
1215 *******************************************************************************/
1217 u4 u2_utflength(u2 *text, u4 u2_length)
1219 u4 result_len = 0; /* utf length in bytes */
1220 u2 ch; /* current unicode character */
1223 for (len = 0; len < u2_length; len++) {
1224 /* next unicode character */
1227 /* determine bytes required to store unicode character as utf */
1228 if (ch && (ch < 0x80))
1230 else if (ch < 0x800)
1240 /* utf_copy ********************************************************************
1242 Copy the given utf string byte-for-byte to a buffer.
1245 buffer.......the buffer
1246 u............the utf string
1248 *******************************************************************************/
1250 void utf_copy(char *buffer, utf *u) /* copy the text of u into buffer */
1252 /* our utf strings are zero-terminated (done by utf_new) */
1253 MCOPY(buffer, u->text, char, u->blength + 1); /* blength + 1 so the terminating zero is copied too (assuming MCOPY's last argument is an element count -- TODO confirm); no buffer size is passed in, so the caller must provide enough room */
1257 /* utf_cat *********************************************************************
1259 Append the given utf string byte-for-byte to a buffer.
1262 buffer.......the buffer
1263 u............the utf string
1265 *******************************************************************************/
1267 void utf_cat(char *buffer, utf *u) /* append the text of u to the C string already held in buffer */
1269 /* our utf strings are zero-terminated (done by utf_new) */
1270 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1); /* the append position is found with strlen, so buffer must already be zero-terminated; no buffer size is passed in, so the caller must provide enough room */
1274 /* utf_copy_classname **********************************************************
1276 Copy the given utf classname byte-for-byte to a buffer.
1277 '/' is replaced by '.'
1280 buffer.......the buffer
1281 u............the utf string
1283 *******************************************************************************/
1285 void utf_copy_classname(char *buffer, utf *u)
1294 endptr = UTF_END(u) + 1; /* utfs are zero-terminated by utf_new; the + 1 extends the range to take in that terminator */
1296 while (srcptr != endptr) {
1305 /* utf_cat_classname ***********************************************************
1307 Append the given utf classname byte-for-byte to a buffer.
1308 '/' is replaced by '.'
1311 buffer.......the buffer
1312 u............the utf string
1314 *******************************************************************************/
1316 void utf_cat_classname(char *buffer, utf *u) /* append u to buffer by running utf_copy_classname at the current end of buffer */
1318 utf_copy_classname(buffer + strlen(buffer), u); /* the end is found with strlen, so buffer must already hold a zero-terminated string */
1321 /* utf_display_printable_ascii *************************************************
1323 Write utf symbol to stdout (for debugging purposes).
1324 Non-printable and non-ASCII characters are printed as '?'.
1326 *******************************************************************************/
1328 void utf_display_printable_ascii(utf *u)
1330 char *endpos; /* points behind utf string */
1331 char *utf_ptr; /* current position in utf text */
1339 endpos = UTF_END(u);
1342 while (utf_ptr < endpos) {
1343 /* read next unicode character */
1345 u2 c = utf_nextu2(&utf_ptr);
1347 if ((c >= 32) && (c <= 127))
1357 /* utf_display_printable_ascii_classname ***************************************
1359 Write utf symbol to stdout with `/' converted to `.' (for debugging
1361 Non-printable and non-ASCII characters are printed as '?'.
1363 *******************************************************************************/
1365 void utf_display_printable_ascii_classname(utf *u)
1367 char *endpos; /* points behind utf string */
1368 char *utf_ptr; /* current position in utf text */
1376 endpos = UTF_END(u);
1379 while (utf_ptr < endpos) {
1380 /* read next unicode character */
1382 u2 c = utf_nextu2(&utf_ptr);
1387 if ((c >= 32) && (c <= 127))
1397 /* utf_sprint_convert_to_latin1 ************************************************
1399 Write utf symbol into c-string (for debugging purposes).
1400 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1403 *******************************************************************************/
1405 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1407 char *endpos; /* points behind utf string */
1408 char *utf_ptr; /* current position in utf text */
1409 u2 pos = 0; /* position in c-string */
1412 strcpy(buffer, "NULL");
1416 endpos = UTF_END(u);
1419 while (utf_ptr < endpos)
1420 /* copy next unicode character */
1421 buffer[pos++] = utf_nextu2(&utf_ptr); /* the u2 result is narrowed to char on assignment, so only values that fit in 8 bits are stored unchanged; no buffer size is passed in, so nothing here bounds pos */
1423 /* terminate string */
1423 /* terminate string */
1428 /* utf_sprint_convert_to_latin1_classname **************************************
1430 Write utf symbol into c-string with `/' converted to `.' (for debugging
1432 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1435 *******************************************************************************/
/* buffer...destination c-string; no capacity is passed in, so the caller
               must provide enough room (including the terminator)
   u........utf symbol to convert */
1437 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1439 char *endpos; /* points behind utf string */
1440 char *utf_ptr; /* current position in utf text */
/* NOTE(review): pos is a u2; if u2 is a 16-bit type the index would wrap
   for very long symbols -- confirm. */
1441 u2 pos = 0; /* position in c-string */
/* fallback output: buffer receives the literal text "NULL" */
1444 strcpy(buffer, "NULL");
1448 endpos = UTF_END(u);
1451 while (utf_ptr < endpos) {
1452 /* copy next unicode character */
1453 u2 c = utf_nextu2(&utf_ptr);
/* the substitution is applied to the decoded character, not to the raw
   bytes of the utf text */
1454 if (c == '/') c = '.';
1458 /* terminate string */
1463 /* utf_strcat_convert_to_latin1 ************************************************
1465 Like libc strcat, but the string appended to buffer is a utf8 symbol.
1466 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1469 *******************************************************************************/
/* buffer...must already hold a NUL-terminated c-string: strlen() is used
               to find its end, and the converted text is written from
               there on; no capacity is passed, so the caller has to
               guarantee that the result fits
   u........utf symbol to append */
1471 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
/* delegate to the sprint variant, aimed at the current terminator */
1473 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1477 /* utf_strcat_convert_to_latin1_classname **************************************
1479 Like libc strcat, but the string appended to buffer is a utf8 symbol.
1480 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1483 *******************************************************************************/
/* buffer...must already hold a NUL-terminated c-string: strlen() is used
               to find its end, and the converted text is written from
               there on; no capacity is passed, so the caller has to
               guarantee that the result fits
   u........utf symbol to append */
1485 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
/* delegate to the classname sprint variant, aimed at the current
   terminator */
1487 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1491 /* utf_fprint_printable_ascii **************************************************
1493 Write utf symbol into file.
1494 Non-printable and non-ASCII characters are printed as '?'.
1496 *******************************************************************************/
/* file...stream that receives the output (one fprintf call per character)
   u......utf symbol to print */
1498 void utf_fprint_printable_ascii(FILE *file, utf *u)
1500 char *endpos; /* points behind utf string */
1501 char *utf_ptr; /* current position in utf text */
1506 endpos = UTF_END(u);
1509 while (utf_ptr < endpos) {
1510 /* read next unicode character */
1511 u2 c = utf_nextu2(&utf_ptr);
/* values 32..127 are written as-is, everything else becomes '?'.
   NOTE(review): 127 is DEL (0x7f), a control character, yet it is
   written unchanged -- confirm whether the bound should be 126. */
1513 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1514 else fprintf(file, "?");
1519 /* utf_fprint_printable_ascii_classname ****************************************
1521 Write utf symbol into file with `/' converted to `.'.
1522 Non-printable and non-ASCII characters are printed as '?'.
1524 *******************************************************************************/
/* file...stream that receives the output (one fprintf call per character)
   u......utf symbol to print */
1526 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1528 char *endpos; /* points behind utf string */
1529 char *utf_ptr; /* current position in utf text */
1534 endpos = UTF_END(u);
1537 while (utf_ptr < endpos) {
1538 /* read next unicode character */
1539 u2 c = utf_nextu2(&utf_ptr);
/* substitute first; the resulting '.' then passes the range test below */
1540 if (c == '/') c = '.';
/* values 32..127 are written as-is, everything else becomes '?'.
   NOTE(review): 127 is DEL (0x7f), a control character, yet it is
   written unchanged -- confirm whether the bound should be 126. */
1542 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1543 else fprintf(file, "?");
1548 /* is_valid_utf ****************************************************************
1550 Return true if the given string is a valid UTF-8 string.
1552 utf_ptr...points to first character
1553 end_pos...points after last character
1555 *******************************************************************************/
1557 /* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */
/* Checks the byte range [utf_ptr, end_pos). A reversed range is rejected
   by the first test; an empty range (end_pos == utf_ptr) passes it. */
1559 bool is_valid_utf(char *utf_ptr, char *end_pos)
1566 if (end_pos < utf_ptr) return false;
1567 bytes = end_pos - utf_ptr;
1571 if (!c) return false; /* 0x00 is not allowed */
1572 if ((c & 0x80) == 0) continue; /* ASCII */
/* classify the lead byte; len is the number of continuation bytes that
   have to follow it */
1574 if ((c & 0xe0) == 0xc0) len = 1; /* 110x xxxx */
1575 else if ((c & 0xf0) == 0xe0) len = 2; /* 1110 xxxx */
1576 else if ((c & 0xf8) == 0xf0) len = 3; /* 1111 0xxx */
1577 else if ((c & 0xfc) == 0xf8) len = 4; /* 1111 10xx */
1578 else if ((c & 0xfe) == 0xfc) len = 5; /* 1111 110x */
1579 else return false; /* invalid leading byte */
/* lead bytes announcing more than two continuation bytes are recognised
   above only to be rejected here */
1581 if (len > 2) return false; /* Java limitation */
/* keep the payload bits of the lead byte: 0x3f >> len leaves 5 bits for
   len == 1 and 4 bits for len == 2 */
1583 v = (unsigned long)c & (0x3f >> len);
/* reserve the continuation bytes from the remaining count; a negative
   result means the input ends in the middle of a sequence */
1585 if ((bytes -= len) < 0) return false; /* missing bytes */
/* runs len times, counting i down to zero */
1587 for (i = len; i--; ) {
1589 if ((c & 0xc0) != 0x80) /* 10xx xxxx */
/* append the six payload bits of the continuation byte */
1591 v = (v << 6) | (c & 0x3f);
/* only the two-byte form (len == 1) is accepted on this path; presumably
   this is the encoded-zero case 0xc0 0x80 (cf. is_valid_name) -- confirm */
1595 if (len != 1) return false; /* Java special */
1598 /* Sun Java seems to allow overlong UTF-8 encodings */
1600 /* if (v < min_codepoint[len]) */
1601 /* XXX throw exception? */
1604 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
1605 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
1607 /* even these seem to be allowed */
1608 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
1615 /* is_valid_name ***************************************************************
1617 Return true if the given string may be used as a class/field/method
1618 name. (Currently this only disallows empty strings and control
1621 NOTE: The string is assumed to have passed is_valid_utf!
1623 utf_ptr...points to first character
1624 end_pos...points after last character
1626 *******************************************************************************/
/* Scans the bytes of [utf_ptr, end_pos) one at a time; the checks work on
   raw bytes, not on decoded characters. */
1628 bool is_valid_name(char *utf_ptr, char *end_pos)
1630 if (end_pos <= utf_ptr) return false; /* disallow empty names */
1632 while (utf_ptr < end_pos) {
/* read through unsigned char so that bytes >= 0x80 compare as positive
   values regardless of the signedness of plain char */
1633 unsigned char c = *utf_ptr++;
1635 if (c < 0x20) return false; /* disallow control characters */
/* 0xc0 0x80 is the two-byte encoding whose payload bits are all zero.
   utf_ptr has already been advanced, so *utf_ptr is the byte after c; if
   c were the last byte this would read at end_pos. This relies on the
   input having passed is_valid_utf, where a 110x xxxx lead byte must be
   followed by a continuation byte inside the range. */
1636 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80) /* disallow zero */
/* Convenience wrapper: applies is_valid_name to the byte range from
   u->text up to UTF_END(u). u is used without a NULL check. */
1643 bool is_valid_name_utf(utf *u)
1645 return is_valid_name(u->text, UTF_END(u));
1649 /* utf_show ********************************************************************
1651 Writes the utf symbols in the utfhash to stdout and displays the
1652 number of external hash chains grouped according to the chainlength
1653 (for debugging purposes).
1655 *******************************************************************************/
1657 #if !defined(NDEBUG)
/* chain lengths below CHAIN_LIMIT-1 get a bucket of their own in
   chain_count; bucket CHAIN_LIMIT-1 collects everything longer */
1661 #define CHAIN_LIMIT 20 /* limit for separated enumeration */
1663 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1664 u4 max_chainlength = 0; /* maximum length of the chains */
1665 u4 sum_chainlength = 0; /* sum of the chainlengths */
1666 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1669 printf("UTF-HASH:\n");
1671 /* show element of utf-hashtable */
/* dump pass over all hashtable slots: prints a "SLOT n: " label and the
   symbol text via utf_display_printable_ascii */
1673 for (i = 0; i < hashtable_utf->size; i++) {
1674 utf *u = hashtable_utf->ptr[i];
1677 printf("SLOT %d: ", (int) i);
1681 utf_display_printable_ascii(u);
1689 printf("UTF-HASH: %d slots for %d entries\n",
1690 (int) hashtable_utf->size, (int) hashtable_utf->entries );
/* the statistics below divide by hashtable_utf->entries, so the empty
   table is singled out first */
1692 if (hashtable_utf->entries == 0)
1695 printf("chains:\n chainlength number of chains %% of utfstrings\n");
/* chain_count is declared without an initializer and is incremented
   further down; presumably this loop clears it -- confirm */
1697 for (i=0;i<CHAIN_LIMIT;i++)
1700 /* count numbers of hashchains according to their length */
1701 for (i=0; i<hashtable_utf->size; i++) {
1703 utf *u = (utf*) hashtable_utf->ptr[i];
1704 u4 chain_length = 0;
1706 /* determine chainlength */
1712 /* update sum of all chainlengths */
1713 sum_chainlength+=chain_length;
1715 /* determine the maximum length of the chains */
1716 if (chain_length>max_chainlength)
1717 max_chainlength = chain_length;
1719 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
/* NOTE(review): the test is >= CHAIN_LIMIT, but bucket CHAIN_LIMIT-1 also
   receives chains of exactly CHAIN_LIMIT-1 symbols. Those chains are
   counted in chain_count[CHAIN_LIMIT-1] without being added to
   beyond_limit, so the percentage printed for the ">=" row leaves them
   out. The comments say >= CHAIN_LIMIT-1 -- confirm which is intended. */
1720 if (chain_length>=CHAIN_LIMIT) {
1721 beyond_limit+=chain_length;
/* clamp so that the index below stays inside chain_count */
1722 chain_length=CHAIN_LIMIT-1;
1725 /* update number of hashchains of current length */
1726 chain_count[chain_length]++;
1729 /* display results */
/* rows for lengths 1 .. CHAIN_LIMIT-2; bucket 0 is not listed and the last
   bucket gets its own ">=" row below. The (float) cast binds to
   chain_count[i], so the product is computed in floating point.
   NOTE(review): chain_count[] and max_chainlength are u4 but are printed
   with %d; if u4 is an unsigned type, %u would be the matching
   conversion -- confirm. */
1730 for (i=1;i<CHAIN_LIMIT-1;i++)
1731 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1733 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1736 printf("max. chainlength:%5d\n",max_chainlength);
1738 /* avg. chainlength = sum of chainlengths / number of chains */
/* slots counted in chain_count[0] are excluded from the divisor */
1739 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1741 #endif /* !defined(NDEBUG) */
1745 * These are local overrides for various environment variables in Emacs.
1746 * Please do not remove this and leave it at the end of the file, where
1747 * Emacs will automagically detect them.
1748 * ---------------------------------------------------------------------
1751 * indent-tabs-mode: t
1755 * vim:noexpandtab:sw=4:ts=4: