1 /* src/vm/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4 C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5 E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6 J. Wenninger, Institut f. Computersprachen - TU Wien
8 This file is part of CACAO.
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License as
12 published by the Free Software Foundation; either version 2, or (at
13 your option) any later version.
15 This program is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25 Contact: cacao@cacaojvm.org
27 Authors: Reinhard Grafl
34 $Id: utf8.c 5088 2006-07-08 20:16:05Z twisti $
46 #include "mm/memory.h"
48 #if defined(ENABLE_THREADS)
49 # include "threads/native/threads.h"
52 #include "vm/builtin.h"
53 #include "vm/exceptions.h"
54 #include "vm/hashtable.h"
55 #include "vm/options.h"
56 #include "vm/statistics.h"
57 #include "vm/stringlocal.h"
61 /* global variables ***********************************************************/
63 /* hashsize must be power of 2 */
65 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
67 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
70 /* utf-symbols for pointer comparison of frequently used strings **************/
72 utf *utf_java_lang_Object;
74 utf *utf_java_lang_Class;
75 utf *utf_java_lang_ClassLoader;
76 utf *utf_java_lang_Cloneable;
77 utf *utf_java_lang_SecurityManager;
78 utf *utf_java_lang_String;
79 utf *utf_java_lang_System;
80 utf *utf_java_lang_ThreadGroup;
81 utf *utf_java_io_Serializable;
83 utf *utf_java_lang_Throwable;
84 utf *utf_java_lang_VMThrowable;
85 utf *utf_java_lang_Error;
86 utf *utf_java_lang_AbstractMethodError;
87 utf *utf_java_lang_LinkageError;
88 utf *utf_java_lang_NoClassDefFoundError;
89 utf *utf_java_lang_NoSuchMethodError;
90 utf *utf_java_lang_OutOfMemoryError;
92 utf *utf_java_lang_Exception;
93 utf *utf_java_lang_ClassCastException;
94 utf *utf_java_lang_ClassNotFoundException;
95 utf *utf_java_lang_IllegalArgumentException;
96 utf *utf_java_lang_IllegalMonitorStateException;
98 utf *utf_java_lang_NullPointerException;
100 utf* utf_java_lang_Void;
101 utf* utf_java_lang_Boolean;
102 utf* utf_java_lang_Byte;
103 utf* utf_java_lang_Character;
104 utf* utf_java_lang_Short;
105 utf* utf_java_lang_Integer;
106 utf* utf_java_lang_Long;
107 utf* utf_java_lang_Float;
108 utf* utf_java_lang_Double;
110 utf *utf_java_lang_StackTraceElement;
111 utf *utf_java_lang_reflect_Constructor;
112 utf *utf_java_lang_reflect_Field;
113 utf *utf_java_lang_reflect_Method;
114 utf *utf_java_util_Vector;
116 utf *utf_InnerClasses; /* InnerClasses */
117 utf *utf_ConstantValue; /* ConstantValue */
118 utf *utf_Code; /* Code */
119 utf *utf_Exceptions; /* Exceptions */
120 utf *utf_LineNumberTable; /* LineNumberTable */
121 utf *utf_SourceFile; /* SourceFile */
123 utf *utf_init; /* <init> */
124 utf *utf_clinit; /* <clinit> */
125 utf *utf_clone; /* clone */
126 utf *utf_finalize; /* finalize */
127 utf *utf_run; /* run */
129 utf *utf_add; /* add */
130 utf *utf_remove; /* remove */
131 utf *utf_put; /* put */
132 utf *utf_get; /* get */
133 utf *utf_value; /* value */
135 utf *utf_fillInStackTrace;
136 utf *utf_getSystemClassLoader;
138 utf *utf_printStackTrace;
149 utf *utf_void__void; /* ()V */
150 utf *utf_boolean__void; /* (Z)V */
151 utf *utf_byte__void; /* (B)V */
152 utf *utf_char__void; /* (C)V */
153 utf *utf_short__void; /* (S)V */
154 utf *utf_int__void; /* (I)V */
155 utf *utf_long__void; /* (J)V */
156 utf *utf_float__void; /* (F)V */
157 utf *utf_double__void; /* (D)V */
159 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
160 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
161 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
162 utf *utf_java_lang_Object__java_lang_Object;
163 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
164 utf *utf_java_lang_String__java_lang_Class;
165 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
167 utf *utf_not_named_yet; /* special name for unnamed classes */
169 utf *array_packagename;
172 /* utf_init ********************************************************************
174 Initializes the utf8 subsystem.
176 *******************************************************************************/
180 /* create utf8 hashtable */
182 hashtable_utf = NEW(hashtable);
184 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
186 #if defined(ENABLE_STATISTICS)
188 count_utf_len += sizeof(utf*) * hashtable_utf->size;
191 /* create utf-symbols for pointer comparison of frequently used strings */
193 utf_java_lang_Object = utf_new_char("java/lang/Object");
195 utf_java_lang_Class = utf_new_char("java/lang/Class");
196 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
197 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
198 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
199 utf_java_lang_String = utf_new_char("java/lang/String");
200 utf_java_lang_System = utf_new_char("java/lang/System");
201 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
202 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
204 utf_java_lang_Throwable = utf_new_char(string_java_lang_Throwable);
205 utf_java_lang_VMThrowable = utf_new_char(string_java_lang_VMThrowable);
206 utf_java_lang_Error = utf_new_char(string_java_lang_Error);
208 utf_java_lang_AbstractMethodError =
209 utf_new_char(string_java_lang_AbstractMethodError);
211 utf_java_lang_LinkageError =
212 utf_new_char(string_java_lang_LinkageError);
214 utf_java_lang_NoClassDefFoundError =
215 utf_new_char(string_java_lang_NoClassDefFoundError);
217 utf_java_lang_NoSuchMethodError =
218 utf_new_char(string_java_lang_NoSuchMethodError);
220 utf_java_lang_OutOfMemoryError =
221 utf_new_char(string_java_lang_OutOfMemoryError);
223 utf_java_lang_Exception = utf_new_char(string_java_lang_Exception);
225 utf_java_lang_ClassCastException =
226 utf_new_char(string_java_lang_ClassCastException);
228 utf_java_lang_ClassNotFoundException =
229 utf_new_char(string_java_lang_ClassNotFoundException);
231 utf_java_lang_IllegalArgumentException =
232 utf_new_char(string_java_lang_IllegalArgumentException);
234 utf_java_lang_IllegalMonitorStateException =
235 utf_new_char(string_java_lang_IllegalMonitorStateException);
237 utf_java_lang_NullPointerException =
238 utf_new_char(string_java_lang_NullPointerException);
240 utf_java_lang_Void = utf_new_char("java/lang/Void");
241 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
242 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
243 utf_java_lang_Character = utf_new_char("java/lang/Character");
244 utf_java_lang_Short = utf_new_char("java/lang/Short");
245 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
246 utf_java_lang_Long = utf_new_char("java/lang/Long");
247 utf_java_lang_Float = utf_new_char("java/lang/Float");
248 utf_java_lang_Double = utf_new_char("java/lang/Double");
250 utf_java_lang_StackTraceElement =
251 utf_new_char("java/lang/StackTraceElement");
253 utf_java_lang_reflect_Constructor =
254 utf_new_char("java/lang/reflect/Constructor");
256 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
257 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
258 utf_java_util_Vector = utf_new_char("java/util/Vector");
260 utf_InnerClasses = utf_new_char("InnerClasses");
261 utf_ConstantValue = utf_new_char("ConstantValue");
262 utf_Code = utf_new_char("Code");
263 utf_Exceptions = utf_new_char("Exceptions");
264 utf_LineNumberTable = utf_new_char("LineNumberTable");
265 utf_SourceFile = utf_new_char("SourceFile");
267 utf_init = utf_new_char("<init>");
268 utf_clinit = utf_new_char("<clinit>");
269 utf_clone = utf_new_char("clone");
270 utf_finalize = utf_new_char("finalize");
271 utf_run = utf_new_char("run");
273 utf_add = utf_new_char("add");
274 utf_remove = utf_new_char("remove");
275 utf_put = utf_new_char("put");
276 utf_get = utf_new_char("get");
277 utf_value = utf_new_char("value");
279 utf_printStackTrace = utf_new_char("printStackTrace");
280 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
281 utf_loadClass = utf_new_char("loadClass");
282 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
284 utf_Z = utf_new_char("Z");
285 utf_B = utf_new_char("B");
286 utf_C = utf_new_char("C");
287 utf_S = utf_new_char("S");
288 utf_I = utf_new_char("I");
289 utf_J = utf_new_char("J");
290 utf_F = utf_new_char("F");
291 utf_D = utf_new_char("D");
293 utf_void__void = utf_new_char("()V");
294 utf_boolean__void = utf_new_char("(Z)V");
295 utf_byte__void = utf_new_char("(B)V");
296 utf_char__void = utf_new_char("(C)V");
297 utf_short__void = utf_new_char("(S)V");
298 utf_int__void = utf_new_char("(I)V");
299 utf_long__void = utf_new_char("(J)V");
300 utf_float__void = utf_new_char("(F)V");
301 utf_double__void = utf_new_char("(D)V");
302 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
303 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
305 utf_void__java_lang_ClassLoader =
306 utf_new_char("()Ljava/lang/ClassLoader;");
308 utf_java_lang_Object__java_lang_Object =
309 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
311 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
313 utf_java_lang_String__java_lang_Class =
314 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
316 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
318 utf_null = utf_new_char("null");
319 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
320 array_packagename = utf_new_char("\t<the array package>");
322 /* everything's ok */
328 /* utf_hashkey *****************************************************************
330 The hashkey is computed from the utf-text by using up to 8
331 characters. For utf-symbols longer than 15 characters 3 characters
332 are taken from the beginning and the end, 2 characters are taken
335 *******************************************************************************/
/* Byte-fetch helpers used by the hash computation that follows. Both expand
   to an expression reading through a variable named `text`, which must be in
   scope at the point of use.
   nbs(val): pre-increments `text`, then yields the byte now pointed to, cast
   to u4 and shifted left by val. Every expansion therefore advances `text`
   by one as a side effect.
   fbs(val): yields the byte at the current `text` position, cast to u4 and
   shifted left by val, without moving `text`.
   NOTE(review): `val` is not parenthesized in either expansion; the uses
   visible in this file pass plain integer literals only. */
337 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
338 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
340 u4 utf_hashkey(const char *text, u4 length)
342 const char *start_pos = text; /* pointer to utf text */
346 case 0: /* empty string */
349 case 1: return fbs(0);
350 case 2: return fbs(0) ^ nbs(3);
351 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
352 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
353 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
354 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
355 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
356 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
363 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
372 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
381 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
393 return a ^ nbs(9) ^ nbs(10);
405 return a ^ nbs(9) ^ nbs(10);
416 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
427 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
429 default: /* 3 characters from beginning */
435 /* 2 characters from middle */
436 text = start_pos + (length / 2);
441 /* 3 characters from end */
442 text = start_pos + length - 4;
447 return a ^ nbs(10) ^ nbs(11);
451 /* utf_full_hashkey ************************************************************
453 This function computes a hash value using all bytes in the string.
455 The algorithm is the "One-at-a-time" algorithm as published
456 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
458 *******************************************************************************/
460 u4 utf_full_hashkey(const char *text, u4 length)
462 register const unsigned char *p = (const unsigned char *) text;
470 hash += (hash << 10);
474 hash ^= (hash >> 11);
475 hash += (hash << 15);
480 /* unicode_hashkey *************************************************************
482 Compute the hashkey of a unicode string.
484 *******************************************************************************/
/* Hashes a u2 string by reinterpreting its storage as a char pointer and
   delegating to utf_hashkey(), passing len through unchanged.
   NOTE(review): utf_hashkey() takes its second argument as the length of a
   char sequence, whereas len here accompanies a u2 array -- confirm at the
   call sites whether len is meant as a u2 count or a byte count. */
486 u4 unicode_hashkey(u2 *text, u2 len)
488 return utf_hashkey((char *) text, len);
492 /* utf_new *********************************************************************
494 Creates a new utf-symbol, the text of the symbol is passed as a
495 u1-array. The function searches the utf-hashtable for a utf-symbol
496 with this text. On success the element is returned, otherwise a new
497 hashtable element is created.
499 If the number of entries in the hashtable exceeds twice the size of
500 the hashtable slots a reorganization of the hashtable is done and
501 the utf symbols are copied to a new hashtable with doubled size.
503 *******************************************************************************/
505 utf *utf_new(const char *text, u2 length)
507 u4 key; /* hashkey computed from utf-text */
508 u4 slot; /* slot in hashtable */
509 utf *u; /* hashtable element */
512 #if defined(ENABLE_THREADS)
513 builtin_monitorenter(hashtable_utf->header);
516 #if defined(ENABLE_STATISTICS)
521 key = utf_hashkey(text, length);
522 slot = key & (hashtable_utf->size - 1);
523 u = hashtable_utf->ptr[slot];
525 /* search external hash chain for utf-symbol */
528 if (u->blength == length) {
529 /* compare text of hashtable elements */
531 for (i = 0; i < length; i++)
532 if (text[i] != u->text[i])
535 #if defined(ENABLE_STATISTICS)
537 count_utf_new_found++;
540 /* symbol found in hashtable */
542 #if defined(ENABLE_THREADS)
543 builtin_monitorexit(hashtable_utf->header);
550 u = u->hashlink; /* next element in external chain */
553 #if defined(ENABLE_STATISTICS)
555 count_utf_len += sizeof(utf) + length + 1;
558 /* location in hashtable found, create new utf element */
560 u->blength = length; /* length in bytes of utfstring */
561 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
562 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
564 memcpy(u->text, text, length); /* copy utf-text */
565 u->text[length] = '\0';
567 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
568 hashtable_utf->entries++; /* update number of entries */
570 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
572 /* reorganization of hashtable, average length of the external
573 chains is approx. 2 */
575 hashtable *newhash; /* the new hashtable */
581 /* create new hashtable, double the size */
583 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
585 #if defined(ENABLE_STATISTICS)
587 count_utf_len += sizeof(utf*) * hashtable_utf->size;
590 /* transfer elements to new hashtable */
592 for (i = 0; i < hashtable_utf->size; i++) {
593 u = hashtable_utf->ptr[i];
597 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
599 u->hashlink = (utf *) newhash->ptr[slot];
600 newhash->ptr[slot] = u;
602 /* follow link in external hash chain */
608 /* dispose old table */
610 hashtable_free(hashtable_utf);
612 hashtable_utf = newhash;
615 #if defined(ENABLE_THREADS)
616 builtin_monitorexit(hashtable_utf->header);
623 /* utf_new_u2 ******************************************************************
625 Make utf symbol from u2 array, if isclassname is true '.' is
628 *******************************************************************************/
630 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
632 char *buffer; /* memory buffer for unicode characters */
633 char *pos; /* pointer to current position in buffer */
634 u4 left; /* unicode characters left */
635 u4 buflength; /* utf length in bytes of the u2 array */
636 utf *result; /* resulting utf-string */
639 /* determine utf length in bytes and allocate memory */
641 buflength = u2_utflength(unicode_pos, unicode_length);
642 buffer = MNEW(char, buflength);
647 for (i = 0; i++ < unicode_length; unicode_pos++) {
648 /* next unicode character */
651 if ((c != 0) && (c < 0x80)) {
654 if ((int) left < 0) break;
655 /* convert classname */
656 if (isclassname && c == '.')
661 } else if (c < 0x800) {
663 unsigned char high = c >> 6;
664 unsigned char low = c & 0x3F;
666 if ((int) left < 0) break;
667 *pos++ = high | 0xC0;
673 char mid = (c >> 6) & 0x3F;
676 if ((int) left < 0) break;
677 *pos++ = high | 0xE0;
683 /* insert utf-string into symbol-table */
684 result = utf_new(buffer,buflength);
686 MFREE(buffer, char, buflength);
692 /* utf_new_char ****************************************************************
694 Creates a new utf symbol, the text for this symbol is passed as a
695 c-string ( = char* ).
697 *******************************************************************************/
/* Convenience wrapper around utf_new() for NUL-terminated C strings: the
   byte length is measured with strlen() and forwarded together with text.
   Returns whatever utf_new() returns.
   NOTE(review): utf_new() declares its length parameter as u2, so the
   strlen() result is narrowed at this call -- confirm that no caller passes
   a string longer than a u2 can represent. */
699 utf *utf_new_char(const char *text)
701 return utf_new(text, strlen(text));
705 /* utf_new_char_classname ******************************************************
707 Creates a new utf symbol, the text for this symbol is passed as a
708 c-string ( = char* ) "." characters are going to be replaced by
709 "/". Since the above function is used often, this is a separate
710 function, instead of an if.
712 *******************************************************************************/
714 utf *utf_new_char_classname(const char *text)
716 if (strchr(text, '.')) {
717 char *txt = strdup(text);
718 char *end = txt + strlen(txt);
722 for (c = txt; c < end; c++)
723 if (*c == '.') *c = '/';
725 tmpRes = utf_new(txt, strlen(txt));
731 return utf_new(text, strlen(text));
735 /* utf_nextu2 ******************************************************************
737 Read the next unicode character from the utf string and increment
738 the utf-string pointer accordingly.
740 *******************************************************************************/
742 u2 utf_nextu2(char **utf_ptr)
744 /* uncompressed unicode character */
746 /* current position in utf text */
747 unsigned char *utf = (unsigned char *) (*utf_ptr);
748 /* bytes representing the unicode character */
749 unsigned char ch1, ch2, ch3;
750 /* number of bytes used to represent the unicode character */
753 switch ((ch1 = utf[0]) >> 4) {
754 default: /* 1 byte */
758 case 0xD: /* 2 bytes */
759 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
760 unsigned char high = ch1 & 0x1F;
761 unsigned char low = ch2 & 0x3F;
762 unicode_char = (high << 6) + low;
767 case 0xE: /* 2 or 3 bytes */
768 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
769 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
770 unsigned char low = ch3 & 0x3f;
771 unsigned char mid = ch2 & 0x3f;
772 unsigned char high = ch1 & 0x0f;
773 unicode_char = (((high << 6) + mid) << 6) + low;
781 /* update position in utf-text */
782 *utf_ptr = (char *) (utf + len);
788 /* utf_bytes *******************************************************************
790 Determine number of bytes (aka. octets) in the utf string.
793 u............utf string
796 The number of octets of this utf string.
797 There is _no_ terminating zero included in this count.
799 *******************************************************************************/
806 /* utf_get_number_of_u2s_for_buffer ********************************************
808 Determine number of UTF-16 u2s in the given UTF-8 buffer
810 CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
811 to an array of u2s (UTF-16) and want to know how many of them you will get.
812 All other uses of this function are probably wrong.
815 buffer........points to first char in buffer
816 blength.......number of _bytes_ in the buffer
819 the number of u2s needed to hold this string in UTF-16 encoding.
820 There is _no_ terminating zero included in this count.
822 NOTE: Unlike utf_get_number_of_u2s, this function never throws an
825 *******************************************************************************/
827 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
829 const char *endpos; /* points behind utf string */
830 const char *utf_ptr; /* current position in utf text */
831 u4 len = 0; /* number of unicode characters */
834 endpos = utf_ptr + blength;
836 while (utf_ptr < endpos) {
838 /* next unicode character */
839 utf_nextu2((char **)&utf_ptr);
842 assert(utf_ptr == endpos);
848 /* utf_get_number_of_u2s *******************************************************
850 Determine number of UTF-16 u2s in the utf string.
852 CAUTION: Use this function *only* when you want to convert a utf string
853 to an array of u2s and want to know how many of them you will get.
854 All other uses of this function are probably wrong.
857 u............utf string
860 the number of u2s needed to hold this string in UTF-16 encoding.
861 There is _no_ terminating zero included in this count.
862 XXX 0 if a NullPointerException has been thrown (see below)
864 *******************************************************************************/
866 u4 utf_get_number_of_u2s(utf *u)
868 char *endpos; /* points behind utf string */
869 char *utf_ptr; /* current position in utf text */
870 u4 len = 0; /* number of unicode characters */
872 /* XXX this is probably not checked by most callers! Review this after */
873 /* the invalid uses of this function have been eliminated */
875 exceptions_throw_nullpointerexception();
882 while (utf_ptr < endpos) {
884 /* next unicode character */
885 utf_nextu2(&utf_ptr);
888 if (utf_ptr != endpos)
889 /* string ended abruptly */
890 throw_cacao_exception_exit(string_java_lang_InternalError,
891 "Illegal utf8 string");
897 /* u2_utflength ****************************************************************
899 Returns the utf length in bytes of a u2 array.
901 *******************************************************************************/
903 u4 u2_utflength(u2 *text, u4 u2_length)
905 u4 result_len = 0; /* utf length in bytes */
906 u2 ch; /* current unicode character */
909 for (len = 0; len < u2_length; len++) {
910 /* next unicode character */
913 /* determine bytes required to store unicode character as utf */
914 if (ch && (ch < 0x80))
926 /* utf_copy ********************************************************************
928 Copy the given utf string byte-for-byte to a buffer.
931 buffer.......the buffer
932 u............the utf string
934 *******************************************************************************/
/* Copies u->text into buffer using a count of u->blength + 1 chars, i.e. the
   string bytes plus the trailing zero byte that utf_new() stores after them.
   No destination size is passed in, so the caller is responsible for
   providing a buffer of at least u->blength + 1 bytes.
   NOTE(review): u is dereferenced without a NULL check -- confirm callers
   never pass NULL. */
936 void utf_copy(char *buffer, utf *u)
938 /* our utf strings are zero-terminated (done by utf_new) */
939 MCOPY(buffer, u->text, char, u->blength + 1);
943 /* utf_cat *********************************************************************
945 Append the given utf string byte-for-byte to a buffer.
948 buffer.......the buffer
949 u............the utf string
951 *******************************************************************************/
/* Appends u->text to the C string already held in buffer: the write position
   is buffer + strlen(buffer), and u->blength + 1 chars are copied so that the
   trailing zero byte stored by utf_new() terminates the result.
   buffer must already be NUL-terminated, since strlen() is applied to it.
   No destination size is passed in, so the caller must guarantee room for
   strlen(buffer) + u->blength + 1 bytes. */
953 void utf_cat(char *buffer, utf *u)
955 /* our utf strings are zero-terminated (done by utf_new) */
956 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
960 /* utf_copy_classname **********************************************************
962 Copy the given utf classname byte-for-byte to a buffer.
963 '/' is replaced by '.'
966 buffer.......the buffer
967 u............the utf string
969 *******************************************************************************/
971 void utf_copy_classname(char *buffer, utf *u)
980 endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
982 while (srcptr != endptr) {
991 /* utf_cat_classname ***********************************************************
993 Append the given utf classname byte-for-byte to a buffer.
994 '/' is replaced by '.'
997 buffer.......the buffer
998 u............the utf string
1000 *******************************************************************************/
/* Append variant of utf_copy_classname(): locates the end of the C string
   currently in buffer with strlen() and lets utf_copy_classname() write the
   class name starting there.
   buffer must already be NUL-terminated; no destination size is passed in,
   so the caller must guarantee enough room for the appended text. */
1002 void utf_cat_classname(char *buffer, utf *u)
1004 utf_copy_classname(buffer + strlen(buffer), u);
1007 /* utf_display_printable_ascii *************************************************
1009 Write utf symbol to stdout (for debugging purposes).
1010 Non-printable and non-ASCII characters are printed as '?'.
1012 *******************************************************************************/
1014 void utf_display_printable_ascii(utf *u)
1016 char *endpos; /* points behind utf string */
1017 char *utf_ptr; /* current position in utf text */
1025 endpos = UTF_END(u);
1028 while (utf_ptr < endpos) {
1029 /* read next unicode character */
1031 u2 c = utf_nextu2(&utf_ptr);
1033 if ((c >= 32) && (c <= 127))
1043 /* utf_display_printable_ascii_classname ***************************************
1045 Write utf symbol to stdout with `/' converted to `.' (for debugging
1047 Non-printable and non-ASCII characters are printed as '?'.
1049 *******************************************************************************/
1051 void utf_display_printable_ascii_classname(utf *u)
1053 char *endpos; /* points behind utf string */
1054 char *utf_ptr; /* current position in utf text */
1062 endpos = UTF_END(u);
1065 while (utf_ptr < endpos) {
1066 /* read next unicode character */
1068 u2 c = utf_nextu2(&utf_ptr);
1073 if ((c >= 32) && (c <= 127))
1083 /* utf_sprint_convert_to_latin1 ************************************************
1085 Write utf symbol into c-string (for debugging purposes).
1086 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1089 *******************************************************************************/
1091 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1093 char *endpos; /* points behind utf string */
1094 char *utf_ptr; /* current position in utf text */
1095 u2 pos = 0; /* position in c-string */
1098 strcpy(buffer, "NULL");
1102 endpos = UTF_END(u);
1105 while (utf_ptr < endpos)
1106 /* copy next unicode character */
1107 buffer[pos++] = utf_nextu2(&utf_ptr);
1109 /* terminate string */
1114 /* utf_sprint_convert_to_latin1_classname **************************************
1116 Write utf symbol into c-string with `/' converted to `.' (for debugging
1118 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1121 *******************************************************************************/
1123 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1125 char *endpos; /* points behind utf string */
1126 char *utf_ptr; /* current position in utf text */
1127 u2 pos = 0; /* position in c-string */
1130 strcpy(buffer, "NULL");
1134 endpos = UTF_END(u);
1137 while (utf_ptr < endpos) {
1138 /* copy next unicode character */
1139 u2 c = utf_nextu2(&utf_ptr);
1140 if (c == '/') c = '.';
1144 /* terminate string */
1149 /* utf_strcat_convert_to_latin1 ************************************************
1151 Like libc strcat, but uses an utf8 string.
1152 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1155 *******************************************************************************/
/* Append variant of utf_sprint_convert_to_latin1(): the output is written
   starting at buffer + strlen(buffer), i.e. directly after the C string
   already stored in buffer.
   buffer must already be NUL-terminated; no destination size is passed in,
   so the caller must guarantee enough room for the appended text. */
1157 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1159 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1163 /* utf_strcat_convert_to_latin1_classname **************************************
1165 Like libc strcat, but uses an utf8 string.
1166 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1169 *******************************************************************************/
/* Append variant of utf_sprint_convert_to_latin1_classname(): the output is
   written starting at buffer + strlen(buffer), i.e. directly after the C
   string already stored in buffer.
   buffer must already be NUL-terminated; no destination size is passed in,
   so the caller must guarantee enough room for the appended text. */
1171 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1173 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1177 /* utf_fprint_printable_ascii **************************************************
1179 Write utf symbol into file.
1180 Non-printable and non-ASCII characters are printed as '?'.
1182 *******************************************************************************/
1184 void utf_fprint_printable_ascii(FILE *file, utf *u)
1186 char *endpos; /* points behind utf string */
1187 char *utf_ptr; /* current position in utf text */
1192 endpos = UTF_END(u);
1195 while (utf_ptr < endpos) {
1196 /* read next unicode character */
1197 u2 c = utf_nextu2(&utf_ptr);
1199 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1200 else fprintf(file, "?");
1205 /* utf_fprint_printable_ascii_classname ****************************************
1207 Write utf symbol into file with `/' converted to `.'.
1208 Non-printable and non-ASCII characters are printed as '?'.
1210 *******************************************************************************/
1212 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1214 char *endpos; /* points behind utf string */
1215 char *utf_ptr; /* current position in utf text */
1220 endpos = UTF_END(u);
1223 while (utf_ptr < endpos) {
1224 /* read next unicode character */
1225 u2 c = utf_nextu2(&utf_ptr);
1226 if (c == '/') c = '.';
1228 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1229 else fprintf(file, "?");
1234 /* is_valid_utf ****************************************************************
1236 Return true if the given string is a valid UTF-8 string.
1238 utf_ptr...points to first character
1239 end_pos...points after last character
1241 *******************************************************************************/
1243 /* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */
1245 bool is_valid_utf(char *utf_ptr, char *end_pos)
1252 if (end_pos < utf_ptr) return false;
1253 bytes = end_pos - utf_ptr;
1257 if (!c) return false; /* 0x00 is not allowed */
1258 if ((c & 0x80) == 0) continue; /* ASCII */
1260 if ((c & 0xe0) == 0xc0) len = 1; /* 110x xxxx */
1261 else if ((c & 0xf0) == 0xe0) len = 2; /* 1110 xxxx */
1262 else if ((c & 0xf8) == 0xf0) len = 3; /* 1111 0xxx */
1263 else if ((c & 0xfc) == 0xf8) len = 4; /* 1111 10xx */
1264 else if ((c & 0xfe) == 0xfc) len = 5; /* 1111 110x */
1265 else return false; /* invalid leading byte */
1267 if (len > 2) return false; /* Java limitation */
1269 v = (unsigned long)c & (0x3f >> len);
1271 if ((bytes -= len) < 0) return false; /* missing bytes */
1273 for (i = len; i--; ) {
1275 if ((c & 0xc0) != 0x80) /* 10xx xxxx */
1277 v = (v << 6) | (c & 0x3f);
1281 if (len != 1) return false; /* Java special */
1284 /* Sun Java seems to allow overlong UTF-8 encodings */
1286 /* if (v < min_codepoint[len]) */
1287 /* XXX throw exception? */
1290 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
1291 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
1293 /* even these seem to be allowed */
1294 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
1301 /* is_valid_name ***************************************************************
1303 Return true if the given string may be used as a class/field/method
1304 name. (Currently this only disallows empty strings and control
1307 NOTE: The string is assumed to have passed is_valid_utf!
1309 utf_ptr...points to first character
1310 end_pos...points after last character
1312 *******************************************************************************/
1314 bool is_valid_name(char *utf_ptr, char *end_pos)
1316 if (end_pos <= utf_ptr) return false; /* disallow empty names */
1318 while (utf_ptr < end_pos) {
1319 unsigned char c = *utf_ptr++;
1321 if (c < 0x20) return false; /* disallow control characters */
1322 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80) /* disallow zero */
/* Applies is_valid_name() to a utf symbol, passing u->text as the start
   pointer and UTF_END(u) as the end pointer, and returns its result.
   NOTE(review): u is dereferenced without a NULL check -- confirm callers
   never pass NULL. */
1329 bool is_valid_name_utf(utf *u)
1331 return is_valid_name(u->text, UTF_END(u));
1335 /* utf_show ********************************************************************
1337 Writes the utf symbols in the utfhash to stdout and displays the
1338 number of external hash chains grouped according to the chainlength
1339 (for debugging purposes).
1341 *******************************************************************************/
1343 #if !defined(NDEBUG)
1347 #define CHAIN_LIMIT 20 /* limit for seperated enumeration */
1349 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1350 u4 max_chainlength = 0; /* maximum length of the chains */
1351 u4 sum_chainlength = 0; /* sum of the chainlengths */
1352 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1355 printf("UTF-HASH:\n");
1357 /* show element of utf-hashtable */
1359 for (i = 0; i < hashtable_utf->size; i++) {
1360 utf *u = hashtable_utf->ptr[i];
1363 printf("SLOT %d: ", (int) i);
1367 utf_display_printable_ascii(u);
1375 printf("UTF-HASH: %d slots for %d entries\n",
1376 (int) hashtable_utf->size, (int) hashtable_utf->entries );
1378 if (hashtable_utf->entries == 0)
1381 printf("chains:\n chainlength number of chains %% of utfstrings\n");
1383 for (i=0;i<CHAIN_LIMIT;i++)
1386 /* count numbers of hashchains according to their length */
1387 for (i=0; i<hashtable_utf->size; i++) {
1389 utf *u = (utf*) hashtable_utf->ptr[i];
1390 u4 chain_length = 0;
1392 /* determine chainlength */
1398 /* update sum of all chainlengths */
1399 sum_chainlength+=chain_length;
1401 /* determine the maximum length of the chains */
1402 if (chain_length>max_chainlength)
1403 max_chainlength = chain_length;
1405 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1406 if (chain_length>=CHAIN_LIMIT) {
1407 beyond_limit+=chain_length;
1408 chain_length=CHAIN_LIMIT-1;
1411 /* update number of hashchains of current length */
1412 chain_count[chain_length]++;
1415 /* display results */
1416 for (i=1;i<CHAIN_LIMIT-1;i++)
1417 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1419 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1422 printf("max. chainlength:%5d\n",max_chainlength);
1424 /* avg. chainlength = sum of chainlengths / number of chains */
1425 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1427 #endif /* !defined(NDEBUG) */
1431 * These are local overrides for various environment variables in Emacs.
1432 * Please do not remove this and leave it at the end of the file, where
1433 * Emacs will automagically detect them.
1434 * ---------------------------------------------------------------------
1437 * indent-tabs-mode: t
1441 * vim:noexpandtab:sw=4:ts=4: