1 /* src/vm/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4 C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5 E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6 J. Wenninger, Institut f. Computersprachen - TU Wien
8 This file is part of CACAO.
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License as
12 published by the Free Software Foundation; either version 2, or (at
13 your option) any later version.
15 This program is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25 Contact: cacao@cacaojvm.org
27 Authors: Reinhard Grafl
34 $Id: utf8.c 4921 2006-05-15 14:24:36Z twisti $
46 #include "mm/memory.h"
48 #if defined(ENABLE_THREADS)
49 # include "threads/native/threads.h"
52 #include "vm/builtin.h"
53 #include "vm/exceptions.h"
54 #include "vm/hashtable.h"
55 #include "vm/options.h"
56 #include "vm/statistics.h"
57 #include "vm/stringlocal.h"
61 /* global variables ***********************************************************/
63 /* hashsize must be power of 2 */
65 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
67 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
70 /* utf-symbols for pointer comparison of frequently used strings **************/
72 utf *utf_java_lang_Object;
74 utf *utf_java_lang_Class;
75 utf *utf_java_lang_ClassLoader;
76 utf *utf_java_lang_Cloneable;
77 utf *utf_java_lang_SecurityManager;
78 utf *utf_java_lang_String;
79 utf *utf_java_lang_System;
80 utf *utf_java_lang_ThreadGroup;
81 utf *utf_java_io_Serializable;
83 utf *utf_java_lang_Throwable;
84 utf *utf_java_lang_VMThrowable;
85 utf *utf_java_lang_Error;
86 utf *utf_java_lang_NoClassDefFoundError;
87 utf *utf_java_lang_LinkageError;
88 utf *utf_java_lang_NoSuchMethodError;
89 utf *utf_java_lang_OutOfMemoryError;
91 utf *utf_java_lang_Exception;
92 utf *utf_java_lang_ClassNotFoundException;
93 utf *utf_java_lang_IllegalArgumentException;
94 utf *utf_java_lang_IllegalMonitorStateException;
96 utf *utf_java_lang_NullPointerException;
98 utf* utf_java_lang_Void;
99 utf* utf_java_lang_Boolean;
100 utf* utf_java_lang_Byte;
101 utf* utf_java_lang_Character;
102 utf* utf_java_lang_Short;
103 utf* utf_java_lang_Integer;
104 utf* utf_java_lang_Long;
105 utf* utf_java_lang_Float;
106 utf* utf_java_lang_Double;
108 utf *utf_java_lang_StackTraceElement;
109 utf *utf_java_lang_reflect_Constructor;
110 utf *utf_java_lang_reflect_Field;
111 utf *utf_java_lang_reflect_Method;
112 utf *utf_java_util_Vector;
114 utf *utf_InnerClasses; /* InnerClasses */
115 utf *utf_ConstantValue; /* ConstantValue */
116 utf *utf_Code; /* Code */
117 utf *utf_Exceptions; /* Exceptions */
118 utf *utf_LineNumberTable; /* LineNumberTable */
119 utf *utf_SourceFile; /* SourceFile */
121 utf *utf_init; /* <init> */
122 utf *utf_clinit; /* <clinit> */
123 utf *utf_clone; /* clone */
124 utf *utf_finalize; /* finalize */
125 utf *utf_run; /* run */
127 utf *utf_add; /* add */
128 utf *utf_remove; /* remove */
129 utf *utf_put; /* put */
130 utf *utf_get; /* get */
131 utf *utf_value; /* value */
133 utf *utf_fillInStackTrace;
134 utf *utf_getSystemClassLoader;
136 utf *utf_printStackTrace;
147 utf *utf_void__void; /* ()V */
148 utf *utf_boolean__void; /* (Z)V */
149 utf *utf_byte__void; /* (B)V */
150 utf *utf_char__void; /* (C)V */
151 utf *utf_short__void; /* (S)V */
152 utf *utf_int__void; /* (I)V */
153 utf *utf_long__void; /* (J)V */
154 utf *utf_float__void; /* (F)V */
155 utf *utf_double__void; /* (D)V */
157 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
158 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
159 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
160 utf *utf_java_lang_Object__java_lang_Object;
161 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
162 utf *utf_java_lang_String__java_lang_Class;
163 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
165 utf *utf_not_named_yet; /* special name for unnamed classes */
167 utf *array_packagename;
170 /* utf_init ********************************************************************
172 Initializes the utf8 subsystem.
174 *******************************************************************************/
178 /* create utf8 hashtable */
180 hashtable_utf = NEW(hashtable);
182 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
184 #if defined(ENABLE_STATISTICS)
186 count_utf_len += sizeof(utf*) * hashtable_utf.size;
189 /* create utf-symbols for pointer comparison of frequently used strings */
191 utf_java_lang_Object = utf_new_char("java/lang/Object");
193 utf_java_lang_Class = utf_new_char("java/lang/Class");
194 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
195 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
196 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
197 utf_java_lang_String = utf_new_char("java/lang/String");
198 utf_java_lang_System = utf_new_char("java/lang/System");
199 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
200 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
202 utf_java_lang_Throwable = utf_new_char(string_java_lang_Throwable);
203 utf_java_lang_VMThrowable = utf_new_char(string_java_lang_VMThrowable);
204 utf_java_lang_Error = utf_new_char(string_java_lang_Error);
206 utf_java_lang_NoClassDefFoundError =
207 utf_new_char(string_java_lang_NoClassDefFoundError);
209 utf_java_lang_LinkageError =
210 utf_new_char(string_java_lang_LinkageError);
212 utf_java_lang_NoSuchMethodError =
213 utf_new_char(string_java_lang_NoSuchMethodError);
215 utf_java_lang_OutOfMemoryError =
216 utf_new_char(string_java_lang_OutOfMemoryError);
218 utf_java_lang_Exception = utf_new_char(string_java_lang_Exception);
220 utf_java_lang_ClassNotFoundException =
221 utf_new_char(string_java_lang_ClassNotFoundException);
223 utf_java_lang_IllegalArgumentException =
224 utf_new_char(string_java_lang_IllegalArgumentException);
226 utf_java_lang_IllegalMonitorStateException =
227 utf_new_char(string_java_lang_IllegalMonitorStateException);
229 utf_java_lang_NullPointerException =
230 utf_new_char(string_java_lang_NullPointerException);
232 utf_java_lang_Void = utf_new_char("java/lang/Void");
233 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
234 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
235 utf_java_lang_Character = utf_new_char("java/lang/Character");
236 utf_java_lang_Short = utf_new_char("java/lang/Short");
237 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
238 utf_java_lang_Long = utf_new_char("java/lang/Long");
239 utf_java_lang_Float = utf_new_char("java/lang/Float");
240 utf_java_lang_Double = utf_new_char("java/lang/Double");
242 utf_java_lang_StackTraceElement =
243 utf_new_char("java/lang/StackTraceElement");
245 utf_java_lang_reflect_Constructor =
246 utf_new_char("java/lang/reflect/Constructor");
248 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
249 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
250 utf_java_util_Vector = utf_new_char("java/util/Vector");
252 utf_InnerClasses = utf_new_char("InnerClasses");
253 utf_ConstantValue = utf_new_char("ConstantValue");
254 utf_Code = utf_new_char("Code");
255 utf_Exceptions = utf_new_char("Exceptions");
256 utf_LineNumberTable = utf_new_char("LineNumberTable");
257 utf_SourceFile = utf_new_char("SourceFile");
259 utf_init = utf_new_char("<init>");
260 utf_clinit = utf_new_char("<clinit>");
261 utf_clone = utf_new_char("clone");
262 utf_finalize = utf_new_char("finalize");
263 utf_run = utf_new_char("run");
265 utf_add = utf_new_char("add");
266 utf_remove = utf_new_char("remove");
267 utf_put = utf_new_char("put");
268 utf_get = utf_new_char("get");
269 utf_value = utf_new_char("value");
271 utf_printStackTrace = utf_new_char("printStackTrace");
272 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
273 utf_loadClass = utf_new_char("loadClass");
274 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
276 utf_Z = utf_new_char("Z");
277 utf_B = utf_new_char("B");
278 utf_C = utf_new_char("C");
279 utf_S = utf_new_char("S");
280 utf_I = utf_new_char("I");
281 utf_J = utf_new_char("J");
282 utf_F = utf_new_char("F");
283 utf_D = utf_new_char("D");
285 utf_void__void = utf_new_char("()V");
286 utf_boolean__void = utf_new_char("(Z)V");
287 utf_byte__void = utf_new_char("(B)V");
288 utf_char__void = utf_new_char("(C)V");
289 utf_short__void = utf_new_char("(S)V");
290 utf_int__void = utf_new_char("(I)V");
291 utf_long__void = utf_new_char("(J)V");
292 utf_float__void = utf_new_char("(F)V");
293 utf_double__void = utf_new_char("(D)V");
294 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
295 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
297 utf_void__java_lang_ClassLoader =
298 utf_new_char("()Ljava/lang/ClassLoader;");
300 utf_java_lang_Object__java_lang_Object =
301 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
303 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
305 utf_java_lang_String__java_lang_Class =
306 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
308 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
310 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
312 array_packagename = utf_new_char("\t<the array package>");
314 /* everything's ok */
320 /* utf_hashkey *****************************************************************
   The hashkey is computed from the utf-text by using up to 8
   characters. For utf-symbols longer than 15 characters, 3 characters
   are taken from the beginning and the end, and 2 characters are taken
   from the middle of the utf-text.
327 *******************************************************************************/
329 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
330 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
332 u4 utf_hashkey(const char *text, u4 length)
334 const char *start_pos = text; /* pointer to utf text */
338 case 0: /* empty string */
341 case 1: return fbs(0);
342 case 2: return fbs(0) ^ nbs(3);
343 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
344 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
345 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
346 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
347 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
348 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
355 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
364 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
373 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
385 return a ^ nbs(9) ^ nbs(10);
397 return a ^ nbs(9) ^ nbs(10);
408 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
419 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
421 default: /* 3 characters from beginning */
427 /* 2 characters from middle */
428 text = start_pos + (length / 2);
433 /* 3 characters from end */
434 text = start_pos + length - 4;
439 return a ^ nbs(10) ^ nbs(11);
443 /* utf_full_hashkey ************************************************************
445 This function computes a hash value using all bytes in the string.
447 The algorithm is the "One-at-a-time" algorithm as published
448 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
450 *******************************************************************************/
452 u4 utf_full_hashkey(const char *text, u4 length)
454 register const unsigned char *p = (const unsigned char *) text;
462 hash += (hash << 10);
466 hash ^= (hash >> 11);
467 hash += (hash << 15);
472 /* unicode_hashkey *************************************************************
474 Compute the hashkey of a unicode string.
476 *******************************************************************************/
478 u4 unicode_hashkey(u2 *text, u2 len)
480 return utf_hashkey((char *) text, len);
484 /* utf_new *********************************************************************
486 Creates a new utf-symbol, the text of the symbol is passed as a
487 u1-array. The function searches the utf-hashtable for a utf-symbol
   with this text. On success the element is returned, otherwise a new
489 hashtable element is created.
491 If the number of entries in the hashtable exceeds twice the size of
492 the hashtable slots a reorganization of the hashtable is done and
493 the utf symbols are copied to a new hashtable with doubled size.
495 *******************************************************************************/
497 utf *utf_new(const char *text, u2 length)
499 u4 key; /* hashkey computed from utf-text */
500 u4 slot; /* slot in hashtable */
501 utf *u; /* hashtable element */
504 #if defined(ENABLE_THREADS)
505 builtin_monitorenter(hashtable_utf->header);
508 #if defined(ENABLE_STATISTICS)
513 key = utf_hashkey(text, length);
514 slot = key & (hashtable_utf->size - 1);
515 u = hashtable_utf->ptr[slot];
517 /* search external hash chain for utf-symbol */
520 if (u->blength == length) {
521 /* compare text of hashtable elements */
523 for (i = 0; i < length; i++)
524 if (text[i] != u->text[i])
527 #if defined(ENABLE_STATISTICS)
529 count_utf_new_found++;
532 /* symbol found in hashtable */
534 #if defined(ENABLE_THREADS)
535 builtin_monitorexit(hashtable_utf->header);
542 u = u->hashlink; /* next element in external chain */
545 #if defined(ENABLE_STATISTICS)
547 count_utf_len += sizeof(utf) + length + 1;
550 /* location in hashtable found, create new utf element */
552 u->blength = length; /* length in bytes of utfstring */
553 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
554 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
556 memcpy(u->text, text, length); /* copy utf-text */
557 u->text[length] = '\0';
559 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
560 hashtable_utf->entries++; /* update number of entries */
562 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
564 /* reorganization of hashtable, average length of the external
565 chains is approx. 2 */
567 hashtable *newhash; /* the new hashtable */
573 /* create new hashtable, double the size */
575 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
577 #if defined(ENABLE_STATISTICS)
579 count_utf_len += sizeof(utf*) * hashtable_utf->size;
582 /* transfer elements to new hashtable */
584 for (i = 0; i < hashtable_utf->size; i++) {
585 u = hashtable_utf->ptr[i];
589 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
591 u->hashlink = (utf *) newhash->ptr[slot];
592 newhash->ptr[slot] = u;
594 /* follow link in external hash chain */
600 /* dispose old table */
602 hashtable_free(hashtable_utf);
604 hashtable_utf = newhash;
607 #if defined(ENABLE_THREADS)
608 builtin_monitorexit(hashtable_utf->header);
615 /* utf_new_u2 ******************************************************************
   Make utf symbol from u2 array. If isclassname is true, '.' is
   replaced by '/'.
620 *******************************************************************************/
622 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
624 char *buffer; /* memory buffer for unicode characters */
625 char *pos; /* pointer to current position in buffer */
626 u4 left; /* unicode characters left */
627 u4 buflength; /* utf length in bytes of the u2 array */
628 utf *result; /* resulting utf-string */
631 /* determine utf length in bytes and allocate memory */
633 buflength = u2_utflength(unicode_pos, unicode_length);
634 buffer = MNEW(char, buflength);
639 for (i = 0; i++ < unicode_length; unicode_pos++) {
640 /* next unicode character */
643 if ((c != 0) && (c < 0x80)) {
646 if ((int) left < 0) break;
647 /* convert classname */
648 if (isclassname && c == '.')
653 } else if (c < 0x800) {
655 unsigned char high = c >> 6;
656 unsigned char low = c & 0x3F;
658 if ((int) left < 0) break;
659 *pos++ = high | 0xC0;
665 char mid = (c >> 6) & 0x3F;
668 if ((int) left < 0) break;
669 *pos++ = high | 0xE0;
675 /* insert utf-string into symbol-table */
676 result = utf_new(buffer,buflength);
678 MFREE(buffer, char, buflength);
684 /* utf_new_char ****************************************************************
686 Creates a new utf symbol, the text for this symbol is passed as a
687 c-string ( = char* ).
689 *******************************************************************************/
691 utf *utf_new_char(const char *text)
693 return utf_new(text, strlen(text));
697 /* utf_new_char_classname ******************************************************
699 Creates a new utf symbol, the text for this symbol is passed as a
   c-string ( = char* ). "." characters are going to be replaced by
   "/". Since the above function is used often, this is a separate
   function, instead of an if.
704 *******************************************************************************/
706 utf *utf_new_char_classname(const char *text)
708 if (strchr(text, '.')) {
709 char *txt = strdup(text);
710 char *end = txt + strlen(txt);
714 for (c = txt; c < end; c++)
715 if (*c == '.') *c = '/';
717 tmpRes = utf_new(txt, strlen(txt));
723 return utf_new(text, strlen(text));
727 /* utf_nextu2 ******************************************************************
729 Read the next unicode character from the utf string and increment
730 the utf-string pointer accordingly.
732 *******************************************************************************/
734 u2 utf_nextu2(char **utf_ptr)
736 /* uncompressed unicode character */
738 /* current position in utf text */
739 unsigned char *utf = (unsigned char *) (*utf_ptr);
740 /* bytes representing the unicode character */
741 unsigned char ch1, ch2, ch3;
742 /* number of bytes used to represent the unicode character */
745 switch ((ch1 = utf[0]) >> 4) {
746 default: /* 1 byte */
750 case 0xD: /* 2 bytes */
751 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
752 unsigned char high = ch1 & 0x1F;
753 unsigned char low = ch2 & 0x3F;
754 unicode_char = (high << 6) + low;
759 case 0xE: /* 2 or 3 bytes */
760 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
761 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
762 unsigned char low = ch3 & 0x3f;
763 unsigned char mid = ch2 & 0x3f;
764 unsigned char high = ch1 & 0x0f;
765 unicode_char = (((high << 6) + mid) << 6) + low;
773 /* update position in utf-text */
774 *utf_ptr = (char *) (utf + len);
780 /* utf_bytes *******************************************************************
782 Determine number of bytes (aka. octets) in the utf string.
785 u............utf string
788 The number of octets of this utf string.
789 There is _no_ terminating zero included in this count.
791 *******************************************************************************/
798 /* utf_get_number_of_u2s_for_buffer ********************************************
800 Determine number of UTF-16 u2s in the given UTF-8 buffer
802 CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
803 to an array of u2s (UTF-16) and want to know how many of them you will get.
804 All other uses of this function are probably wrong.
807 buffer........points to first char in buffer
808 blength.......number of _bytes_ in the buffer
811 the number of u2s needed to hold this string in UTF-16 encoding.
812 There is _no_ terminating zero included in this count.
   NOTE: Unlike utf_get_number_of_u2s, this function never throws an
   exception.
817 *******************************************************************************/
819 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
821 const char *endpos; /* points behind utf string */
822 const char *utf_ptr; /* current position in utf text */
823 u4 len = 0; /* number of unicode characters */
826 endpos = utf_ptr + blength;
828 while (utf_ptr < endpos) {
830 /* next unicode character */
831 utf_nextu2((char **)&utf_ptr);
834 assert(utf_ptr == endpos);
840 /* utf_get_number_of_u2s *******************************************************
842 Determine number of UTF-16 u2s in the utf string.
844 CAUTION: Use this function *only* when you want to convert a utf string
845 to an array of u2s and want to know how many of them you will get.
846 All other uses of this function are probably wrong.
849 u............utf string
852 the number of u2s needed to hold this string in UTF-16 encoding.
853 There is _no_ terminating zero included in this count.
854 XXX 0 if a NullPointerException has been thrown (see below)
856 *******************************************************************************/
858 u4 utf_get_number_of_u2s(utf *u)
860 char *endpos; /* points behind utf string */
861 char *utf_ptr; /* current position in utf text */
862 u4 len = 0; /* number of unicode characters */
864 /* XXX this is probably not checked by most callers! Review this after */
865 /* the invalid uses of this function have been eliminated */
867 exceptions_throw_nullpointerexception();
874 while (utf_ptr < endpos) {
876 /* next unicode character */
877 utf_nextu2(&utf_ptr);
880 if (utf_ptr != endpos)
881 /* string ended abruptly */
882 throw_cacao_exception_exit(string_java_lang_InternalError,
883 "Illegal utf8 string");
889 /* u2_utflength ****************************************************************
891 Returns the utf length in bytes of a u2 array.
893 *******************************************************************************/
895 u4 u2_utflength(u2 *text, u4 u2_length)
897 u4 result_len = 0; /* utf length in bytes */
898 u2 ch; /* current unicode character */
901 for (len = 0; len < u2_length; len++) {
902 /* next unicode character */
905 /* determine bytes required to store unicode character as utf */
906 if (ch && (ch < 0x80))
918 /* utf_copy ********************************************************************
920 Copy the given utf string byte-for-byte to a buffer.
923 buffer.......the buffer
924 u............the utf string
926 *******************************************************************************/
928 void utf_copy(char *buffer, utf *u)
930 /* our utf strings are zero-terminated (done by utf_new) */
931 MCOPY(buffer, u->text, char, u->blength + 1);
935 /* utf_cat *********************************************************************
937 Append the given utf string byte-for-byte to a buffer.
940 buffer.......the buffer
941 u............the utf string
943 *******************************************************************************/
945 void utf_cat(char *buffer, utf *u)
947 /* our utf strings are zero-terminated (done by utf_new) */
948 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
952 /* utf_copy_classname **********************************************************
954 Copy the given utf classname byte-for-byte to a buffer.
955 '/' is replaced by '.'
958 buffer.......the buffer
959 u............the utf string
961 *******************************************************************************/
963 void utf_copy_classname(char *buffer, utf *u)
972 endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
974 while (srcptr != endptr) {
/* utf_cat_classname ***********************************************************
985 Append the given utf classname byte-for-byte to a buffer.
986 '/' is replaced by '.'
989 buffer.......the buffer
990 u............the utf string
992 *******************************************************************************/
994 void utf_cat_classname(char *buffer, utf *u)
996 utf_copy_classname(buffer + strlen(buffer), u);
999 /* utf_display_printable_ascii *************************************************
1001 Write utf symbol to stdout (for debugging purposes).
1002 Non-printable and non-ASCII characters are printed as '?'.
1004 *******************************************************************************/
1006 void utf_display_printable_ascii(utf *u)
1008 char *endpos; /* points behind utf string */
1009 char *utf_ptr; /* current position in utf text */
1017 endpos = UTF_END(u);
1020 while (utf_ptr < endpos) {
1021 /* read next unicode character */
1023 u2 c = utf_nextu2(&utf_ptr);
1025 if ((c >= 32) && (c <= 127))
1035 /* utf_display_printable_ascii_classname ***************************************
   Write utf symbol to stdout with `/' converted to `.' (for debugging
   purposes).
1039 Non-printable and non-ASCII characters are printed as '?'.
1041 *******************************************************************************/
1043 void utf_display_printable_ascii_classname(utf *u)
1045 char *endpos; /* points behind utf string */
1046 char *utf_ptr; /* current position in utf text */
1054 endpos = UTF_END(u);
1057 while (utf_ptr < endpos) {
1058 /* read next unicode character */
1060 u2 c = utf_nextu2(&utf_ptr);
1065 if ((c >= 32) && (c <= 127))
1075 /* utf_sprint_convert_to_latin1 ************************************************
1077 Write utf symbol into c-string (for debugging purposes).
   Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
   invalid results.
1081 *******************************************************************************/
1083 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1085 char *endpos; /* points behind utf string */
1086 char *utf_ptr; /* current position in utf text */
1087 u2 pos = 0; /* position in c-string */
1090 strcpy(buffer, "NULL");
1094 endpos = UTF_END(u);
1097 while (utf_ptr < endpos)
1098 /* copy next unicode character */
1099 buffer[pos++] = utf_nextu2(&utf_ptr);
1101 /* terminate string */
1106 /* utf_sprint_convert_to_latin1_classname **************************************
   Write utf symbol into c-string with `/' converted to `.' (for debugging
   purposes).
   Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
   invalid results.
1113 *******************************************************************************/
1115 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1117 char *endpos; /* points behind utf string */
1118 char *utf_ptr; /* current position in utf text */
1119 u2 pos = 0; /* position in c-string */
1122 strcpy(buffer, "NULL");
1126 endpos = UTF_END(u);
1129 while (utf_ptr < endpos) {
1130 /* copy next unicode character */
1131 u2 c = utf_nextu2(&utf_ptr);
1132 if (c == '/') c = '.';
1136 /* terminate string */
1141 /* utf_strcat_convert_to_latin1 ************************************************
1143 Like libc strcat, but uses an utf8 string.
   Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
   invalid results.
1147 *******************************************************************************/
1149 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1151 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1155 /* utf_strcat_convert_to_latin1_classname **************************************
1157 Like libc strcat, but uses an utf8 string.
   Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
   invalid results.
1161 *******************************************************************************/
1163 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1165 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1169 /* utf_fprint_printable_ascii **************************************************
1171 Write utf symbol into file.
1172 Non-printable and non-ASCII characters are printed as '?'.
1174 *******************************************************************************/
1176 void utf_fprint_printable_ascii(FILE *file, utf *u)
1178 char *endpos; /* points behind utf string */
1179 char *utf_ptr; /* current position in utf text */
1184 endpos = UTF_END(u);
1187 while (utf_ptr < endpos) {
1188 /* read next unicode character */
1189 u2 c = utf_nextu2(&utf_ptr);
1191 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1192 else fprintf(file, "?");
1197 /* utf_fprint_printable_ascii_classname ****************************************
1199 Write utf symbol into file with `/' converted to `.'.
1200 Non-printable and non-ASCII characters are printed as '?'.
1202 *******************************************************************************/
1204 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1206 char *endpos; /* points behind utf string */
1207 char *utf_ptr; /* current position in utf text */
1212 endpos = UTF_END(u);
1215 while (utf_ptr < endpos) {
1216 /* read next unicode character */
1217 u2 c = utf_nextu2(&utf_ptr);
1218 if (c == '/') c = '.';
1220 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1221 else fprintf(file, "?");
1226 /* is_valid_utf ****************************************************************
1228 Return true if the given string is a valid UTF-8 string.
1230 utf_ptr...points to first character
1231 end_pos...points after last character
1233 *******************************************************************************/
1235 /* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */
1237 bool is_valid_utf(char *utf_ptr, char *end_pos)
1244 if (end_pos < utf_ptr) return false;
1245 bytes = end_pos - utf_ptr;
1249 if (!c) return false; /* 0x00 is not allowed */
1250 if ((c & 0x80) == 0) continue; /* ASCII */
1252 if ((c & 0xe0) == 0xc0) len = 1; /* 110x xxxx */
1253 else if ((c & 0xf0) == 0xe0) len = 2; /* 1110 xxxx */
1254 else if ((c & 0xf8) == 0xf0) len = 3; /* 1111 0xxx */
1255 else if ((c & 0xfc) == 0xf8) len = 4; /* 1111 10xx */
1256 else if ((c & 0xfe) == 0xfc) len = 5; /* 1111 110x */
1257 else return false; /* invalid leading byte */
1259 if (len > 2) return false; /* Java limitation */
1261 v = (unsigned long)c & (0x3f >> len);
1263 if ((bytes -= len) < 0) return false; /* missing bytes */
1265 for (i = len; i--; ) {
1267 if ((c & 0xc0) != 0x80) /* 10xx xxxx */
1269 v = (v << 6) | (c & 0x3f);
1273 if (len != 1) return false; /* Java special */
1276 /* Sun Java seems to allow overlong UTF-8 encodings */
1278 /* if (v < min_codepoint[len]) */
1279 /* XXX throw exception? */
1282 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
1283 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
1285 /* even these seem to be allowed */
1286 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
1293 /* is_valid_name ***************************************************************
   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings and control
   characters.)
1299 NOTE: The string is assumed to have passed is_valid_utf!
1301 utf_ptr...points to first character
1302 end_pos...points after last character
1304 *******************************************************************************/
1306 bool is_valid_name(char *utf_ptr, char *end_pos)
1308 if (end_pos <= utf_ptr) return false; /* disallow empty names */
1310 while (utf_ptr < end_pos) {
1311 unsigned char c = *utf_ptr++;
1313 if (c < 0x20) return false; /* disallow control characters */
1314 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80) /* disallow zero */
1321 bool is_valid_name_utf(utf *u)
1323 return is_valid_name(u->text, UTF_END(u));
1327 /* utf_show ********************************************************************
1329 Writes the utf symbols in the utfhash to stdout and displays the
1330 number of external hash chains grouped according to the chainlength
1331 (for debugging purposes).
1333 *******************************************************************************/
1335 #if !defined(NDEBUG)
1339 #define CHAIN_LIMIT 20 /* limit for seperated enumeration */
1341 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1342 u4 max_chainlength = 0; /* maximum length of the chains */
1343 u4 sum_chainlength = 0; /* sum of the chainlengths */
1344 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1347 printf("UTF-HASH:\n");
1349 /* show element of utf-hashtable */
1351 for (i = 0; i < hashtable_utf->size; i++) {
1352 utf *u = hashtable_utf->ptr[i];
1355 printf("SLOT %d: ", (int) i);
1359 utf_display_printable_ascii(u);
1367 printf("UTF-HASH: %d slots for %d entries\n",
1368 (int) hashtable_utf->size, (int) hashtable_utf->entries );
1370 if (hashtable_utf->entries == 0)
1373 printf("chains:\n chainlength number of chains %% of utfstrings\n");
1375 for (i=0;i<CHAIN_LIMIT;i++)
1378 /* count numbers of hashchains according to their length */
1379 for (i=0; i<hashtable_utf->size; i++) {
1381 utf *u = (utf*) hashtable_utf->ptr[i];
1382 u4 chain_length = 0;
1384 /* determine chainlength */
1390 /* update sum of all chainlengths */
1391 sum_chainlength+=chain_length;
1393 /* determine the maximum length of the chains */
1394 if (chain_length>max_chainlength)
1395 max_chainlength = chain_length;
1397 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1398 if (chain_length>=CHAIN_LIMIT) {
1399 beyond_limit+=chain_length;
1400 chain_length=CHAIN_LIMIT-1;
1403 /* update number of hashchains of current length */
1404 chain_count[chain_length]++;
1407 /* display results */
1408 for (i=1;i<CHAIN_LIMIT-1;i++)
1409 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1411 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1414 printf("max. chainlength:%5d\n",max_chainlength);
1416 /* avg. chainlength = sum of chainlengths / number of chains */
1417 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1419 #endif /* !defined(NDEBUG) */
1423 * These are local overrides for various environment variables in Emacs.
1424 * Please do not remove this and leave it at the end of the file, where
1425 * Emacs will automagically detect them.
1426 * ---------------------------------------------------------------------
1429 * indent-tabs-mode: t
1433 * vim:noexpandtab:sw=4:ts=4: