1 /* src/vm/utf8.c - utf8 string functions
3 Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4 C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5 E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6 J. Wenninger, Institut f. Computersprachen - TU Wien
8 This file is part of CACAO.
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License as
12 published by the Free Software Foundation; either version 2, or (at
13 your option) any later version.
15 This program is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25 Contact: cacao@cacaojvm.org
27 Authors: Reinhard Grafl
34 $Id: utf8.c 5123 2006-07-12 21:45:34Z twisti $
46 #include "mm/memory.h"
48 #if defined(ENABLE_THREADS)
49 # include "threads/native/lock.h"
51 # include "threads/none/lock.h"
54 #include "vm/builtin.h"
55 #include "vm/exceptions.h"
56 #include "vm/hashtable.h"
57 #include "vm/options.h"
58 #include "vm/statistics.h"
59 #include "vm/stringlocal.h"
63 /* global variables ***********************************************************/
65 /* hashsize must be power of 2 */
67 #define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */
69 hashtable *hashtable_utf; /* hashtable for utf8-symbols */
72 /* utf-symbols for pointer comparison of frequently used strings **************/
74 utf *utf_java_lang_Object;
76 utf *utf_java_lang_Class;
77 utf *utf_java_lang_ClassLoader;
78 utf *utf_java_lang_Cloneable;
79 utf *utf_java_lang_SecurityManager;
80 utf *utf_java_lang_String;
81 utf *utf_java_lang_System;
82 utf *utf_java_lang_ThreadGroup;
83 utf *utf_java_io_Serializable;
85 utf *utf_java_lang_Throwable;
86 utf *utf_java_lang_VMThrowable;
87 utf *utf_java_lang_Error;
88 utf *utf_java_lang_AbstractMethodError;
89 utf *utf_java_lang_LinkageError;
90 utf *utf_java_lang_NoClassDefFoundError;
91 utf *utf_java_lang_NoSuchMethodError;
92 utf *utf_java_lang_OutOfMemoryError;
94 utf *utf_java_lang_Exception;
95 utf *utf_java_lang_ClassCastException;
96 utf *utf_java_lang_ClassNotFoundException;
97 utf *utf_java_lang_IllegalArgumentException;
98 utf *utf_java_lang_IllegalMonitorStateException;
100 utf *utf_java_lang_NullPointerException;
102 utf* utf_java_lang_Void;
103 utf* utf_java_lang_Boolean;
104 utf* utf_java_lang_Byte;
105 utf* utf_java_lang_Character;
106 utf* utf_java_lang_Short;
107 utf* utf_java_lang_Integer;
108 utf* utf_java_lang_Long;
109 utf* utf_java_lang_Float;
110 utf* utf_java_lang_Double;
112 utf *utf_java_lang_StackTraceElement;
113 utf *utf_java_lang_reflect_Constructor;
114 utf *utf_java_lang_reflect_Field;
115 utf *utf_java_lang_reflect_Method;
116 utf *utf_java_util_Vector;
118 utf *utf_InnerClasses; /* InnerClasses */
119 utf *utf_ConstantValue; /* ConstantValue */
120 utf *utf_Code; /* Code */
121 utf *utf_Exceptions; /* Exceptions */
122 utf *utf_LineNumberTable; /* LineNumberTable */
123 utf *utf_SourceFile; /* SourceFile */
125 utf *utf_init; /* <init> */
126 utf *utf_clinit; /* <clinit> */
127 utf *utf_clone; /* clone */
128 utf *utf_finalize; /* finalize */
129 utf *utf_run; /* run */
131 utf *utf_add; /* add */
132 utf *utf_remove; /* remove */
133 utf *utf_put; /* put */
134 utf *utf_get; /* get */
135 utf *utf_value; /* value */
137 utf *utf_fillInStackTrace;
138 utf *utf_getSystemClassLoader;
140 utf *utf_printStackTrace;
151 utf *utf_void__void; /* ()V */
152 utf *utf_boolean__void; /* (Z)V */
153 utf *utf_byte__void; /* (B)V */
154 utf *utf_char__void; /* (C)V */
155 utf *utf_short__void; /* (S)V */
156 utf *utf_int__void; /* (I)V */
157 utf *utf_long__void; /* (J)V */
158 utf *utf_float__void; /* (F)V */
159 utf *utf_double__void; /* (D)V */
161 utf *utf_void__java_lang_ClassLoader; /* ()Ljava/lang/ClassLoader; */
162 utf *utf_void__java_lang_Object; /* ()Ljava/lang/Object; */
163 utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */
164 utf *utf_java_lang_Object__java_lang_Object;
165 utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */
166 utf *utf_java_lang_String__java_lang_Class;
167 utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */
169 utf *utf_not_named_yet; /* special name for unnamed classes */
171 utf *array_packagename;
174 /* utf_init ********************************************************************
176 Initializes the utf8 subsystem.
178 *******************************************************************************/
182 /* create utf8 hashtable */
184 hashtable_utf = NEW(hashtable);
186 hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
188 #if defined(ENABLE_STATISTICS)
190 count_utf_len += sizeof(utf*) * hashtable_utf->size;
193 /* create utf-symbols for pointer comparison of frequently used strings */
195 utf_java_lang_Object = utf_new_char("java/lang/Object");
197 utf_java_lang_Class = utf_new_char("java/lang/Class");
198 utf_java_lang_ClassLoader = utf_new_char("java/lang/ClassLoader");
199 utf_java_lang_Cloneable = utf_new_char("java/lang/Cloneable");
200 utf_java_lang_SecurityManager = utf_new_char("java/lang/SecurityManager");
201 utf_java_lang_String = utf_new_char("java/lang/String");
202 utf_java_lang_System = utf_new_char("java/lang/System");
203 utf_java_lang_ThreadGroup = utf_new_char("java/lang/ThreadGroup");
204 utf_java_io_Serializable = utf_new_char("java/io/Serializable");
206 utf_java_lang_Throwable = utf_new_char(string_java_lang_Throwable);
207 utf_java_lang_VMThrowable = utf_new_char(string_java_lang_VMThrowable);
208 utf_java_lang_Error = utf_new_char(string_java_lang_Error);
210 utf_java_lang_AbstractMethodError =
211 utf_new_char(string_java_lang_AbstractMethodError);
213 utf_java_lang_LinkageError =
214 utf_new_char(string_java_lang_LinkageError);
216 utf_java_lang_NoClassDefFoundError =
217 utf_new_char(string_java_lang_NoClassDefFoundError);
219 utf_java_lang_NoSuchMethodError =
220 utf_new_char(string_java_lang_NoSuchMethodError);
222 utf_java_lang_OutOfMemoryError =
223 utf_new_char(string_java_lang_OutOfMemoryError);
225 utf_java_lang_Exception = utf_new_char(string_java_lang_Exception);
227 utf_java_lang_ClassCastException =
228 utf_new_char(string_java_lang_ClassCastException);
230 utf_java_lang_ClassNotFoundException =
231 utf_new_char(string_java_lang_ClassNotFoundException);
233 utf_java_lang_IllegalArgumentException =
234 utf_new_char(string_java_lang_IllegalArgumentException);
236 utf_java_lang_IllegalMonitorStateException =
237 utf_new_char(string_java_lang_IllegalMonitorStateException);
239 utf_java_lang_NullPointerException =
240 utf_new_char(string_java_lang_NullPointerException);
242 utf_java_lang_Void = utf_new_char("java/lang/Void");
243 utf_java_lang_Boolean = utf_new_char("java/lang/Boolean");
244 utf_java_lang_Byte = utf_new_char("java/lang/Byte");
245 utf_java_lang_Character = utf_new_char("java/lang/Character");
246 utf_java_lang_Short = utf_new_char("java/lang/Short");
247 utf_java_lang_Integer = utf_new_char("java/lang/Integer");
248 utf_java_lang_Long = utf_new_char("java/lang/Long");
249 utf_java_lang_Float = utf_new_char("java/lang/Float");
250 utf_java_lang_Double = utf_new_char("java/lang/Double");
252 utf_java_lang_StackTraceElement =
253 utf_new_char("java/lang/StackTraceElement");
255 utf_java_lang_reflect_Constructor =
256 utf_new_char("java/lang/reflect/Constructor");
258 utf_java_lang_reflect_Field = utf_new_char("java/lang/reflect/Field");
259 utf_java_lang_reflect_Method = utf_new_char("java/lang/reflect/Method");
260 utf_java_util_Vector = utf_new_char("java/util/Vector");
262 utf_InnerClasses = utf_new_char("InnerClasses");
263 utf_ConstantValue = utf_new_char("ConstantValue");
264 utf_Code = utf_new_char("Code");
265 utf_Exceptions = utf_new_char("Exceptions");
266 utf_LineNumberTable = utf_new_char("LineNumberTable");
267 utf_SourceFile = utf_new_char("SourceFile");
269 utf_init = utf_new_char("<init>");
270 utf_clinit = utf_new_char("<clinit>");
271 utf_clone = utf_new_char("clone");
272 utf_finalize = utf_new_char("finalize");
273 utf_run = utf_new_char("run");
275 utf_add = utf_new_char("add");
276 utf_remove = utf_new_char("remove");
277 utf_put = utf_new_char("put");
278 utf_get = utf_new_char("get");
279 utf_value = utf_new_char("value");
281 utf_printStackTrace = utf_new_char("printStackTrace");
282 utf_fillInStackTrace = utf_new_char("fillInStackTrace");
283 utf_loadClass = utf_new_char("loadClass");
284 utf_getSystemClassLoader = utf_new_char("getSystemClassLoader");
286 utf_Z = utf_new_char("Z");
287 utf_B = utf_new_char("B");
288 utf_C = utf_new_char("C");
289 utf_S = utf_new_char("S");
290 utf_I = utf_new_char("I");
291 utf_J = utf_new_char("J");
292 utf_F = utf_new_char("F");
293 utf_D = utf_new_char("D");
295 utf_void__void = utf_new_char("()V");
296 utf_boolean__void = utf_new_char("(Z)V");
297 utf_byte__void = utf_new_char("(B)V");
298 utf_char__void = utf_new_char("(C)V");
299 utf_short__void = utf_new_char("(S)V");
300 utf_int__void = utf_new_char("(I)V");
301 utf_long__void = utf_new_char("(J)V");
302 utf_float__void = utf_new_char("(F)V");
303 utf_double__void = utf_new_char("(D)V");
304 utf_void__java_lang_Object = utf_new_char("()Ljava/lang/Object;");
305 utf_void__java_lang_Throwable = utf_new_char("()Ljava/lang/Throwable;");
307 utf_void__java_lang_ClassLoader =
308 utf_new_char("()Ljava/lang/ClassLoader;");
310 utf_java_lang_Object__java_lang_Object =
311 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
313 utf_java_lang_String__void = utf_new_char("(Ljava/lang/String;)V");
315 utf_java_lang_String__java_lang_Class =
316 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
318 utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V");
320 utf_null = utf_new_char("null");
321 utf_not_named_yet = utf_new_char("\t<not_named_yet>");
322 array_packagename = utf_new_char("\t<the array package>");
324 /* everything's ok */
330 /* utf_hashkey *****************************************************************
332 The hashkey is computed from the utf-text by using up to 8
333 characters. For utf-symbols longer than 15 characters 3 characters
334 are taken from the beginning and the end, 2 characters are taken
337 *******************************************************************************/
/* Helper macros for utf_hashkey. Both read through the function's `text` */
/* pointer: nbs() pre-increments `text` and then reads, so every use moves */
/* the read position; fbs() reads the byte at the current position. */
/* NOTE(review): `val` is not parenthesized in either expansion. */
339 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val */
340 #define fbs(val) ((u4) *( text) << val) /* get first byte, left shift by val */
/* Computes the hashkey for `length` bytes starting at `text`; the switch */
/* on `length` selects which bytes are mixed in and with which shifts. */
/* NOTE(review): expressions such as `fbs(0) ^ nbs(3) ^ nbs(5)` apply nbs() */
/* more than once inside a single expression, i.e. `text` is modified */
/* several times with no sequence point in between. The result relies on */
/* left-to-right evaluation, which C does not guarantee -- confirm. */
/* NOTE(review): bytes are read through a plain `char` pointer, so the */
/* value of bytes >= 0x80 before the cast depends on char signedness. */
342 u4 utf_hashkey(const char *text, u4 length)
344 const char *start_pos = text; /* pointer to utf text */
348 case 0: /* empty string */
351 case 1: return fbs(0);
352 case 2: return fbs(0) ^ nbs(3);
353 case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
354 case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
355 case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
356 case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
357 case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
358 case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
365 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
374 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
383 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
395 return a ^ nbs(9) ^ nbs(10);
407 return a ^ nbs(9) ^ nbs(10);
418 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
429 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
431 default: /* 3 characters from beginning */
437 /* 2 characters from middle */
/* reposition relative to the saved start, not the current read position */
438 text = start_pos + (length / 2);
443 /* 3 characters from end */
444 text = start_pos + length - 4;
449 return a ^ nbs(10) ^ nbs(11);
453 /* utf_full_hashkey ************************************************************
455 This function computes a hash value using all bytes in the string.
457 The algorithm is the "One-at-a-time" algorithm as published
458 by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
460 *******************************************************************************/
/* Hashes the bytes of `text`; see the banner comment for the algorithm. */
462 u4 utf_full_hashkey(const char *text, u4 length)
/* bytes are viewed as unsigned char, so the input is read sign-independently */
464 register const unsigned char *p = (const unsigned char *) text;
/* shift-and-add / shift-and-xor mixing steps applied to the running hash */
472 hash += (hash << 10);
476 hash ^= (hash >> 11);
477 hash += (hash << 15);
482 /* unicode_hashkey *************************************************************
484 Compute the hashkey of a unicode string.
486 *******************************************************************************/
488 u4 unicode_hashkey(u2 *text, u2 len)
490 return utf_hashkey((char *) text, len);
494 /* utf_new *********************************************************************
496 Creates a new utf-symbol, the text of the symbol is passed as a
497 u1-array. The function searches the utf-hashtable for a utf-symbol
498 with this text. On success the element is returned, otherwise a new
499 hashtable element is created.
501 If the number of entries in the hashtable exceeds twice the size of
502 the hashtable slots a reorganization of the hashtable is done and
503 the utf symbols are copied to a new hashtable with doubled size.
505 *******************************************************************************/
/* Looks up the byte string (text, length) in hashtable_utf and returns the */
/* stored symbol when one with the same length and bytes exists; otherwise */
/* a new element holding a zero-terminated copy of the text is linked in. */
/* The hashtable monitor is entered at the top and left on each visible */
/* exit path. */
507 utf *utf_new(const char *text, u2 length)
509 u4 key; /* hashkey computed from utf-text */
510 u4 slot; /* slot in hashtable */
511 utf *u; /* hashtable element */
514 LOCK_MONITOR_ENTER(hashtable_utf->header);
516 #if defined(ENABLE_STATISTICS)
521 key = utf_hashkey(text, length);
/* masking with size - 1 picks the slot; relies on a power-of-two size */
522 slot = key & (hashtable_utf->size - 1);
523 u = hashtable_utf->ptr[slot];
525 /* search external hash chain for utf-symbol */
528 if (u->blength == length) {
529 /* compare text of hashtable elements */
531 for (i = 0; i < length; i++)
532 if (text[i] != u->text[i])
535 #if defined(ENABLE_STATISTICS)
537 count_utf_new_found++;
540 /* symbol found in hashtable */
542 LOCK_MONITOR_EXIT(hashtable_utf->header);
548 u = u->hashlink; /* next element in external chain */
551 #if defined(ENABLE_STATISTICS)
553 count_utf_len += sizeof(utf) + length + 1;
556 /* location in hashtable found, create new utf element */
558 u->blength = length; /* length in bytes of utfstring */
/* the new element becomes the head of the slot's chain */
559 u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */
/* one extra byte for the terminating zero written below */
560 u->text = mem_alloc(length + 1);/* allocate memory for utf-text */
562 memcpy(u->text, text, length); /* copy utf-text */
563 u->text[length] = '\0';
565 hashtable_utf->ptr[slot] = u; /* insert symbol into table */
566 hashtable_utf->entries++; /* update number of entries */
/* grow once there are more than two entries per slot on average */
568 if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
570 /* reorganization of hashtable, average length of the external
571 chains is approx. 2 */
573 hashtable *newhash; /* the new hashtable */
579 /* create new hashtable, double the size */
581 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
583 #if defined(ENABLE_STATISTICS)
585 count_utf_len += sizeof(utf*) * hashtable_utf->size;
588 /* transfer elements to new hashtable */
590 for (i = 0; i < hashtable_utf->size; i++) {
591 u = hashtable_utf->ptr[i];
/* every element is rehashed against the new table's mask */
595 slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
/* push the element onto the front of its new chain */
597 u->hashlink = (utf *) newhash->ptr[slot];
598 newhash->ptr[slot] = u;
600 /* follow link in external hash chain */
606 /* dispose old table */
608 hashtable_free(hashtable_utf);
610 hashtable_utf = newhash;
/* NOTE(review): after a resize this exit goes through the NEW table's */
/* header while the enter above used the old table's; presumably */
/* hashtable_resize carries the header over -- confirm. */
613 LOCK_MONITOR_EXIT(hashtable_utf->header);
619 /* utf_new_u2 ******************************************************************
621 Make utf symbol from u2 array, if isclassname is true '.' is
624 *******************************************************************************/
/* Encodes a u2 array into a temporary byte buffer and interns the result */
/* via utf_new. The buffer is sized with u2_utflength beforehand and is */
/* released again after the symbol has been created. */
626 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
628 char *buffer; /* memory buffer for unicode characters */
629 char *pos; /* pointer to current position in buffer */
630 u4 left; /* unicode characters left */
631 u4 buflength; /* utf length in bytes of the u2 array */
632 utf *result; /* resulting utf-string */
635 /* determine utf length in bytes and allocate memory */
637 buflength = u2_utflength(unicode_pos, unicode_length);
638 buffer = MNEW(char, buflength);
/* i is compared before its post-increment, so the body runs */
/* unicode_length times while unicode_pos walks the input */
643 for (i = 0; i++ < unicode_length; unicode_pos++) {
644 /* next unicode character */
/* non-zero values below 0x80 take the single-byte branch; zero falls */
/* through to the two-byte branch below */
647 if ((c != 0) && (c < 0x80)) {
/* left is unsigned; the cast to int exposes an underflow as negative */
650 if ((int) left < 0) break;
651 /* convert classname */
652 if (isclassname && c == '.')
/* two-byte form: lead byte tagged with 0xC0 */
657 } else if (c < 0x800) {
659 unsigned char high = c >> 6;
660 unsigned char low = c & 0x3F;
662 if ((int) left < 0) break;
663 *pos++ = high | 0xC0;
/* remaining values use the three-byte form: lead byte tagged with 0xE0 */
669 char mid = (c >> 6) & 0x3F;
672 if ((int) left < 0) break;
673 *pos++ = high | 0xE0;
679 /* insert utf-string into symbol-table */
/* NOTE(review): buflength is u4 but utf_new takes a u2 length, so a */
/* result longer than 65535 bytes would be narrowed here -- confirm */
/* that callers cannot reach that size. */
680 result = utf_new(buffer,buflength);
682 MFREE(buffer, char, buflength);
688 /* utf_new_char ****************************************************************
690 Creates a new utf symbol, the text for this symbol is passed as a
691 c-string ( = char* ).
693 *******************************************************************************/
695 utf *utf_new_char(const char *text)
697 return utf_new(text, strlen(text));
701 /* utf_new_char_classname ******************************************************
703 Creates a new utf symbol, the text for this symbol is passed as a
704 c-string ( = char* ) "." characters are going to be replaced by
705 "/". Since the above function is used often, this is a separate
706 function, instead of an if.
708 *******************************************************************************/
/* Interns a class name given as c-string, with '.' rewritten to '/'. */
/* A private copy is only made when the text actually contains a '.'; */
/* otherwise the text is interned directly. */
710 utf *utf_new_char_classname(const char *text)
712 if (strchr(text, '.')) {
/* NOTE(review): the strdup result is used without a NULL check in the */
/* lines visible here -- confirm. */
713 char *txt = strdup(text);
714 char *end = txt + strlen(txt);
/* rewrite the separators in the private copy */
718 for (c = txt; c < end; c++)
719 if (*c == '.') *c = '/';
721 tmpRes = utf_new(txt, strlen(txt));
/* no '.' present: the caller's text can be used unchanged */
727 return utf_new(text, strlen(text));
731 /* utf_nextu2 ******************************************************************
733 Read the next unicode character from the utf string and increment
734 the utf-string pointer accordingly.
736 *******************************************************************************/
/* Decodes one character at *utf_ptr and advances *utf_ptr past it. */
/* No end pointer is passed in: the second and third byte are inspected */
/* purely on the basis of the first byte's high nibble. */
738 u2 utf_nextu2(char **utf_ptr)
740 /* uncompressed unicode character */
742 /* current position in utf text */
743 unsigned char *utf = (unsigned char *) (*utf_ptr);
744 /* bytes representing the unicode character */
745 unsigned char ch1, ch2, ch3;
746 /* number of bytes used to represent the unicode character */
/* dispatch on the high nibble of the first byte */
749 switch ((ch1 = utf[0]) >> 4) {
750 default: /* 1 byte */
754 case 0xD: /* 2 bytes */
/* the second byte must carry the 10xxxxxx continuation tag */
755 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
756 unsigned char high = ch1 & 0x1F;
757 unsigned char low = ch2 & 0x3F;
758 unicode_char = (high << 6) + low;
763 case 0xE: /* 2 or 3 bytes */
/* both following bytes must carry the continuation tag */
764 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
765 if (((ch3 = utf[2]) & 0xC0) == 0x80) {
766 unsigned char low = ch3 & 0x3f;
767 unsigned char mid = ch2 & 0x3f;
768 unsigned char high = ch1 & 0x0f;
/* 4 + 6 + 6 payload bits */
769 unicode_char = (((high << 6) + mid) << 6) + low;
777 /* update position in utf-text */
778 *utf_ptr = (char *) (utf + len);
784 /* utf_bytes *******************************************************************
786 Determine number of bytes (aka. octets) in the utf string.
789 u............utf string
792 The number of octets of this utf string.
793 There is _no_ terminating zero included in this count.
795 *******************************************************************************/
802 /* utf_get_number_of_u2s_for_buffer ********************************************
804 Determine number of UTF-16 u2s in the given UTF-8 buffer
806 CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
807 to an array of u2s (UTF-16) and want to know how many of them you will get.
808 All other uses of this function are probably wrong.
811 buffer........points to first char in buffer
812 blength.......number of _bytes_ in the buffer
815 the number of u2s needed to hold this string in UTF-16 encoding.
816 There is _no_ terminating zero included in this count.
818 NOTE: Unlike utf_get_number_of_u2s, this function never throws an
821 *******************************************************************************/
/* Walks the buffer character by character with utf_nextu2 until the end */
/* position is reached. */
823 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
825 const char *endpos; /* points behind utf string */
826 const char *utf_ptr; /* current position in utf text */
827 u4 len = 0; /* number of unicode characters */
830 endpos = utf_ptr + blength;
832 while (utf_ptr < endpos) {
834 /* next unicode character */
/* utf_nextu2 takes a char **, hence the cast that drops const */
835 utf_nextu2((char **)&utf_ptr);
/* decoding must stop exactly at the end of the buffer; an overshoot */
/* means the last sequence was cut off */
838 assert(utf_ptr == endpos);
844 /* utf_get_number_of_u2s *******************************************************
846 Determine number of UTF-16 u2s in the utf string.
848 CAUTION: Use this function *only* when you want to convert a utf string
849 to an array of u2s and want to know how many of them you will get.
850 All other uses of this function are probably wrong.
853 u............utf string
856 the number of u2s needed to hold this string in UTF-16 encoding.
857 There is _no_ terminating zero included in this count.
858 XXX 0 if a NullPointerException has been thrown (see below)
860 *******************************************************************************/
/* Walks the text of the utf string character by character with */
/* utf_nextu2 and verifies that decoding ends exactly at the end of the */
/* string. */
862 u4 utf_get_number_of_u2s(utf *u)
864 char *endpos; /* points behind utf string */
865 char *utf_ptr; /* current position in utf text */
866 u4 len = 0; /* number of unicode characters */
868 /* XXX this is probably not checked by most callers! Review this after */
869 /* the invalid uses of this function have been eliminated */
871 exceptions_throw_nullpointerexception();
878 while (utf_ptr < endpos) {
880 /* next unicode character */
881 utf_nextu2(&utf_ptr);
/* an overshoot means the final multi-byte sequence was cut off */
884 if (utf_ptr != endpos)
885 /* string ended abruptly */
886 throw_cacao_exception_exit(string_java_lang_InternalError,
887 "Illegal utf8 string");
893 /* u2_utflength ****************************************************************
895 Returns the utf length in bytes of a u2 array.
897 *******************************************************************************/
/* Sums up, per input character, the number of bytes its utf encoding */
/* needs. */
899 u4 u2_utflength(u2 *text, u4 u2_length)
901 u4 result_len = 0; /* utf length in bytes */
902 u2 ch; /* current unicode character */
905 for (len = 0; len < u2_length; len++) {
906 /* next unicode character */
909 /* determine bytes required to store unicode character as utf */
/* zero is excluded from this branch, matching the test in utf_new_u2 */
910 if (ch && (ch < 0x80))
922 /* utf_copy ********************************************************************
924 Copy the given utf string byte-for-byte to a buffer.
927 buffer.......the buffer
928 u............the utf string
930 *******************************************************************************/
932 void utf_copy(char *buffer, utf *u)
934 /* our utf strings are zero-terminated (done by utf_new) */
935 MCOPY(buffer, u->text, char, u->blength + 1);
939 /* utf_cat *********************************************************************
941 Append the given utf string byte-for-byte to a buffer.
944 buffer.......the buffer
945 u............the utf string
947 *******************************************************************************/
949 void utf_cat(char *buffer, utf *u)
951 /* our utf strings are zero-terminated (done by utf_new) */
952 MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
956 /* utf_copy_classname **********************************************************
958 Copy the given utf classname byte-for-byte to a buffer.
959 '/' is replaced by '.'
962 buffer.......the buffer
963 u............the utf string
965 *******************************************************************************/
/* Copies the text of u into buffer; see the banner comment for the */
/* '/' to '.' conversion. */
967 void utf_copy_classname(char *buffer, utf *u)
/* the end pointer is placed one byte behind UTF_END(u), so the range */
/* walked below includes the terminating zero */
976 endptr = UTF_END(u) + 1; /* utfs are zero-terminated by utf_new */
978 while (srcptr != endptr) {
987 /* utf_cat_classname ***********************************************************
989 Append the given utf classname byte-for-byte to a buffer.
990 '/' is replaced by '.'
993 buffer.......the buffer
994 u............the utf string
996 *******************************************************************************/
998 void utf_cat_classname(char *buffer, utf *u)
1000 utf_copy_classname(buffer + strlen(buffer), u);
1003 /* utf_display_printable_ascii *************************************************
1005 Write utf symbol to stdout (for debugging purposes).
1006 Non-printable and non-ASCII characters are printed as '?'.
1008 *******************************************************************************/
/* Decodes the utf string character by character and tests each one */
/* against the range 32..127. */
1010 void utf_display_printable_ascii(utf *u)
1012 char *endpos; /* points behind utf string */
1013 char *utf_ptr; /* current position in utf text */
1021 endpos = UTF_END(u);
1024 while (utf_ptr < endpos) {
1025 /* read next unicode character */
1027 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the upper bound includes 127 (DEL) -- confirm intent */
1029 if ((c >= 32) && (c <= 127))
1039 /* utf_display_printable_ascii_classname ***************************************
1041 Write utf symbol to stdout with `/' converted to `.' (for debugging
1043 Non-printable and non-ASCII characters are printed as '?'.
1045 *******************************************************************************/
/* Decodes the utf string character by character and tests each one */
/* against the range 32..127. */
1047 void utf_display_printable_ascii_classname(utf *u)
1049 char *endpos; /* points behind utf string */
1050 char *utf_ptr; /* current position in utf text */
1058 endpos = UTF_END(u);
1061 while (utf_ptr < endpos) {
1062 /* read next unicode character */
1064 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the upper bound includes 127 (DEL) -- confirm intent */
1069 if ((c >= 32) && (c <= 127))
1079 /* utf_sprint_convert_to_latin1 ************************************************
1081 Write utf symbol into c-string (for debugging purposes).
1082 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1085 *******************************************************************************/
/* Decodes the utf string and stores one char per decoded character in */
/* buffer. The function receives no buffer size, so the caller has to */
/* provide enough room. */
1087 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1089 char *endpos; /* points behind utf string */
1090 char *utf_ptr; /* current position in utf text */
1091 u2 pos = 0; /* position in c-string */
/* placeholder text written into the buffer */
1094 strcpy(buffer, "NULL");
1098 endpos = UTF_END(u);
1101 while (utf_ptr < endpos)
1102 /* copy next unicode character */
/* the u2 result is narrowed to char by this assignment */
1103 buffer[pos++] = utf_nextu2(&utf_ptr);
1105 /* terminate string */
1110 /* utf_sprint_convert_to_latin1_classname **************************************
1112 Write utf symbol into c-string with `/' converted to `.' (for debugging
1114 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1117 *******************************************************************************/
/* Decodes the utf string into buffer, turning '/' into '.' on the way. */
/* The function receives no buffer size, so the caller has to provide */
/* enough room. */
1119 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1121 char *endpos; /* points behind utf string */
1122 char *utf_ptr; /* current position in utf text */
1123 u2 pos = 0; /* position in c-string */
/* placeholder text written into the buffer */
1126 strcpy(buffer, "NULL");
1130 endpos = UTF_END(u);
1133 while (utf_ptr < endpos) {
1134 /* copy next unicode character */
1135 u2 c = utf_nextu2(&utf_ptr);
/* package separator of the internal form becomes the Java-source one */
1136 if (c == '/') c = '.';
1140 /* terminate string */
1145 /* utf_strcat_convert_to_latin1 ************************************************
1147 Like libc strcat, but uses an utf8 string.
1148 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1151 *******************************************************************************/
1153 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1155 utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1159 /* utf_strcat_convert_to_latin1_classname **************************************
1161 Like libc strcat, but uses an utf8 string.
1162 Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1165 *******************************************************************************/
1167 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1169 utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1173 /* utf_fprint_printable_ascii **************************************************
1175 Write utf symbol into file.
1176 Non-printable and non-ASCII characters are printed as '?'.
1178 *******************************************************************************/
/* Decodes the utf string and writes one output character per decoded */
/* character to file: the character itself when it lies in 32..127, */
/* otherwise '?'. */
1180 void utf_fprint_printable_ascii(FILE *file, utf *u)
1182 char *endpos; /* points behind utf string */
1183 char *utf_ptr; /* current position in utf text */
1188 endpos = UTF_END(u);
1191 while (utf_ptr < endpos) {
1192 /* read next unicode character */
1193 u2 c = utf_nextu2(&utf_ptr);
/* NOTE(review): the upper bound lets 127 (DEL) through unchanged -- */
/* confirm intent */
1195 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1196 else fprintf(file, "?");
1201 /* utf_fprint_printable_ascii_classname ****************************************
1203 Write utf symbol into file with `/' converted to `.'.
1204 Non-printable and non-ASCII characters are printed as '?'.
1206 *******************************************************************************/
/* Decodes the utf string and writes one output character per decoded */
/* character to file, after turning '/' into '.': the character itself */
/* when it lies in 32..127, otherwise '?'. */
1208 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1210 char *endpos; /* points behind utf string */
1211 char *utf_ptr; /* current position in utf text */
1216 endpos = UTF_END(u);
1219 while (utf_ptr < endpos) {
1220 /* read next unicode character */
1221 u2 c = utf_nextu2(&utf_ptr);
1222 if (c == '/') c = '.';
/* NOTE(review): the upper bound lets 127 (DEL) through unchanged -- */
/* confirm intent */
1224 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1225 else fprintf(file, "?");
1230 /* is_valid_utf ****************************************************************
1232 Return true if the given string is a valid UTF-8 string.
1234 utf_ptr...points to first character
1235 end_pos...points after last character
1237 *******************************************************************************/
1239 /* static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */
/* Validates the byte range [utf_ptr, end_pos). A range whose end lies */
/* before its start is rejected outright. */
1241 bool is_valid_utf(char *utf_ptr, char *end_pos)
1248 if (end_pos < utf_ptr) return false;
/* bytes counts the bytes that are still unread */
1249 bytes = end_pos - utf_ptr;
1253 if (!c) return false; /* 0x00 is not allowed */
1254 if ((c & 0x80) == 0) continue; /* ASCII */
/* len is the number of continuation bytes the lead byte announces */
1256 if ((c & 0xe0) == 0xc0) len = 1; /* 110x xxxx */
1257 else if ((c & 0xf0) == 0xe0) len = 2; /* 1110 xxxx */
1258 else if ((c & 0xf8) == 0xf0) len = 3; /* 1111 0xxx */
1259 else if ((c & 0xfc) == 0xf8) len = 4; /* 1111 10xx */
1260 else if ((c & 0xfe) == 0xfc) len = 5; /* 1111 110x */
1261 else return false; /* invalid leading byte */
/* the longer forms are recognized above only to be rejected here */
1263 if (len > 2) return false; /* Java limitation */
/* keep only the payload bits of the lead byte */
1265 v = (unsigned long)c & (0x3f >> len);
/* all announced continuation bytes must lie inside the range */
1267 if ((bytes -= len) < 0) return false; /* missing bytes */
1269 for (i = len; i--; ) {
1271 if ((c & 0xc0) != 0x80) /* 10xx xxxx */
/* append the six payload bits of the continuation byte */
1273 v = (v << 6) | (c & 0x3f);
1277 if (len != 1) return false; /* Java special */
1280 /* Sun Java seems to allow overlong UTF-8 encodings */
1282 /* if (v < min_codepoint[len]) */
1283 /* XXX throw exception? */
1286 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
1287 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
1289 /* even these seem to be allowed */
1290 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
1297 /* is_valid_name ***************************************************************
1299 Return true if the given string may be used as a class/field/method
1300 name. (Currently this only disallows empty strings and control
1303 NOTE: The string is assumed to have passed is_valid_utf!
1305 utf_ptr...points to first character
1306 end_pos...points after last character
1308 *******************************************************************************/
/* Checks whether the bytes in [utf_ptr, end_pos) may be used as a name.
 *
 * IN:
 *    utf_ptr...points to first byte
 *    end_pos...points behind the last byte
 *
 * RETURN:
 *    false for an empty range, for any byte below 0x20 (control
 *    characters) and for the two-byte sequence 0xc0 0x80 (encoded zero);
 *    true otherwise.
 *
 * The second byte of an encoded zero is only looked at while it still
 * lies inside the range, so end_pos itself is never dereferenced, even
 * for input that ends right after a 0xc0 byte.
 */
bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters */

		/* utf_ptr already points at the following byte here */
		if (c == 0xc0 && utf_ptr < end_pos
			&& (unsigned char) *utf_ptr == 0x80)
			return false;               /* disallow zero */
	}

	return true;
}
1325 bool is_valid_name_utf(utf *u)
1327 return is_valid_name(u->text, UTF_END(u));
1331 /* utf_show ********************************************************************
1333 Writes the utf symbols in the utfhash to stdout and displays the
1334 number of external hash chains grouped according to the chainlength
1335 (for debugging purposes).
1337 *******************************************************************************/
1339 #if !defined(NDEBUG)
1343 #define CHAIN_LIMIT 20 /* limit for seperated enumeration */
1345 u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1346 u4 max_chainlength = 0; /* maximum length of the chains */
1347 u4 sum_chainlength = 0; /* sum of the chainlengths */
1348 u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1351 printf("UTF-HASH:\n");
1353 /* show element of utf-hashtable */
1355 for (i = 0; i < hashtable_utf->size; i++) {
1356 utf *u = hashtable_utf->ptr[i];
1359 printf("SLOT %d: ", (int) i);
1363 utf_display_printable_ascii(u);
1371 printf("UTF-HASH: %d slots for %d entries\n",
1372 (int) hashtable_utf->size, (int) hashtable_utf->entries );
1374 if (hashtable_utf->entries == 0)
1377 printf("chains:\n chainlength number of chains %% of utfstrings\n");
1379 for (i=0;i<CHAIN_LIMIT;i++)
1382 /* count numbers of hashchains according to their length */
1383 for (i=0; i<hashtable_utf->size; i++) {
1385 utf *u = (utf*) hashtable_utf->ptr[i];
1386 u4 chain_length = 0;
1388 /* determine chainlength */
1394 /* update sum of all chainlengths */
1395 sum_chainlength+=chain_length;
1397 /* determine the maximum length of the chains */
1398 if (chain_length>max_chainlength)
1399 max_chainlength = chain_length;
1401 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1402 if (chain_length>=CHAIN_LIMIT) {
1403 beyond_limit+=chain_length;
1404 chain_length=CHAIN_LIMIT-1;
1407 /* update number of hashchains of current length */
1408 chain_count[chain_length]++;
1411 /* display results */
1412 for (i=1;i<CHAIN_LIMIT-1;i++)
1413 printf(" %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1415 printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1418 printf("max. chainlength:%5d\n",max_chainlength);
1420 /* avg. chainlength = sum of chainlengths / number of chains */
1421 printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1423 #endif /* !defined(NDEBUG) */
1427 * These are local overrides for various environment variables in Emacs.
1428 * Please do not remove this and leave it at the end of the file, where
1429 * Emacs will automagically detect them.
1430 * ---------------------------------------------------------------------
1433 * indent-tabs-mode: t
1437 * vim:noexpandtab:sw=4:ts=4: