* src/vm/jit/codegen-common.cpp, src/vm/jit/x86_64/codegen.c: Generate
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf8.c - utf8 string functions
2
3    Copyright (C) 1996-2005, 2006, 2007, 2008
4    CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
5
6    This file is part of CACAO.
7
8    This program is free software; you can redistribute it and/or
9    modify it under the terms of the GNU General Public License as
10    published by the Free Software Foundation; either version 2, or (at
11    your option) any later version.
12
13    This program is distributed in the hope that it will be useful, but
14    WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16    General Public License for more details.
17
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, write to the Free Software
20    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21    02110-1301, USA.
22
23 */
24
25
26 #include "config.h"
27
28 #include <string.h>
29 #include <assert.h>
30
31 #include "vm/types.h"
32
33 #include "mm/memory.hpp"
34
35 #include "threads/mutex.hpp"
36
37 #include "toolbox/hashtable.h"
38
39 #include "vm/exceptions.hpp"
40 #include "vm/options.h"
41
42 #if defined(ENABLE_STATISTICS)
43 # include "vm/statistics.h"
44 #endif
45
46 #include "vm/utf8.h"
47
48
49 /* global variables ***********************************************************/
50
51 /* hashsize must be power of 2 */
52
53 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
54
55 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
56
57
58 /* utf-symbols for pointer comparison of frequently used strings **************/
59
60 utf *utf_java_lang_Object;
61
62 utf *utf_java_lang_Class;
63 utf *utf_java_lang_ClassLoader;
64 utf *utf_java_lang_Cloneable;
65 utf *utf_java_lang_SecurityManager;
66 utf *utf_java_lang_String;
67 utf *utf_java_lang_ThreadGroup;
68 utf *utf_java_lang_ref_SoftReference;
69 utf *utf_java_lang_ref_WeakReference;
70 utf *utf_java_lang_ref_PhantomReference;
71 utf *utf_java_io_Serializable;
72
73 utf *utf_java_lang_Throwable;
74 utf *utf_java_lang_Error;
75
76 utf *utf_java_lang_AbstractMethodError;
77 utf *utf_java_lang_ClassCircularityError;
78 utf *utf_java_lang_ClassFormatError;
79 utf *utf_java_lang_ExceptionInInitializerError;
80 utf *utf_java_lang_IncompatibleClassChangeError;
81 utf *utf_java_lang_InstantiationError;
82 utf *utf_java_lang_InternalError;
83 utf *utf_java_lang_LinkageError;
84 utf *utf_java_lang_NoClassDefFoundError;
85 utf *utf_java_lang_NoSuchFieldError;
86 utf *utf_java_lang_NoSuchMethodError;
87 utf *utf_java_lang_OutOfMemoryError;
88 utf *utf_java_lang_UnsatisfiedLinkError;
89 utf *utf_java_lang_UnsupportedClassVersionError;
90 utf *utf_java_lang_VerifyError;
91 utf *utf_java_lang_VirtualMachineError;
92
93 utf *utf_java_lang_Exception;
94
95 utf *utf_java_lang_ArithmeticException;
96 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
97 utf *utf_java_lang_ArrayStoreException;
98 utf *utf_java_lang_ClassCastException;
99 utf *utf_java_lang_ClassNotFoundException;
100 utf *utf_java_lang_CloneNotSupportedException;
101 utf *utf_java_lang_IllegalAccessException;
102 utf *utf_java_lang_IllegalArgumentException;
103 utf *utf_java_lang_IllegalMonitorStateException;
104 utf *utf_java_lang_InstantiationException;
105 utf *utf_java_lang_InterruptedException;
106 utf *utf_java_lang_NegativeArraySizeException;
107 utf *utf_java_lang_NullPointerException;
108 utf *utf_java_lang_RuntimeException;
109 utf *utf_java_lang_StringIndexOutOfBoundsException;
110
111 utf *utf_java_lang_reflect_InvocationTargetException;
112
113 utf *utf_java_security_PrivilegedActionException;
114
115 #if defined(ENABLE_JAVASE)
116 utf* utf_java_lang_Void;
117 #endif
118
119 utf* utf_java_lang_Boolean;
120 utf* utf_java_lang_Byte;
121 utf* utf_java_lang_Character;
122 utf* utf_java_lang_Short;
123 utf* utf_java_lang_Integer;
124 utf* utf_java_lang_Long;
125 utf* utf_java_lang_Float;
126 utf* utf_java_lang_Double;
127
128 #if defined(ENABLE_JAVASE)
129 utf *utf_java_lang_StackTraceElement;
130 utf *utf_java_lang_reflect_Constructor;
131 utf *utf_java_lang_reflect_Field;
132 utf *utf_java_lang_reflect_Method;
133
134 # if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
135 utf *utf_java_lang_reflect_VMConstructor;
136 utf *utf_java_lang_reflect_VMField;
137 utf *utf_java_lang_reflect_VMMethod;
138 # endif
139
140 utf *utf_java_util_Vector;
141 #endif
142
143 utf *utf_InnerClasses;                  /* InnerClasses                       */
144 utf *utf_ConstantValue;                 /* ConstantValue                      */
145 utf *utf_Code;                          /* Code                               */
146 utf *utf_Exceptions;                    /* Exceptions                         */
147 utf *utf_LineNumberTable;               /* LineNumberTable                    */
148 utf *utf_SourceFile;                    /* SourceFile                         */
149
150 #if defined(ENABLE_JAVASE)
151 utf *utf_EnclosingMethod;
152 utf *utf_Signature;
153 utf *utf_StackMapTable;
154
155 #if defined(ENABLE_ANNOTATIONS)
156 utf *utf_RuntimeVisibleAnnotations;            /* RuntimeVisibleAnnotations            */
157 utf *utf_RuntimeInvisibleAnnotations;          /* RuntimeInvisibleAnnotations          */
158 utf *utf_RuntimeVisibleParameterAnnotations;   /* RuntimeVisibleParameterAnnotations   */
159 utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
160 utf *utf_AnnotationDefault;                    /* AnnotationDefault                    */
161 #endif
162 #endif
163
164 utf *utf_init;                          /* <init>                             */
165 utf *utf_clinit;                        /* <clinit>                           */
166 utf *utf_clone;                         /* clone                              */
167 utf *utf_finalize;                      /* finalize                           */
168 utf *utf_invoke;
169 utf *utf_main;
170 utf *utf_run;                           /* run                                */
171
172 utf *utf_add;
173 utf *utf_dispatch;
174 utf *utf_remove;
175 utf *utf_addThread;
176 utf *utf_removeThread;
177 utf *utf_put;
178 utf *utf_get;
179 utf *utf_uncaughtException;
180 utf *utf_value;
181
182 utf *utf_fillInStackTrace;
183 utf *utf_findNative;
184 utf *utf_getSystemClassLoader;
185 utf *utf_initCause;
186 utf *utf_loadClass;
187 utf *utf_loadClassInternal;
188 utf *utf_printStackTrace;
189
190 utf *utf_division_by_zero;
191
192 utf *utf_Z;                             /* Z                                  */
193 utf *utf_B;                             /* B                                  */
194 utf *utf_C;                             /* C                                  */
195 utf *utf_S;                             /* S                                  */
196 utf *utf_I;                             /* I                                  */
197 utf *utf_J;                             /* J                                  */
198 utf *utf_F;                             /* F                                  */
199 utf *utf_D;                             /* D                                  */
200
201 utf *utf_void__void;                    /* ()V                                */
202 utf *utf_boolean__void;                 /* (Z)V                               */
203 utf *utf_byte__void;                    /* (B)V                               */
204 utf *utf_char__void;                    /* (C)V                               */
205 utf *utf_short__void;                   /* (S)V                               */
206 utf *utf_int__void;                     /* (I)V                               */
207 utf *utf_long__void;                    /* (J)V                               */
208 utf *utf_float__void;                   /* (F)V                               */
209 utf *utf_double__void;                  /* (D)V                               */
210
211 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
212 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
213 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
214 utf *utf_java_lang_ClassLoader_java_lang_String__J;
215 utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
216 utf *utf_java_lang_Object__java_lang_Object;
217 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
218 utf *utf_java_lang_String__java_lang_Class;
219 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
220 utf *utf_java_lang_Thread_java_lang_Throwable__V;
221 utf *utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V;
222 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
223 utf *utf_java_lang_Throwable__java_lang_Throwable;
224
225 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
226 utf *utf_null;
227 utf *array_packagename;
228
229
230 /* utf_init ********************************************************************
231
232    Initializes the utf8 subsystem.
233
234 *******************************************************************************/
235
236 void utf8_init(void)
237 {
238         TRACESUBSYSTEMINITIALIZATION("utf8_init");
239
240         /* create utf8 hashtable */
241
242         hashtable_utf = NEW(hashtable);
243
244         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
245
246 #if defined(ENABLE_STATISTICS)
247         if (opt_stat)
248                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
249 #endif
250
251         /* create utf-symbols for pointer comparison of frequently used strings */
252
253         utf_java_lang_Object           = utf_new_char("java/lang/Object");
254
255         utf_java_lang_Class            = utf_new_char("java/lang/Class");
256         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
257         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
258         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
259         utf_java_lang_String           = utf_new_char("java/lang/String");
260         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
261
262         utf_java_lang_ref_SoftReference =
263                 utf_new_char("java/lang/ref/SoftReference");
264
265         utf_java_lang_ref_WeakReference =
266                 utf_new_char("java/lang/ref/WeakReference");
267
268         utf_java_lang_ref_PhantomReference =
269                 utf_new_char("java/lang/ref/PhantomReference");
270
271         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
272
273         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
274         utf_java_lang_Error            = utf_new_char("java/lang/Error");
275
276         utf_java_lang_ClassCircularityError =
277                 utf_new_char("java/lang/ClassCircularityError");
278
279         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
280
281         utf_java_lang_ExceptionInInitializerError =
282                 utf_new_char("java/lang/ExceptionInInitializerError");
283
284         utf_java_lang_IncompatibleClassChangeError =
285                 utf_new_char("java/lang/IncompatibleClassChangeError");
286
287         utf_java_lang_InstantiationError =
288                 utf_new_char("java/lang/InstantiationError");
289
290         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
291         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
292
293         utf_java_lang_NoClassDefFoundError =
294                 utf_new_char("java/lang/NoClassDefFoundError");
295
296         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
297
298         utf_java_lang_UnsatisfiedLinkError =
299                 utf_new_char("java/lang/UnsatisfiedLinkError");
300
301         utf_java_lang_UnsupportedClassVersionError =
302                 utf_new_char("java/lang/UnsupportedClassVersionError");
303
304         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
305
306         utf_java_lang_VirtualMachineError =
307                 utf_new_char("java/lang/VirtualMachineError");
308
309 #if defined(ENABLE_JAVASE)
310         utf_java_lang_AbstractMethodError =
311                 utf_new_char("java/lang/AbstractMethodError");
312
313         utf_java_lang_NoSuchFieldError =
314                 utf_new_char("java/lang/NoSuchFieldError");
315
316         utf_java_lang_NoSuchMethodError =
317                 utf_new_char("java/lang/NoSuchMethodError");
318 #endif
319
320         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
321
322         utf_java_lang_ArithmeticException =
323                 utf_new_char("java/lang/ArithmeticException");
324
325         utf_java_lang_ArrayIndexOutOfBoundsException =
326                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
327
328         utf_java_lang_ArrayStoreException =
329                 utf_new_char("java/lang/ArrayStoreException");
330
331         utf_java_lang_ClassCastException =
332                 utf_new_char("java/lang/ClassCastException");
333
334         utf_java_lang_ClassNotFoundException =
335                 utf_new_char("java/lang/ClassNotFoundException");
336
337         utf_java_lang_CloneNotSupportedException =
338                 utf_new_char("java/lang/CloneNotSupportedException");
339
340         utf_java_lang_IllegalAccessException =
341                 utf_new_char("java/lang/IllegalAccessException");
342
343         utf_java_lang_IllegalArgumentException =
344                 utf_new_char("java/lang/IllegalArgumentException");
345
346         utf_java_lang_IllegalMonitorStateException =
347                 utf_new_char("java/lang/IllegalMonitorStateException");
348
349         utf_java_lang_InstantiationException =
350                 utf_new_char("java/lang/InstantiationException");
351
352         utf_java_lang_InterruptedException =
353                 utf_new_char("java/lang/InterruptedException");
354
355         utf_java_lang_NegativeArraySizeException =
356                 utf_new_char("java/lang/NegativeArraySizeException");
357
358         utf_java_lang_NullPointerException =
359                 utf_new_char("java/lang/NullPointerException");
360
361         utf_java_lang_RuntimeException =
362                 utf_new_char("java/lang/RuntimeException");
363
364         utf_java_lang_StringIndexOutOfBoundsException =
365                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
366
367         utf_java_lang_reflect_InvocationTargetException =
368                 utf_new_char("java/lang/reflect/InvocationTargetException");
369
370         utf_java_security_PrivilegedActionException =
371                 utf_new_char("java/security/PrivilegedActionException");
372  
373 #if defined(ENABLE_JAVASE)
374         utf_java_lang_Void             = utf_new_char("java/lang/Void");
375 #endif
376
377         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
378         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
379         utf_java_lang_Character        = utf_new_char("java/lang/Character");
380         utf_java_lang_Short            = utf_new_char("java/lang/Short");
381         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
382         utf_java_lang_Long             = utf_new_char("java/lang/Long");
383         utf_java_lang_Float            = utf_new_char("java/lang/Float");
384         utf_java_lang_Double           = utf_new_char("java/lang/Double");
385
386 #if defined(ENABLE_JAVASE)
387         utf_java_lang_StackTraceElement =
388                 utf_new_char("java/lang/StackTraceElement");
389
390         utf_java_lang_reflect_Constructor =
391                 utf_new_char("java/lang/reflect/Constructor");
392
393         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
394         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
395
396 # if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
397         utf_java_lang_reflect_VMConstructor = utf_new_char("java/lang/reflect/VMConstructor");
398         utf_java_lang_reflect_VMField       = utf_new_char("java/lang/reflect/VMField");
399         utf_java_lang_reflect_VMMethod      = utf_new_char("java/lang/reflect/VMMethod");
400 # endif
401
402         utf_java_util_Vector           = utf_new_char("java/util/Vector");
403 #endif
404
405         utf_InnerClasses               = utf_new_char("InnerClasses");
406         utf_ConstantValue              = utf_new_char("ConstantValue");
407         utf_Code                       = utf_new_char("Code");
408         utf_Exceptions                 = utf_new_char("Exceptions");
409         utf_LineNumberTable            = utf_new_char("LineNumberTable");
410         utf_SourceFile                 = utf_new_char("SourceFile");
411
412 #if defined(ENABLE_JAVASE)
413         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
414         utf_Signature                  = utf_new_char("Signature");
415         utf_StackMapTable              = utf_new_char("StackMapTable");
416
417 # if defined(ENABLE_ANNOTATIONS)
418         utf_RuntimeVisibleAnnotations            = utf_new_char("RuntimeVisibleAnnotations");
419         utf_RuntimeInvisibleAnnotations          = utf_new_char("RuntimeInvisibleAnnotations");
420         utf_RuntimeVisibleParameterAnnotations   = utf_new_char("RuntimeVisibleParameterAnnotations");
421         utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
422         utf_AnnotationDefault                    = utf_new_char("AnnotationDefault");
423 # endif
424 #endif
425
426         utf_init                           = utf_new_char("<init>");
427         utf_clinit                         = utf_new_char("<clinit>");
428         utf_clone                      = utf_new_char("clone");
429         utf_finalize                   = utf_new_char("finalize");
430         utf_invoke                     = utf_new_char("invoke");
431         utf_main                       = utf_new_char("main");
432         utf_run                        = utf_new_char("run");
433
434         utf_add                        = utf_new_char("add");
435         utf_dispatch                   = utf_new_char("dispatch");
436         utf_remove                     = utf_new_char("remove");
437         utf_addThread                  = utf_new_char("addThread");
438         utf_removeThread               = utf_new_char("removeThread");
439         utf_put                        = utf_new_char("put");
440         utf_get                        = utf_new_char("get");
441         utf_uncaughtException          = utf_new_char("uncaughtException");
442         utf_value                      = utf_new_char("value");
443
444         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
445         utf_findNative                 = utf_new_char("findNative");
446         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
447         utf_initCause                  = utf_new_char("initCause");
448         utf_loadClass                  = utf_new_char("loadClass");
449         utf_loadClassInternal          = utf_new_char("loadClassInternal");
450         utf_printStackTrace            = utf_new_char("printStackTrace");
451
452         utf_division_by_zero           = utf_new_char("/ by zero");
453
454         utf_Z                          = utf_new_char("Z");
455         utf_B                          = utf_new_char("B");
456         utf_C                          = utf_new_char("C");
457         utf_S                          = utf_new_char("S");
458         utf_I                          = utf_new_char("I");
459         utf_J                          = utf_new_char("J");
460         utf_F                          = utf_new_char("F");
461         utf_D                          = utf_new_char("D");
462
463         utf_void__void                 = utf_new_char("()V");
464         utf_boolean__void              = utf_new_char("(Z)V");
465         utf_byte__void                 = utf_new_char("(B)V");
466         utf_char__void                 = utf_new_char("(C)V");
467         utf_short__void                = utf_new_char("(S)V");
468         utf_int__void                  = utf_new_char("(I)V");
469         utf_long__void                 = utf_new_char("(J)V");
470         utf_float__void                = utf_new_char("(F)V");
471         utf_double__void               = utf_new_char("(D)V");
472         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
473         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
474
475         utf_void__java_lang_ClassLoader =
476                 utf_new_char("()Ljava/lang/ClassLoader;");
477
478         utf_java_lang_ClassLoader_java_lang_String__J =
479                 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
480
481         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
482
483         utf_java_lang_Object__java_lang_Object =
484                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
485
486         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
487
488         utf_java_lang_String__java_lang_Class =
489                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
490
491         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
492
493         utf_java_lang_Thread_java_lang_Throwable__V =
494                 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
495
496         utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V =
497                 utf_new_char("(Ljava/lang/ThreadGroup;Ljava/lang/String;)V");
498
499         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
500
501         utf_java_lang_Throwable__java_lang_Throwable =
502                 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
503
504         utf_null                       = utf_new_char("null");
505         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
506         array_packagename              = utf_new_char("\t<the array package>");
507 }
508
509
/* utf_hashkey *****************************************************************

   Computes the hashkey of a utf-text from a small, length dependent
   selection of its bytes.  Every selected byte is left-shifted by a
   fixed amount and all shifted bytes are XORed together.

   - length 0 yields 0,
   - lengths 1 to 8 use every byte,
   - lengths 9 to 15 use a fixed subset of the bytes (8 bytes, except
     length 11 which uses 9; lengths 14 and 15 use the same subset),
   - longer texts use 3 bytes from the beginning, 2 bytes from the
     middle and 3 bytes from the end.

   text.....pointer to the utf-text (at least `length' readable bytes)
   length...length of the text in bytes

   The bytes are read through `const char *', so bytes >= 0x80 are
   converted to u4 according to the signedness of plain char on the
   platform, exactly as before.

   NOTE: Earlier versions built the key from expressions like
   `fbs(0) ^ nbs(3) ^ nbs(5)'.  These read and pre-increment `text'
   several times without an intervening sequence point, which is
   undefined behaviour in C.  The explicit indices below produce the
   values of a strict left-to-right evaluation of those expressions.

*******************************************************************************/

/* Kept only for code that may still refer to them.  Never combine two
   of these in one expression: nbs() modifies `text'. */

#define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
#define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */

/* byte at index `pos' of the text, left-shifted by `shift' (no side effects) */

#define UTF_HASHKEY_BYTE(pos, shift)    ((u4) text[(pos)] << (shift))

u4 utf_hashkey(const char *text, u4 length)
{
        u4 mid;                             /* index of the first middle byte */

        switch (length) {
        case 0: /* empty string */
                return 0;

        case 1:
                return UTF_HASHKEY_BYTE(0, 0);

        case 2:
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 3);

        case 3:
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 3)
                         ^ UTF_HASHKEY_BYTE(2, 5);

        case 4:
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 2)
                         ^ UTF_HASHKEY_BYTE(2, 4) ^ UTF_HASHKEY_BYTE(3, 6);

        case 5:
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 2)
                         ^ UTF_HASHKEY_BYTE(2, 3) ^ UTF_HASHKEY_BYTE(3, 4)
                         ^ UTF_HASHKEY_BYTE(4, 6);

        case 6:
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
                         ^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(3, 3)
                         ^ UTF_HASHKEY_BYTE(4, 5) ^ UTF_HASHKEY_BYTE(5, 6);

        case 7:
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
                         ^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(3, 3)
                         ^ UTF_HASHKEY_BYTE(4, 4) ^ UTF_HASHKEY_BYTE(5, 5)
                         ^ UTF_HASHKEY_BYTE(6, 6);

        case 8:
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
                         ^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(3, 3)
                         ^ UTF_HASHKEY_BYTE(4, 4) ^ UTF_HASHKEY_BYTE(5, 5)
                         ^ UTF_HASHKEY_BYTE(6, 6) ^ UTF_HASHKEY_BYTE(7, 7);

        case 9:  /* byte 3 is skipped */
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
                         ^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(4, 4)
                         ^ UTF_HASHKEY_BYTE(5, 5) ^ UTF_HASHKEY_BYTE(6, 6)
                         ^ UTF_HASHKEY_BYTE(7, 7) ^ UTF_HASHKEY_BYTE(8, 8);

        case 10: /* bytes 1 and 5 are skipped */
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(2, 2)
                         ^ UTF_HASHKEY_BYTE(3, 3) ^ UTF_HASHKEY_BYTE(4, 4)
                         ^ UTF_HASHKEY_BYTE(6, 6) ^ UTF_HASHKEY_BYTE(7, 7)
                         ^ UTF_HASHKEY_BYTE(8, 8) ^ UTF_HASHKEY_BYTE(9, 9);

        case 11: /* bytes 1 and 5 are skipped, 9 bytes are used */
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(2, 2)
                         ^ UTF_HASHKEY_BYTE(3, 3) ^ UTF_HASHKEY_BYTE(4, 4)
                         ^ UTF_HASHKEY_BYTE(6, 6) ^ UTF_HASHKEY_BYTE(7, 7)
                         ^ UTF_HASHKEY_BYTE(8, 8) ^ UTF_HASHKEY_BYTE(9, 9)
                         ^ UTF_HASHKEY_BYTE(10, 10);

        case 12:
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(3, 2)
                         ^ UTF_HASHKEY_BYTE(4, 3) ^ UTF_HASHKEY_BYTE(6, 5)
                         ^ UTF_HASHKEY_BYTE(7, 6) ^ UTF_HASHKEY_BYTE(8, 7)
                         ^ UTF_HASHKEY_BYTE(10, 9) ^ UTF_HASHKEY_BYTE(11, 10);

        case 13:
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
                         ^ UTF_HASHKEY_BYTE(3, 3) ^ UTF_HASHKEY_BYTE(4, 4)
                         ^ UTF_HASHKEY_BYTE(7, 7) ^ UTF_HASHKEY_BYTE(8, 8)
                         ^ UTF_HASHKEY_BYTE(11, 9) ^ UTF_HASHKEY_BYTE(12, 10);

        case 14:
        case 15: /* both lengths use the same bytes and shifts */
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(3, 3)
                         ^ UTF_HASHKEY_BYTE(4, 4) ^ UTF_HASHKEY_BYTE(7, 7)
                         ^ UTF_HASHKEY_BYTE(8, 8) ^ UTF_HASHKEY_BYTE(11, 9)
                         ^ UTF_HASHKEY_BYTE(12, 10) ^ UTF_HASHKEY_BYTE(13, 11);

        default:
                mid = length / 2;

                return
                        /* 3 characters from beginning */
                        UTF_HASHKEY_BYTE(0, 0)
                        ^ UTF_HASHKEY_BYTE(3, 3)
                        ^ UTF_HASHKEY_BYTE(4, 4)

                        /* 2 characters from middle */
                        ^ UTF_HASHKEY_BYTE(mid, 5)
                        ^ UTF_HASHKEY_BYTE(mid + 3, 6)

                        /* 3 characters from end */
                        ^ UTF_HASHKEY_BYTE(length - 4, 7)
                        ^ UTF_HASHKEY_BYTE(length - 2, 10)
                        ^ UTF_HASHKEY_BYTE(length - 1, 11);
        }
}

#undef UTF_HASHKEY_BYTE
632
/* utf_full_hashkey ************************************************************

   Computes a hash value which depends on every byte of the text.

   The algorithm is the "One-at-a-time" algorithm as published
   by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.

   text.....pointer to the text, its bytes are read as unsigned values
   length...number of bytes to hash

*******************************************************************************/

u4 utf_full_hashkey(const char *text, u4 length)
{
        const unsigned char *bytes = (const unsigned char *) text;
        u4                   key   = 0;
        u4                   pos;

        /* mix in one byte per round */

        for (pos = 0; pos < length; pos++) {
                key += bytes[pos];
                key += (key << 10);
                key ^= (key >> 6);
        }

        /* final avalanche */

        key += (key << 3);
        key ^= (key >> 11);
        key += (key << 15);

        return key;
}
661
662 /* unicode_hashkey *************************************************************
663
664    Compute the hashkey of a unicode string.
665
666 *******************************************************************************/
667
668 u4 unicode_hashkey(u2 *text, u2 len)
669 {
670         return utf_hashkey((char *) text, len);
671 }
672
673
674 /* utf_new *********************************************************************
675
676    Creates a new utf-symbol, the text of the symbol is passed as a
677    u1-array. The function searches the utf-hashtable for a utf-symbol
678    with this text. On success the element returned, otherwise a new
679    hashtable element is created.
680
681    If the number of entries in the hashtable exceeds twice the size of
682    the hashtable slots a reorganization of the hashtable is done and
683    the utf symbols are copied to a new hashtable with doubled size.
684
685 *******************************************************************************/
686
687 utf *utf_new(const char *text, u2 length)
688 {
689         u4 key;                             /* hashkey computed from utf-text     */
690         u4 slot;                            /* slot in hashtable                  */
691         utf *u;                             /* hashtable element                  */
692         u2 i;
693
694         Mutex_lock(hashtable_utf->mutex);
695
696 #if defined(ENABLE_STATISTICS)
697         if (opt_stat)
698                 count_utf_new++;
699 #endif
700
701         key  = utf_hashkey(text, length);
702         slot = key & (hashtable_utf->size - 1);
703         u    = hashtable_utf->ptr[slot];
704
705         /* search external hash chain for utf-symbol */
706
707         while (u) {
708                 if (u->blength == length) {
709                         /* compare text of hashtable elements */
710
711                         for (i = 0; i < length; i++)
712                                 if (text[i] != u->text[i])
713                                         goto nomatch;
714                         
715 #if defined(ENABLE_STATISTICS)
716                         if (opt_stat)
717                                 count_utf_new_found++;
718 #endif
719
720                         /* symbol found in hashtable */
721
722                         Mutex_unlock(hashtable_utf->mutex);
723
724                         return u;
725                 }
726
727         nomatch:
728                 u = u->hashlink; /* next element in external chain */
729         }
730
731         /* location in hashtable found, create new utf element */
732
733         u = NEW(utf);
734
735         u->blength  = length;               /* length in bytes of utfstring       */
736         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
737         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
738
739         memcpy(u->text, text, length);      /* copy utf-text                      */
740         u->text[length] = '\0';
741
742 #if defined(ENABLE_STATISTICS)
743         if (opt_stat)
744                 count_utf_len += sizeof(utf) + length + 1;
745 #endif
746
747         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
748         hashtable_utf->entries++;           /* update number of entries           */
749
750         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
751
752         /* reorganization of hashtable, average length of the external
753            chains is approx. 2 */
754
755                 hashtable *newhash;                              /* the new hashtable */
756                 u4         i;
757                 utf       *u;
758                 utf       *nextu;
759                 u4         slot;
760
761                 /* create new hashtable, double the size */
762
763                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
764
765 #if defined(ENABLE_STATISTICS)
766                 if (opt_stat)
767                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
768 #endif
769
770                 /* transfer elements to new hashtable */
771
772                 for (i = 0; i < hashtable_utf->size; i++) {
773                         u = hashtable_utf->ptr[i];
774
775                         while (u) {
776                                 nextu = u->hashlink;
777                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
778                                                 
779                                 u->hashlink = (utf *) newhash->ptr[slot];
780                                 newhash->ptr[slot] = u;
781
782                                 /* follow link in external hash chain */
783
784                                 u = nextu;
785                         }
786                 }
787         
788                 /* dispose old table */
789
790                 hashtable_free(hashtable_utf);
791
792                 hashtable_utf = newhash;
793         }
794
795         Mutex_unlock(hashtable_utf->mutex);
796
797         return u;
798 }
799
800
801 /* utf_new_u2 ******************************************************************
802
803    Make utf symbol from u2 array, if isclassname is true '.' is
804    replaced by '/'.
805
806 *******************************************************************************/
807
808 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
809 {
810         char *buffer;                   /* memory buffer for  unicode characters  */
811         char *pos;                      /* pointer to current position in buffer  */
812         u4 left;                        /* unicode characters left                */
813         u4 buflength;                   /* utf length in bytes of the u2 array    */
814         utf *result;                    /* resulting utf-string                   */
815         int i;
816
817         /* determine utf length in bytes and allocate memory */
818
819         buflength = u2_utflength(unicode_pos, unicode_length); 
820         buffer    = MNEW(char, buflength);
821  
822         left = buflength;
823         pos  = buffer;
824
825         for (i = 0; i++ < unicode_length; unicode_pos++) {
826                 /* next unicode character */
827                 u2 c = *unicode_pos;
828                 
829                 if ((c != 0) && (c < 0x80)) {
830                         /* 1 character */       
831                         left--;
832                 if ((int) left < 0) break;
833                         /* convert classname */
834                         if (isclassname && c == '.')
835                                 *pos++ = '/';
836                         else
837                                 *pos++ = (char) c;
838
839                 } else if (c < 0x800) {             
840                         /* 2 characters */                              
841                 unsigned char high = c >> 6;
842                 unsigned char low  = c & 0x3F;
843                         left = left - 2;
844                 if ((int) left < 0) break;
845                 *pos++ = high | 0xC0; 
846                 *pos++ = low  | 0x80;     
847
848                 } else {         
849                 /* 3 characters */                              
850                 char low  = c & 0x3f;
851                 char mid  = (c >> 6) & 0x3F;
852                 char high = c >> 12;
853                         left = left - 3;
854                 if ((int) left < 0) break;
855                 *pos++ = high | 0xE0; 
856                 *pos++ = mid  | 0x80;  
857                 *pos++ = low  | 0x80;   
858                 }
859         }
860         
861         /* insert utf-string into symbol-table */
862         result = utf_new(buffer,buflength);
863
864         MFREE(buffer, char, buflength);
865
866         return result;
867 }
868
869
870 /* utf_new_char ****************************************************************
871
872    Creates a new utf symbol, the text for this symbol is passed as a
873    c-string ( = char* ).
874
875 *******************************************************************************/
876
877 utf *utf_new_char(const char *text)
878 {
879         return utf_new(text, strlen(text));
880 }
881
882
883 /* utf_new_char_classname ******************************************************
884
885    Creates a new utf symbol, the text for this symbol is passed as a
886    c-string ( = char* ) "." characters are going to be replaced by
887    "/". Since the above function is used often, this is a separte
888    function, instead of an if.
889
890 *******************************************************************************/
891
892 utf *utf_new_char_classname(const char *text)
893 {
894         if (strchr(text, '.')) {
895                 char *txt = strdup(text);
896                 char *end = txt + strlen(txt);
897                 char *c;
898                 utf *tmpRes;
899
900                 for (c = txt; c < end; c++)
901                         if (*c == '.') *c = '/';
902
903                 tmpRes = utf_new(txt, strlen(txt));
904                 FREE(txt, 0);
905
906                 return tmpRes;
907
908         } else
909                 return utf_new(text, strlen(text));
910 }
911
912
913 /* utf_nextu2 ******************************************************************
914
915    Read the next unicode character from the utf string and increment
916    the utf-string pointer accordingly.
917
918    CAUTION: This function is unsafe for input that was not checked 
919             by is_valid_utf!
920
921 *******************************************************************************/
922
923 u2 utf_nextu2(char **utf_ptr)
924 {
925     /* uncompressed unicode character */
926     u2 unicode_char = 0;
927     /* current position in utf text */  
928     unsigned char *utf = (unsigned char *) (*utf_ptr);
929     /* bytes representing the unicode character */
930     unsigned char ch1, ch2, ch3;
931     /* number of bytes used to represent the unicode character */
932     int len = 0;
933         
934     switch ((ch1 = utf[0]) >> 4) {
935         default: /* 1 byte */
936                 (*utf_ptr)++;
937                 return (u2) ch1;
938         case 0xC: 
939         case 0xD: /* 2 bytes */
940                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
941                         unsigned char high = ch1 & 0x1F;
942                         unsigned char low  = ch2 & 0x3F;
943                         unicode_char = (high << 6) + low;
944                         len = 2;
945                 }
946                 break;
947
948         case 0xE: /* 2 or 3 bytes */
949                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
950                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
951                                 unsigned char low  = ch3 & 0x3f;
952                                 unsigned char mid  = ch2 & 0x3f;
953                                 unsigned char high = ch1 & 0x0f;
954                                 unicode_char = (((high << 6) + mid) << 6) + low;
955                                 len = 3;
956                         } else
957                                 len = 2;                                           
958                 }
959                 break;
960     }
961
962     /* update position in utf-text */
963     *utf_ptr = (char *) (utf + len);
964
965     return unicode_char;
966 }
967
968
969 /* utf_bytes *******************************************************************
970
971    Determine number of bytes (aka. octets) in the utf string.
972
973    IN:
974       u............utf string
975
976    OUT:
977       The number of octets of this utf string.
978           There is _no_ terminating zero included in this count.
979
980 *******************************************************************************/
981
982 u4 utf_bytes(utf *u)
983 {
984         return u->blength;
985 }
986
987
988 /* utf_get_number_of_u2s_for_buffer ********************************************
989
990    Determine number of UTF-16 u2s in the given UTF-8 buffer
991
992    CAUTION: This function is unsafe for input that was not checked 
993             by is_valid_utf!
994
995    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
996    to an array of u2s (UTF-16) and want to know how many of them you will get.
997    All other uses of this function are probably wrong.
998
999    IN:
1000       buffer........points to first char in buffer
1001           blength.......number of _bytes_ in the buffer
1002
1003    OUT:
1004       the number of u2s needed to hold this string in UTF-16 encoding.
1005           There is _no_ terminating zero included in this count.
1006
1007    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
1008    exception.
1009
1010 *******************************************************************************/
1011
1012 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
1013 {
1014         const char *endpos;                 /* points behind utf string           */
1015         const char *utf_ptr;                /* current position in utf text       */
1016         u4 len = 0;                         /* number of unicode characters       */
1017
1018         utf_ptr = buffer;
1019         endpos = utf_ptr + blength;
1020
1021         while (utf_ptr < endpos) {
1022                 len++;
1023                 /* next unicode character */
1024                 utf_nextu2((char **)&utf_ptr);
1025         }
1026
1027         assert(utf_ptr == endpos);
1028
1029         return len;
1030 }
1031
1032
1033 /* utf_get_number_of_u2s *******************************************************
1034
1035    Determine number of UTF-16 u2s in the utf string.
1036
1037    CAUTION: This function is unsafe for input that was not checked 
1038             by is_valid_utf!
1039
1040    CAUTION: Use this function *only* when you want to convert a utf string
1041    to an array of u2s and want to know how many of them you will get.
1042    All other uses of this function are probably wrong.
1043
1044    IN:
1045       u............utf string
1046
1047    OUT:
1048       the number of u2s needed to hold this string in UTF-16 encoding.
1049           There is _no_ terminating zero included in this count.
1050           XXX 0 if a NullPointerException has been thrown (see below)
1051
1052 *******************************************************************************/
1053
1054 u4 utf_get_number_of_u2s(utf *u)
1055 {
1056         char *endpos;                       /* points behind utf string           */
1057         char *utf_ptr;                      /* current position in utf text       */
1058         u4 len = 0;                         /* number of unicode characters       */
1059
1060         /* XXX this is probably not checked by most callers! Review this after */
1061         /* the invalid uses of this function have been eliminated */
1062         if (u == NULL) {
1063                 exceptions_throw_nullpointerexception();
1064                 return 0;
1065         }
1066
1067         endpos = UTF_END(u);
1068         utf_ptr = u->text;
1069
1070         while (utf_ptr < endpos) {
1071                 len++;
1072                 /* next unicode character */
1073                 utf_nextu2(&utf_ptr);
1074         }
1075
1076         if (utf_ptr != endpos) {
1077                 /* string ended abruptly */
1078                 exceptions_throw_internalerror("Illegal utf8 string");
1079                 return 0;
1080         }
1081
1082         return len;
1083 }
1084
1085
1086 /* utf8_safe_number_of_u2s *****************************************************
1087
1088    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1089    (For invalid UTF-8 the U+fffd replacement character will be counted.)
1090
1091    This function is safe even for invalid UTF-8 strings.
1092
1093    IN:
1094       text..........zero-terminated(!) UTF-8 string (may be invalid)
1095                         must NOT be NULL
1096           nbytes........strlen(text). (This is needed to completely emulate
1097                         the RI).
1098
1099    OUT:
1100       the number of u2s needed to hold this string in UTF-16 encoding.
1101           There is _no_ terminating zero included in this count.
1102
1103 *******************************************************************************/
1104
1105 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1106         register const unsigned char *t;
1107         register s4 byte;
1108         register s4 len;
1109         register const unsigned char *tlimit;
1110         s4 byte1;
1111         s4 byte2;
1112         s4 byte3;
1113         s4 value;
1114         s4 skip;
1115
1116         assert(text);
1117         assert(nbytes >= 0);
1118
1119         len = 0;
1120         t = (const unsigned char *) text;
1121         tlimit = t + nbytes;
1122
1123         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1124
1125         while (1) {
1126                 byte = *t++;
1127
1128                 if (byte & 0x80) {
1129                         /* highest bit set, non-ASCII character */
1130
1131                         if ((byte & 0xe0) == 0xc0) {
1132                                 /* 2-byte: should be 110..... 10...... ? */
1133
1134                                 if ((*t++ & 0xc0) == 0x80)
1135                                         ; /* valid 2-byte */
1136                                 else
1137                                         t--; /* invalid */
1138                         }
1139                         else if ((byte & 0xf0) == 0xe0) {
1140                                 /* 3-byte: should be 1110.... 10...... 10...... */
1141                                 /*                            ^t                */
1142
1143                                 if (t + 2 > tlimit)
1144                                         return len + 1; /* invalid, stop here */
1145
1146                                 if ((*t++ & 0xc0) == 0x80) {
1147                                         if ((*t++ & 0xc0) == 0x80)
1148                                                 ; /* valid 3-byte */
1149                                         else
1150                                                 t--; /* invalid */
1151                                 }
1152                                 else
1153                                         t--; /* invalid */
1154                         }
1155                         else if ((byte & 0xf8) == 0xf0) {
1156                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1157                                 /*                            ^t                         */
1158
1159                                 if (t + 3 > tlimit)
1160                                         return len + 1; /* invalid, stop here */
1161
1162                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1163                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1164                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1165                                                         /* valid 4-byte UTF-8? */
1166                                                         value = ((byte  & 0x07) << 18)
1167                                                                   | ((byte1 & 0x3f) << 12)
1168                                                                   | ((byte2 & 0x3f) <<  6)
1169                                                                   | ((byte3 & 0x3f)      );
1170
1171                                                         if (value > 0x10FFFF)
1172                                                                 ; /* invalid */
1173                                                         else if (value > 0xFFFF)
1174                                                                 len += 1; /* we need surrogates */
1175                                                         else
1176                                                                 ; /* 16bit suffice */
1177                                                 }
1178                                                 else
1179                                                         t--; /* invalid */
1180                                         }
1181                                         else
1182                                                 t--; /* invalid */
1183                                 }
1184                                 else
1185                                         t--; /* invalid */
1186                         }
1187                         else if ((byte & 0xfc) == 0xf8) {
1188                                 /* invalid 5-byte */
1189                                 if (t + 4 > tlimit)
1190                                         return len + 1; /* invalid, stop here */
1191
1192                                 skip = 4;
1193                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1194                                         t++;
1195                         }
1196                         else if ((byte & 0xfe) == 0xfc) {
1197                                 /* invalid 6-byte */
1198                                 if (t + 5 > tlimit)
1199                                         return len + 1; /* invalid, stop here */
1200
1201                                 skip = 5;
1202                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1203                                         t++;
1204                         }
1205                         else
1206                                 ; /* invalid */
1207                 }
1208                 else {
1209                         /* NUL */
1210
1211                         if (byte == 0)
1212                                 break;
1213
1214                         /* ASCII character, common case */
1215                 }
1216
1217                 len++;
1218         }
1219
1220         return len;
1221 }
1222
1223
1224 /* utf8_safe_convert_to_u2s ****************************************************
1225
1226    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1227    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1228    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1229
1230    This function is safe even for invalid UTF-8 strings.
1231
1232    IN:
1233       text..........zero-terminated(!) UTF-8 string (may be invalid)
1234                         must NOT be NULL
1235           nbytes........strlen(text). (This is needed to completely emulate
1236                                         the RI).
1237           buffer........a preallocated array of u2s to receive the decoded
1238                         string. Use utf8_safe_number_of_u2s to get the
1239                                         required number of u2s for allocating this.
1240
1241 *******************************************************************************/
1242
1243 #define UNICODE_REPLACEMENT  0xfffd
1244
1245 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1246         register const unsigned char *t;
1247         register s4 byte;
1248         register const unsigned char *tlimit;
1249         s4 byte1;
1250         s4 byte2;
1251         s4 byte3;
1252         s4 value;
1253         s4 skip;
1254
1255         assert(text);
1256         assert(nbytes >= 0);
1257
1258         t = (const unsigned char *) text;
1259         tlimit = t + nbytes;
1260
1261         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1262
1263         while (1) {
1264                 byte = *t++;
1265
1266                 if (byte & 0x80) {
1267                         /* highest bit set, non-ASCII character */
1268
1269                         if ((byte & 0xe0) == 0xc0) {
1270                                 /* 2-byte: should be 110..... 10...... */
1271
1272                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1273                                         /* valid 2-byte UTF-8 */
1274                                         *buffer++ = ((byte  & 0x1f) << 6)
1275                                                           | ((byte1 & 0x3f)     );
1276                                 }
1277                                 else {
1278                                         *buffer++ = UNICODE_REPLACEMENT;
1279                                         t--;
1280                                 }
1281                         }
1282                         else if ((byte & 0xf0) == 0xe0) {
1283                                 /* 3-byte: should be 1110.... 10...... 10...... */
1284
1285                                 if (t + 2 > tlimit) {
1286                                         *buffer++ = UNICODE_REPLACEMENT;
1287                                         return;
1288                                 }
1289
1290                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1291                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1292                                                 /* valid 3-byte UTF-8 */
1293                                                 *buffer++ = ((byte  & 0x0f) << 12)
1294                                                                   | ((byte1 & 0x3f) <<  6)
1295                                                                   | ((byte2 & 0x3f)      );
1296                                         }
1297                                         else {
1298                                                 *buffer++ = UNICODE_REPLACEMENT;
1299                                                 t--;
1300                                         }
1301                                 }
1302                                 else {
1303                                         *buffer++ = UNICODE_REPLACEMENT;
1304                                         t--;
1305                                 }
1306                         }
1307                         else if ((byte & 0xf8) == 0xf0) {
1308                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1309
1310                                 if (t + 3 > tlimit) {
1311                                         *buffer++ = UNICODE_REPLACEMENT;
1312                                         return;
1313                                 }
1314
1315                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1316                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1317                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1318                                                         /* valid 4-byte UTF-8? */
1319                                                         value = ((byte  & 0x07) << 18)
1320                                                                   | ((byte1 & 0x3f) << 12)
1321                                                                   | ((byte2 & 0x3f) <<  6)
1322                                                                   | ((byte3 & 0x3f)      );
1323
1324                                                         if (value > 0x10FFFF) {
1325                                                                 *buffer++ = UNICODE_REPLACEMENT;
1326                                                         }
1327                                                         else if (value > 0xFFFF) {
1328                                                                 /* we need surrogates */
1329                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1330                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1331                                                         }
1332                                                         else
1333                                                                 *buffer++ = value; /* 16bit suffice */
1334                                                 }
1335                                                 else {
1336                                                         *buffer++ = UNICODE_REPLACEMENT;
1337                                                         t--;
1338                                                 }
1339                                         }
1340                                         else {
1341                                                 *buffer++ = UNICODE_REPLACEMENT;
1342                                                 t--;
1343                                         }
1344                                 }
1345                                 else {
1346                                         *buffer++ = UNICODE_REPLACEMENT;
1347                                         t--;
1348                                 }
1349                         }
1350                         else if ((byte & 0xfc) == 0xf8) {
1351                                 if (t + 4 > tlimit) {
1352                                         *buffer++ = UNICODE_REPLACEMENT;
1353                                         return;
1354                                 }
1355
1356                                 skip = 4;
1357                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1358                                         t++;
1359                                 *buffer++ = UNICODE_REPLACEMENT;
1360                         }
1361                         else if ((byte & 0xfe) == 0xfc) {
1362                                 if (t + 5 > tlimit) {
1363                                         *buffer++ = UNICODE_REPLACEMENT;
1364                                         return;
1365                                 }
1366
1367                                 skip = 5;
1368                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1369                                         t++;
1370                                 *buffer++ = UNICODE_REPLACEMENT;
1371                         }
1372                         else
1373                                 *buffer++ = UNICODE_REPLACEMENT;
1374                 }
1375                 else {
1376                         /* NUL */
1377
1378                         if (byte == 0)
1379                                 break;
1380
1381                         /* ASCII character, common case */
1382
1383                         *buffer++ = byte;
1384                 }
1385         }
1386 }
1387
1388
1389 /* u2_utflength ****************************************************************
1390
1391    Returns the utf length in bytes of a u2 array.
1392
1393 *******************************************************************************/
1394
1395 u4 u2_utflength(u2 *text, u4 u2_length)
1396 {
1397         u4 result_len = 0;                  /* utf length in bytes                */
1398         u2 ch;                              /* current unicode character          */
1399         u4 len;
1400         
1401         for (len = 0; len < u2_length; len++) {
1402                 /* next unicode character */
1403                 ch = *text++;
1404           
1405                 /* determine bytes required to store unicode character as utf */
1406                 if (ch && (ch < 0x80)) 
1407                         result_len++;
1408                 else if (ch < 0x800)
1409                         result_len += 2;        
1410                 else 
1411                         result_len += 3;        
1412         }
1413
1414     return result_len;
1415 }
1416
1417
1418 /* utf_copy ********************************************************************
1419
1420    Copy the given utf string byte-for-byte to a buffer.
1421
1422    IN:
1423       buffer.......the buffer
1424           u............the utf string
1425
1426 *******************************************************************************/
1427
1428 void utf_copy(char *buffer, utf *u)
1429 {
1430         /* our utf strings are zero-terminated (done by utf_new) */
1431         MCOPY(buffer, u->text, char, u->blength + 1);
1432 }
1433
1434
1435 /* utf_cat *********************************************************************
1436
1437    Append the given utf string byte-for-byte to a buffer.
1438
1439    IN:
1440       buffer.......the buffer
1441           u............the utf string
1442
1443 *******************************************************************************/
1444
1445 void utf_cat(char *buffer, utf *u)
1446 {
1447         /* our utf strings are zero-terminated (done by utf_new) */
1448         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1449 }
1450
1451
1452 /* utf_copy_classname **********************************************************
1453
1454    Copy the given utf classname byte-for-byte to a buffer.
1455    '/' is replaced by '.'
1456
1457    IN:
1458       buffer.......the buffer
1459           u............the utf string
1460
1461 *******************************************************************************/
1462
1463 void utf_copy_classname(char *buffer, utf *u)
1464 {
1465         char *bufptr;
1466         char *srcptr;
1467         char *endptr;
1468         char ch;
1469
1470         bufptr = buffer;
1471         srcptr = u->text;
1472         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1473
1474         while (srcptr != endptr) {
1475                 ch = *srcptr++;
1476                 if (ch == '/')
1477                         ch = '.';
1478                 *bufptr++ = ch;
1479         }
1480 }
1481
1482
1483 /* utf_cat *********************************************************************
1484
1485    Append the given utf classname byte-for-byte to a buffer.
1486    '/' is replaced by '.'
1487
1488    IN:
1489       buffer.......the buffer
1490           u............the utf string
1491
1492 *******************************************************************************/
1493
1494 void utf_cat_classname(char *buffer, utf *u)
1495 {
1496         utf_copy_classname(buffer + strlen(buffer), u);
1497 }
1498
1499 /* utf_display_printable_ascii *************************************************
1500
1501    Write utf symbol to stdout (for debugging purposes).
1502    Non-printable and non-ASCII characters are printed as '?'.
1503
1504 *******************************************************************************/
1505
1506 void utf_display_printable_ascii(utf *u)
1507 {
1508         char *endpos;                       /* points behind utf string           */
1509         char *utf_ptr;                      /* current position in utf text       */
1510
1511         if (u == NULL) {
1512                 printf("NULL");
1513                 fflush(stdout);
1514                 return;
1515         }
1516
1517         endpos = UTF_END(u);
1518         utf_ptr = u->text;
1519
1520         while (utf_ptr < endpos) {
1521                 /* read next unicode character */
1522
1523                 u2 c = utf_nextu2(&utf_ptr);
1524
1525                 if ((c >= 32) && (c <= 127))
1526                         printf("%c", c);
1527                 else
1528                         printf("?");
1529         }
1530
1531         fflush(stdout);
1532 }
1533
1534
1535 /* utf_display_printable_ascii_classname ***************************************
1536
1537    Write utf symbol to stdout with `/' converted to `.' (for debugging
1538    purposes).
1539    Non-printable and non-ASCII characters are printed as '?'.
1540
1541 *******************************************************************************/
1542
1543 void utf_display_printable_ascii_classname(utf *u)
1544 {
1545         char *endpos;                       /* points behind utf string           */
1546         char *utf_ptr;                      /* current position in utf text       */
1547
1548         if (u == NULL) {
1549                 printf("NULL");
1550                 fflush(stdout);
1551                 return;
1552         }
1553
1554         endpos = UTF_END(u);
1555         utf_ptr = u->text;
1556
1557         while (utf_ptr < endpos) {
1558                 /* read next unicode character */
1559
1560                 u2 c = utf_nextu2(&utf_ptr);
1561
1562                 if (c == '/')
1563                         c = '.';
1564
1565                 if ((c >= 32) && (c <= 127))
1566                         printf("%c", c);
1567                 else
1568                         printf("?");
1569         }
1570
1571         fflush(stdout);
1572 }
1573
1574
1575 /* utf_sprint_convert_to_latin1 ************************************************
1576         
1577    Write utf symbol into c-string (for debugging purposes).
1578    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1579    invalid results.
1580
1581 *******************************************************************************/
1582
1583 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1584 {
1585         char *endpos;                       /* points behind utf string           */
1586         char *utf_ptr;                      /* current position in utf text       */
1587         u2 pos = 0;                         /* position in c-string               */
1588
1589         if (!u) {
1590                 strcpy(buffer, "NULL");
1591                 return;
1592         }
1593
1594         endpos = UTF_END(u);
1595         utf_ptr = u->text;
1596
1597         while (utf_ptr < endpos) 
1598                 /* copy next unicode character */       
1599                 buffer[pos++] = utf_nextu2(&utf_ptr);
1600
1601         /* terminate string */
1602         buffer[pos] = '\0';
1603 }
1604
1605
1606 /* utf_sprint_convert_to_latin1_classname **************************************
1607         
1608    Write utf symbol into c-string with `/' converted to `.' (for debugging
1609    purposes).
1610    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1611    invalid results.
1612
1613 *******************************************************************************/
1614
1615 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1616 {
1617         char *endpos;                       /* points behind utf string           */
1618         char *utf_ptr;                      /* current position in utf text       */
1619         u2 pos = 0;                         /* position in c-string               */
1620
1621         if (!u) {
1622                 strcpy(buffer, "NULL");
1623                 return;
1624         }
1625
1626         endpos = UTF_END(u);
1627         utf_ptr = u->text;
1628
1629         while (utf_ptr < endpos) {
1630                 /* copy next unicode character */       
1631                 u2 c = utf_nextu2(&utf_ptr);
1632                 if (c == '/') c = '.';
1633                 buffer[pos++] = c;
1634         }
1635
1636         /* terminate string */
1637         buffer[pos] = '\0';
1638 }
1639
1640
1641 /* utf_strcat_convert_to_latin1 ************************************************
1642         
1643    Like libc strcat, but uses an utf8 string.
1644    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1645    invalid results.
1646
1647 *******************************************************************************/
1648
1649 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1650 {
1651         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1652 }
1653
1654
1655 /* utf_strcat_convert_to_latin1_classname **************************************
1656         
1657    Like libc strcat, but uses an utf8 string.
1658    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1659    invalid results.
1660
1661 *******************************************************************************/
1662
1663 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1664 {
1665         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1666 }
1667
1668
1669 /* utf_fprint_printable_ascii **************************************************
1670         
1671    Write utf symbol into file.
1672    Non-printable and non-ASCII characters are printed as '?'.
1673
1674 *******************************************************************************/
1675
1676 void utf_fprint_printable_ascii(FILE *file, utf *u)
1677 {
1678         char *endpos;                       /* points behind utf string           */
1679         char *utf_ptr;                      /* current position in utf text       */
1680
1681         if (!u)
1682                 return;
1683
1684         endpos = UTF_END(u);
1685         utf_ptr = u->text;
1686
1687         while (utf_ptr < endpos) { 
1688                 /* read next unicode character */                
1689                 u2 c = utf_nextu2(&utf_ptr);                            
1690
1691                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1692                 else fprintf(file, "?");
1693         }
1694 }
1695
1696
1697 /* utf_fprint_printable_ascii_classname ****************************************
1698         
1699    Write utf symbol into file with `/' converted to `.'.
1700    Non-printable and non-ASCII characters are printed as '?'.
1701
1702 *******************************************************************************/
1703
1704 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1705 {
1706         char *endpos;                       /* points behind utf string           */
1707         char *utf_ptr;                      /* current position in utf text       */
1708
1709     if (!u)
1710                 return;
1711
1712         endpos = UTF_END(u);
1713         utf_ptr = u->text;
1714
1715         while (utf_ptr < endpos) { 
1716                 /* read next unicode character */                
1717                 u2 c = utf_nextu2(&utf_ptr);                            
1718                 if (c == '/') c = '.';
1719
1720                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1721                 else fprintf(file, "?");
1722         }
1723 }
1724
1725
/* is_valid_utf ****************************************************************

   Return true if the given byte range is a valid UTF-8 string as
   accepted here:

   - a 0x00 byte is never allowed,
   - only 1-, 2- and 3-byte sequences are accepted (Java limitation),
   - every continuation byte must have the form 10xx xxxx,
   - the code point zero is only accepted in its 2-byte form,
   - overlong encodings, surrogates and 0xfffe/0xffff are NOT rejected.

   An empty range is valid; a range with end_pos < utf_ptr is not.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int           remaining;            /* bytes not consumed yet             */
	int           trailing;             /* continuation bytes of a sequence   */
	int           k;
	unsigned char byte;
	unsigned long value;                /* decoded code point                 */

	if (end_pos < utf_ptr)
		return false;

	remaining = end_pos - utf_ptr;

	while (remaining != 0) {
		remaining--;
		byte = (unsigned char) *utf_ptr++;

		if (byte == 0x00)
			return false;                   /* 0x00 is not allowed            */

		if (byte < 0x80)
			continue;                       /* ASCII                          */

		/* Lead bytes of sequences longer than three bytes are rejected
		   together with all other invalid lead bytes. */

		if ((byte & 0xe0) == 0xc0)
			trailing = 1;                   /* 110x xxxx                      */
		else if ((byte & 0xf0) == 0xe0)
			trailing = 2;                   /* 1110 xxxx                      */
		else
			return false;

		remaining -= trailing;

		if (remaining < 0)
			return false;                   /* missing bytes                  */

		/* payload bits of the lead byte */

		value = byte & (0x3f >> trailing);

		for (k = 0; k < trailing; k++) {
			byte = (unsigned char) *utf_ptr++;

			if ((byte & 0xc0) != 0x80)      /* 10xx xxxx                      */
				return false;

			value = (value << 6) | (byte & 0x3f);
		}

		/* Java special: zero must be encoded with exactly two bytes.
		   Other overlong encodings are tolerated, as Sun Java seems
		   to allow them. */

		if (value == 0 && trailing != 1)
			return false;
	}

	return true;
}
1791
1792
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the encoded zero character 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Only bytes inside [utf_ptr, end_pos) are inspected.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                       /* disallow empty names           */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;                   /* disallow control characters    */

		/* disallow zero (0xc0 0x80); the second byte is only looked at
		   if it still lies inside the given range, so a trailing 0xc0
		   never causes a read behind end_pos */

		if (c == 0xc0 && utf_ptr < end_pos && (unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
1820
1821 bool is_valid_name_utf(utf *u)
1822 {
1823         return is_valid_name(u->text, UTF_END(u));
1824 }
1825
1826
1827 /* utf_show ********************************************************************
1828
1829    Writes the utf symbols in the utfhash to stdout and displays the
1830    number of external hash chains grouped according to the chainlength
1831    (for debugging purposes).
1832
1833 *******************************************************************************/
1834
1835 #if !defined(NDEBUG)
1836 void utf_show(void)
1837 {
1838
1839 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1840
1841         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1842         u4 max_chainlength = 0;      /* maximum length of the chains */
1843         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1844         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1845         u4 i;
1846
1847         printf("UTF-HASH:\n");
1848
1849         /* show element of utf-hashtable */
1850
1851         for (i = 0; i < hashtable_utf->size; i++) {
1852                 utf *u = hashtable_utf->ptr[i];
1853
1854                 if (u) {
1855                         printf("SLOT %d: ", (int) i);
1856
1857                         while (u) {
1858                                 printf("'");
1859                                 utf_display_printable_ascii(u);
1860                                 printf("' ");
1861                                 u = u->hashlink;
1862                         }       
1863                         printf("\n");
1864                 }
1865         }
1866
1867         printf("UTF-HASH: %d slots for %d entries\n", 
1868                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1869
1870         if (hashtable_utf->entries == 0)
1871                 return;
1872
1873         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1874
1875         for (i=0;i<CHAIN_LIMIT;i++)
1876                 chain_count[i]=0;
1877
1878         /* count numbers of hashchains according to their length */
1879         for (i=0; i<hashtable_utf->size; i++) {
1880                   
1881                 utf *u = (utf*) hashtable_utf->ptr[i];
1882                 u4 chain_length = 0;
1883
1884                 /* determine chainlength */
1885                 while (u) {
1886                         u = u->hashlink;
1887                         chain_length++;
1888                 }
1889
1890                 /* update sum of all chainlengths */
1891                 sum_chainlength+=chain_length;
1892
1893                 /* determine the maximum length of the chains */
1894                 if (chain_length>max_chainlength)
1895                         max_chainlength = chain_length;
1896
1897                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1898                 if (chain_length>=CHAIN_LIMIT) {
1899                         beyond_limit+=chain_length;
1900                         chain_length=CHAIN_LIMIT-1;
1901                 }
1902
1903                 /* update number of hashchains of current length */
1904                 chain_count[chain_length]++;
1905         }
1906
1907         /* display results */  
1908         for (i=1;i<CHAIN_LIMIT-1;i++) 
1909                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1910           
1911         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1912
1913
1914         printf("max. chainlength:%5d\n",max_chainlength);
1915
1916         /* avg. chainlength = sum of chainlengths / number of chains */
1917         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1918 }
1919 #endif /* !defined(NDEBUG) */
1920
1921
1922 /*
1923  * These are local overrides for various environment variables in Emacs.
1924  * Please do not remove this and leave it at the end of the file, where
1925  * Emacs will automagically detect them.
1926  * ---------------------------------------------------------------------
1927  * Local variables:
1928  * mode: c
1929  * indent-tabs-mode: t
1930  * c-basic-offset: 4
1931  * tab-width: 4
1932  * End:
1933  * vim:noexpandtab:sw=4:ts=4:
1934  */