* Removed all Id tags.
[cacao.git] / src / vmcore / utf8.c
1 /* src/vmcore/utf8.c - utf8 string functions
2
3    Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6    J. Wenninger, Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23    02110-1301, USA.
24
25 */
26
27
28 #include "config.h"
29
30 #include <string.h>
31 #include <assert.h>
32
33 #include "vm/types.h"
34
35 #include "mm/memory.h"
36
37 #include "threads/lock-common.h"
38
39 #include "toolbox/hashtable.h"
40
41 #include "vm/exceptions.h"
42
43 #include "vmcore/options.h"
44
45 #if defined(ENABLE_STATISTICS)
46 # include "vmcore/statistics.h"
47 #endif
48
49 #include "vmcore/utf8.h"
50
51
52 /* global variables ***********************************************************/
53
54 /* hashsize must be power of 2 */
55
56 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
57
58 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
59
60
61 /* utf-symbols for pointer comparison of frequently used strings **************/
62
63 utf *utf_java_lang_Object;
64
65 utf *utf_java_lang_Class;
66 utf *utf_java_lang_ClassLoader;
67 utf *utf_java_lang_Cloneable;
68 utf *utf_java_lang_SecurityManager;
69 utf *utf_java_lang_String;
70 utf *utf_java_lang_System;
71 utf *utf_java_lang_ThreadGroup;
72 utf *utf_java_lang_ref_SoftReference;
73 utf *utf_java_lang_ref_WeakReference;
74 utf *utf_java_lang_ref_PhantomReference;
75 utf *utf_java_io_Serializable;
76
77 utf *utf_java_lang_Throwable;
78 utf *utf_java_lang_Error;
79
80 utf *utf_java_lang_AbstractMethodError;
81 utf *utf_java_lang_ClassCircularityError;
82 utf *utf_java_lang_ClassFormatError;
83 utf *utf_java_lang_ExceptionInInitializerError;
84 utf *utf_java_lang_IncompatibleClassChangeError;
85 utf *utf_java_lang_InstantiationError;
86 utf *utf_java_lang_InternalError;
87 utf *utf_java_lang_LinkageError;
88 utf *utf_java_lang_NoClassDefFoundError;
89 utf *utf_java_lang_NoSuchFieldError;
90 utf *utf_java_lang_NoSuchMethodError;
91 utf *utf_java_lang_OutOfMemoryError;
92 utf *utf_java_lang_UnsatisfiedLinkError;
93 utf *utf_java_lang_UnsupportedClassVersionError;
94 utf *utf_java_lang_VerifyError;
95 utf *utf_java_lang_VirtualMachineError;
96
97 #if defined(WITH_CLASSPATH_GNU)
98 utf *utf_java_lang_VMThrowable;
99 #endif
100
101 utf *utf_java_lang_Exception;
102
103 utf *utf_java_lang_ArithmeticException;
104 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
105 utf *utf_java_lang_ArrayStoreException;
106 utf *utf_java_lang_ClassCastException;
107 utf *utf_java_lang_ClassNotFoundException;
108 utf *utf_java_lang_CloneNotSupportedException;
109 utf *utf_java_lang_IllegalAccessException;
110 utf *utf_java_lang_IllegalArgumentException;
111 utf *utf_java_lang_IllegalMonitorStateException;
112 utf *utf_java_lang_InstantiationException;
113 utf *utf_java_lang_InterruptedException;
114 utf *utf_java_lang_NegativeArraySizeException;
115 utf *utf_java_lang_NullPointerException;
116 utf *utf_java_lang_StringIndexOutOfBoundsException;
117
118 utf *utf_java_lang_reflect_InvocationTargetException;
119
120 utf *utf_java_security_PrivilegedActionException;
121
122 #if defined(ENABLE_JAVASE)
123 utf* utf_java_lang_Void;
124 #endif
125
126 utf* utf_java_lang_Boolean;
127 utf* utf_java_lang_Byte;
128 utf* utf_java_lang_Character;
129 utf* utf_java_lang_Short;
130 utf* utf_java_lang_Integer;
131 utf* utf_java_lang_Long;
132 utf* utf_java_lang_Float;
133 utf* utf_java_lang_Double;
134
135 #if defined(ENABLE_JAVASE)
136 utf *utf_java_lang_StackTraceElement;
137 utf *utf_java_lang_reflect_Constructor;
138 utf *utf_java_lang_reflect_Field;
139 utf *utf_java_lang_reflect_Method;
140 utf *utf_java_util_Vector;
141 #endif
142
143 utf *utf_InnerClasses;                  /* InnerClasses                       */
144 utf *utf_ConstantValue;                 /* ConstantValue                      */
145 utf *utf_Code;                          /* Code                               */
146 utf *utf_Exceptions;                    /* Exceptions                         */
147 utf *utf_LineNumberTable;               /* LineNumberTable                    */
148 utf *utf_SourceFile;                    /* SourceFile                         */
149
150 #if defined(ENABLE_JAVASE)
151 utf *utf_EnclosingMethod;
152 utf *utf_Signature;
153 utf *utf_StackMapTable;
154
155 #if defined(ENABLE_ANNOTATIONS)
156 utf *utf_sun_reflect_ConstantPool;
157 #if defined(WITH_CLASSPATH_GNU)
158 utf *utf_sun_reflect_annotation_AnnotationParser;
159 #endif
160
161 utf *utf_RuntimeVisibleAnnotations;
162 utf *utf_RuntimeInvisibleAnnotations;
163 utf *utf_RuntimeVisibleParameterAnnotations;
164 utf *utf_RuntimeInvisibleParameterAnnotations;
165 utf *utf_AnnotationDefault;
166 #endif
167 #endif
168
169 utf *utf_init;                          /* <init>                             */
170 utf *utf_clinit;                        /* <clinit>                           */
171 utf *utf_clone;                         /* clone                              */
172 utf *utf_finalize;                      /* finalize                           */
173 utf *utf_run;                           /* run                                */
174
175 utf *utf_add;
176 utf *utf_remove;
177 utf *utf_addThread;
178 utf *utf_removeThread;
179 utf *utf_put;
180 utf *utf_get;
181 utf *utf_uncaughtException;
182 utf *utf_value;
183
184 utf *utf_fillInStackTrace;
185 utf *utf_findNative;
186 utf *utf_getSystemClassLoader;
187 utf *utf_initCause;
188 utf *utf_loadClass;
189 utf *utf_printStackTrace;
190
191 utf *utf_division_by_zero;
192
193 utf *utf_Z;                             /* Z                                  */
194 utf *utf_B;                             /* B                                  */
195 utf *utf_C;                             /* C                                  */
196 utf *utf_S;                             /* S                                  */
197 utf *utf_I;                             /* I                                  */
198 utf *utf_J;                             /* J                                  */
199 utf *utf_F;                             /* F                                  */
200 utf *utf_D;                             /* D                                  */
201
202 utf *utf_void__void;                    /* ()V                                */
203 utf *utf_boolean__void;                 /* (Z)V                               */
204 utf *utf_byte__void;                    /* (B)V                               */
205 utf *utf_char__void;                    /* (C)V                               */
206 utf *utf_short__void;                   /* (S)V                               */
207 utf *utf_int__void;                     /* (I)V                               */
208 utf *utf_long__void;                    /* (J)V                               */
209 utf *utf_float__void;                   /* (F)V                               */
210 utf *utf_double__void;                  /* (D)V                               */
211
212 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
213 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
214 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
215 utf *utf_java_lang_ClassLoader_java_lang_String__J;
216 utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
217 utf *utf_java_lang_Object__java_lang_Object;
218 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
219 utf *utf_java_lang_String__java_lang_Class;
220 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
221 utf *utf_java_lang_Thread_java_lang_Throwable__V;
222 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
223 utf *utf_java_lang_Throwable__java_lang_Throwable;
224
225 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
226 utf *utf_null;
227 utf *array_packagename;
228
229
230 /* utf_init ********************************************************************
231
232    Initializes the utf8 subsystem.
233
234 *******************************************************************************/
235
236 bool utf8_init(void)
237 {
238         /* create utf8 hashtable */
239
240         hashtable_utf = NEW(hashtable);
241
242         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
243
244 #if defined(ENABLE_STATISTICS)
245         if (opt_stat)
246                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
247 #endif
248
249         /* create utf-symbols for pointer comparison of frequently used strings */
250
251         utf_java_lang_Object           = utf_new_char("java/lang/Object");
252
253         utf_java_lang_Class            = utf_new_char("java/lang/Class");
254         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
255         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
256         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
257         utf_java_lang_String           = utf_new_char("java/lang/String");
258         utf_java_lang_System           = utf_new_char("java/lang/System");
259         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
260
261         utf_java_lang_ref_SoftReference =
262                 utf_new_char("java/lang/ref/SoftReference");
263
264         utf_java_lang_ref_WeakReference =
265                 utf_new_char("java/lang/ref/WeakReference");
266
267         utf_java_lang_ref_PhantomReference =
268                 utf_new_char("java/lang/ref/PhantomReference");
269
270         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
271
272         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
273         utf_java_lang_Error            = utf_new_char("java/lang/Error");
274
275         utf_java_lang_ClassCircularityError =
276                 utf_new_char("java/lang/ClassCircularityError");
277
278         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
279
280         utf_java_lang_ExceptionInInitializerError =
281                 utf_new_char("java/lang/ExceptionInInitializerError");
282
283         utf_java_lang_IncompatibleClassChangeError =
284                 utf_new_char("java/lang/IncompatibleClassChangeError");
285
286         utf_java_lang_InstantiationError =
287                 utf_new_char("java/lang/InstantiationError");
288
289         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
290         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
291
292         utf_java_lang_NoClassDefFoundError =
293                 utf_new_char("java/lang/NoClassDefFoundError");
294
295         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
296
297         utf_java_lang_UnsatisfiedLinkError =
298                 utf_new_char("java/lang/UnsatisfiedLinkError");
299
300         utf_java_lang_UnsupportedClassVersionError =
301                 utf_new_char("java/lang/UnsupportedClassVersionError");
302
303         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
304
305         utf_java_lang_VirtualMachineError =
306                 utf_new_char("java/lang/VirtualMachineError");
307
308 #if defined(ENABLE_JAVASE)
309         utf_java_lang_AbstractMethodError =
310                 utf_new_char("java/lang/AbstractMethodError");
311
312         utf_java_lang_NoSuchFieldError =
313                 utf_new_char("java/lang/NoSuchFieldError");
314
315         utf_java_lang_NoSuchMethodError =
316                 utf_new_char("java/lang/NoSuchMethodError");
317 #endif
318
319 #if defined(WITH_CLASSPATH_GNU)
320         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
321 #endif
322
323         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
324
325         utf_java_lang_ArithmeticException =
326                 utf_new_char("java/lang/ArithmeticException");
327
328         utf_java_lang_ArrayIndexOutOfBoundsException =
329                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
330
331         utf_java_lang_ArrayStoreException =
332                 utf_new_char("java/lang/ArrayStoreException");
333
334         utf_java_lang_ClassCastException =
335                 utf_new_char("java/lang/ClassCastException");
336
337         utf_java_lang_ClassNotFoundException =
338                 utf_new_char("java/lang/ClassNotFoundException");
339
340         utf_java_lang_CloneNotSupportedException =
341                 utf_new_char("java/lang/CloneNotSupportedException");
342
343         utf_java_lang_IllegalAccessException =
344                 utf_new_char("java/lang/IllegalAccessException");
345
346         utf_java_lang_IllegalArgumentException =
347                 utf_new_char("java/lang/IllegalArgumentException");
348
349         utf_java_lang_IllegalMonitorStateException =
350                 utf_new_char("java/lang/IllegalMonitorStateException");
351
352         utf_java_lang_InstantiationException =
353                 utf_new_char("java/lang/InstantiationException");
354
355         utf_java_lang_InterruptedException =
356                 utf_new_char("java/lang/InterruptedException");
357
358         utf_java_lang_NegativeArraySizeException =
359                 utf_new_char("java/lang/NegativeArraySizeException");
360
361         utf_java_lang_NullPointerException =
362                 utf_new_char("java/lang/NullPointerException");
363
364         utf_java_lang_StringIndexOutOfBoundsException =
365                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
366
367         utf_java_lang_reflect_InvocationTargetException =
368                 utf_new_char("java/lang/reflect/InvocationTargetException");
369
370         utf_java_security_PrivilegedActionException =
371                 utf_new_char("java/security/PrivilegedActionException");
372  
373 #if defined(ENABLE_JAVASE)
374         utf_java_lang_Void             = utf_new_char("java/lang/Void");
375 #endif
376
377         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
378         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
379         utf_java_lang_Character        = utf_new_char("java/lang/Character");
380         utf_java_lang_Short            = utf_new_char("java/lang/Short");
381         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
382         utf_java_lang_Long             = utf_new_char("java/lang/Long");
383         utf_java_lang_Float            = utf_new_char("java/lang/Float");
384         utf_java_lang_Double           = utf_new_char("java/lang/Double");
385
386 #if defined(ENABLE_JAVASE)
387         utf_java_lang_StackTraceElement =
388                 utf_new_char("java/lang/StackTraceElement");
389
390         utf_java_lang_reflect_Constructor =
391                 utf_new_char("java/lang/reflect/Constructor");
392
393         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
394         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
395         utf_java_util_Vector           = utf_new_char("java/util/Vector");
396 #endif
397
398         utf_InnerClasses               = utf_new_char("InnerClasses");
399         utf_ConstantValue              = utf_new_char("ConstantValue");
400         utf_Code                       = utf_new_char("Code");
401         utf_Exceptions                 = utf_new_char("Exceptions");
402         utf_LineNumberTable            = utf_new_char("LineNumberTable");
403         utf_SourceFile                 = utf_new_char("SourceFile");
404
405 #if defined(ENABLE_JAVASE)
406         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
407         utf_Signature                  = utf_new_char("Signature");
408         utf_StackMapTable              = utf_new_char("StackMapTable");
409
410 #if defined(ENABLE_ANNOTATIONS)
411         utf_sun_reflect_ConstantPool                = utf_new_char("sun/reflect/ConstantPool");
412 #if defined(WITH_CLASSPATH_GNU)
413         utf_sun_reflect_annotation_AnnotationParser = utf_new_char("sun/reflect/annotation/AnnotationParser");
414 #endif
415
416         utf_RuntimeVisibleAnnotations            = utf_new_char("RuntimeVisibleAnnotations");
417         utf_RuntimeInvisibleAnnotations          = utf_new_char("RuntimeInvisibleAnnotations");
418         utf_RuntimeVisibleParameterAnnotations   = utf_new_char("RuntimeVisibleParameterAnnotations");
419         utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
420         utf_AnnotationDefault                    = utf_new_char("AnnotationDefault");
421 #endif
422 #endif
423
424         utf_init                           = utf_new_char("<init>");
425         utf_clinit                         = utf_new_char("<clinit>");
426         utf_clone                      = utf_new_char("clone");
427         utf_finalize                   = utf_new_char("finalize");
428         utf_run                        = utf_new_char("run");
429
430         utf_add                        = utf_new_char("add");
431         utf_remove                     = utf_new_char("remove");
432         utf_addThread                  = utf_new_char("addThread");
433         utf_removeThread               = utf_new_char("removeThread");
434         utf_put                        = utf_new_char("put");
435         utf_get                        = utf_new_char("get");
436         utf_uncaughtException          = utf_new_char("uncaughtException");
437         utf_value                      = utf_new_char("value");
438
439         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
440         utf_findNative                 = utf_new_char("findNative");
441         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
442         utf_initCause                  = utf_new_char("initCause");
443         utf_loadClass                  = utf_new_char("loadClass");
444         utf_printStackTrace            = utf_new_char("printStackTrace");
445
446         utf_division_by_zero           = utf_new_char("/ by zero");
447
448         utf_Z                          = utf_new_char("Z");
449         utf_B                          = utf_new_char("B");
450         utf_C                          = utf_new_char("C");
451         utf_S                          = utf_new_char("S");
452         utf_I                          = utf_new_char("I");
453         utf_J                          = utf_new_char("J");
454         utf_F                          = utf_new_char("F");
455         utf_D                          = utf_new_char("D");
456
457         utf_void__void                 = utf_new_char("()V");
458         utf_boolean__void              = utf_new_char("(Z)V");
459         utf_byte__void                 = utf_new_char("(B)V");
460         utf_char__void                 = utf_new_char("(C)V");
461         utf_short__void                = utf_new_char("(S)V");
462         utf_int__void                  = utf_new_char("(I)V");
463         utf_long__void                 = utf_new_char("(J)V");
464         utf_float__void                = utf_new_char("(F)V");
465         utf_double__void               = utf_new_char("(D)V");
466         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
467         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
468
469         utf_void__java_lang_ClassLoader =
470                 utf_new_char("()Ljava/lang/ClassLoader;");
471
472         utf_java_lang_ClassLoader_java_lang_String__J =
473                 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
474
475         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
476
477         utf_java_lang_Object__java_lang_Object =
478                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
479
480         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
481
482         utf_java_lang_String__java_lang_Class =
483                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
484
485         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
486
487         utf_java_lang_Thread_java_lang_Throwable__V =
488                 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
489
490         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
491
492         utf_java_lang_Throwable__java_lang_Throwable =
493                 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
494
495         utf_null                       = utf_new_char("null");
496         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
497         array_packagename              = utf_new_char("\t<the array package>");
498
499         /* everything's ok */
500
501         return true;
502 }
503
504
/* utf_hashkey *****************************************************************

   Computes the hashkey of a utf-text from a small, fixed selection of
   its bytes.  Every selected byte is shifted left by an amount that
   depends on its position and the shifted values are XORed.

   Texts of up to 8 bytes use every byte.  Texts of 9 to 15 bytes use
   a selection that is specific to the length.  For texts of 16 or
   more bytes 3 bytes are taken from the beginning, 2 bytes from the
   middle and 3 bytes from the end.

   Only bytes with an index below length are read, so the text does
   not have to be zero-terminated.

   IN:
       text.....the utf-text
       length...length of the text in bytes

   RETURN VALUE:
       the hashkey; 0 for the empty text

*******************************************************************************/

/* These two macros are no longer used by utf_hashkey.  They are kept in case
   other code of this file relies on them.  Both access the variable `text'
   and nbs also modifies it: using nbs twice, or nbs together with fbs, in one
   expression is undefined behaviour because the accesses are unsequenced. */

#define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
#define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */

/* byte with index idx of the text, shifted left by sh bits */

#define UTF_HASH_BYTE(idx, sh)    ((u4) text[(idx)] << (sh))

u4 utf_hashkey(const char *text, u4 length)
{
	u4 mid;                             /* index of first byte from the middle*/
	u4 end;                             /* index of first byte from the end   */

	/* Every byte is addressed by an explicit index.  The indices and shift
	   amounts are those a left-to-right evaluation of the former
	   pointer-incrementing expressions produced. */

	switch (length) {
	case 0: /* empty string */
		return 0;

	case 1:
		return UTF_HASH_BYTE(0, 0);

	case 2:
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 3);

	case 3:
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 3)
			^ UTF_HASH_BYTE(2, 5);

	case 4:
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 2)
			^ UTF_HASH_BYTE(2, 4) ^ UTF_HASH_BYTE(3, 6);

	case 5:
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 2)
			^ UTF_HASH_BYTE(2, 3) ^ UTF_HASH_BYTE(3, 4)
			^ UTF_HASH_BYTE(4, 6);

	case 6:
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 1)
			^ UTF_HASH_BYTE(2, 2) ^ UTF_HASH_BYTE(3, 3)
			^ UTF_HASH_BYTE(4, 5) ^ UTF_HASH_BYTE(5, 6);

	case 7:
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 1)
			^ UTF_HASH_BYTE(2, 2) ^ UTF_HASH_BYTE(3, 3)
			^ UTF_HASH_BYTE(4, 4) ^ UTF_HASH_BYTE(5, 5)
			^ UTF_HASH_BYTE(6, 6);

	case 8:
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 1)
			^ UTF_HASH_BYTE(2, 2) ^ UTF_HASH_BYTE(3, 3)
			^ UTF_HASH_BYTE(4, 4) ^ UTF_HASH_BYTE(5, 5)
			^ UTF_HASH_BYTE(6, 6) ^ UTF_HASH_BYTE(7, 7);

	case 9: /* skips byte 3 */
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 1)
			^ UTF_HASH_BYTE(2, 2) ^ UTF_HASH_BYTE(4, 4)
			^ UTF_HASH_BYTE(5, 5) ^ UTF_HASH_BYTE(6, 6)
			^ UTF_HASH_BYTE(7, 7) ^ UTF_HASH_BYTE(8, 8);

	case 10: /* skips bytes 1 and 5 */
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(2, 2)
			^ UTF_HASH_BYTE(3, 3) ^ UTF_HASH_BYTE(4, 4)
			^ UTF_HASH_BYTE(6, 6) ^ UTF_HASH_BYTE(7, 7)
			^ UTF_HASH_BYTE(8, 8) ^ UTF_HASH_BYTE(9, 9);

	case 11: /* skips bytes 1 and 5 */
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(2, 2)
			^ UTF_HASH_BYTE(3, 3) ^ UTF_HASH_BYTE(4, 4)
			^ UTF_HASH_BYTE(6, 6) ^ UTF_HASH_BYTE(7, 7)
			^ UTF_HASH_BYTE(8, 8) ^ UTF_HASH_BYTE(9, 9)
			^ UTF_HASH_BYTE(10, 10);

	case 12:
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(3, 2)
			^ UTF_HASH_BYTE(4, 3) ^ UTF_HASH_BYTE(6, 5)
			^ UTF_HASH_BYTE(7, 6) ^ UTF_HASH_BYTE(8, 7)
			^ UTF_HASH_BYTE(10, 9) ^ UTF_HASH_BYTE(11, 10);

	case 13:
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 1)
			^ UTF_HASH_BYTE(3, 3) ^ UTF_HASH_BYTE(4, 4)
			^ UTF_HASH_BYTE(7, 7) ^ UTF_HASH_BYTE(8, 8)
			^ UTF_HASH_BYTE(11, 9) ^ UTF_HASH_BYTE(12, 10);

	case 14:
	case 15: /* same selection as for 14 bytes, byte 14 is not used */
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(3, 3)
			^ UTF_HASH_BYTE(4, 4) ^ UTF_HASH_BYTE(7, 7)
			^ UTF_HASH_BYTE(8, 8) ^ UTF_HASH_BYTE(11, 9)
			^ UTF_HASH_BYTE(12, 10) ^ UTF_HASH_BYTE(13, 11);

	default:
		mid = length / 2;
		end = length - 4;

		/* 3 bytes from the beginning, 2 from the middle, 3 from the end */

		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(3, 3)
			^ UTF_HASH_BYTE(4, 4)
			^ UTF_HASH_BYTE(mid, 5) ^ UTF_HASH_BYTE(mid + 3, 6)
			^ UTF_HASH_BYTE(end, 7) ^ UTF_HASH_BYTE(end + 2, 10)
			^ UTF_HASH_BYTE(end + 3, 11);
	}
}

#undef UTF_HASH_BYTE
627
/* utf_full_hashkey ************************************************************

   Computes a hash value in which every byte of the text takes part.
   The bytes are read as unsigned values.

   The algorithm is the "One-at-a-time" algorithm as published
   by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.

   IN:
       text.....the text to hash
       length...number of bytes of the text to hash

   RETURN VALUE:
       the hash value; 0 for the empty text

*******************************************************************************/

u4 utf_full_hashkey(const char *text, u4 length)
{
	const unsigned char *bytes = (const unsigned char *) text;
	u4                   sum   = 0;
	u4                   pos;

	/* mixing step, once per byte from the first to the last */

	for (pos = 0; pos < length; pos++) {
		sum += bytes[pos];
		sum += (sum << 10);
		sum ^= (sum >> 6);
	}

	/* final scrambling of the accumulated value */

	sum += (sum << 3);
	sum ^= (sum >> 11);
	sum += (sum << 15);

	return sum;
}
656
657 /* unicode_hashkey *************************************************************
658
659    Compute the hashkey of a unicode string.
660
661 *******************************************************************************/
662
663 u4 unicode_hashkey(u2 *text, u2 len)
664 {
665         return utf_hashkey((char *) text, len);
666 }
667
668
669 /* utf_new *********************************************************************
670
671    Creates a new utf-symbol, the text of the symbol is passed as a
672    u1-array. The function searches the utf-hashtable for a utf-symbol
673    with this text. On success the element returned, otherwise a new
674    hashtable element is created.
675
676    If the number of entries in the hashtable exceeds twice the size of
677    the hashtable slots a reorganization of the hashtable is done and
678    the utf symbols are copied to a new hashtable with doubled size.
679
680 *******************************************************************************/
681
682 utf *utf_new(const char *text, u2 length)
683 {
684         u4 key;                             /* hashkey computed from utf-text     */
685         u4 slot;                            /* slot in hashtable                  */
686         utf *u;                             /* hashtable element                  */
687         u2 i;
688
689         LOCK_MONITOR_ENTER(hashtable_utf->header);
690
691 #if defined(ENABLE_STATISTICS)
692         if (opt_stat)
693                 count_utf_new++;
694 #endif
695
696         key  = utf_hashkey(text, length);
697         slot = key & (hashtable_utf->size - 1);
698         u    = hashtable_utf->ptr[slot];
699
700         /* search external hash chain for utf-symbol */
701
702         while (u) {
703                 if (u->blength == length) {
704                         /* compare text of hashtable elements */
705
706                         for (i = 0; i < length; i++)
707                                 if (text[i] != u->text[i])
708                                         goto nomatch;
709                         
710 #if defined(ENABLE_STATISTICS)
711                         if (opt_stat)
712                                 count_utf_new_found++;
713 #endif
714
715                         /* symbol found in hashtable */
716
717                         LOCK_MONITOR_EXIT(hashtable_utf->header);
718
719                         return u;
720                 }
721
722         nomatch:
723                 u = u->hashlink; /* next element in external chain */
724         }
725
726         /* location in hashtable found, create new utf element */
727
728         u = NEW(utf);
729
730         u->blength  = length;               /* length in bytes of utfstring       */
731         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
732         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
733
734         memcpy(u->text, text, length);      /* copy utf-text                      */
735         u->text[length] = '\0';
736
737 #if defined(ENABLE_STATISTICS)
738         if (opt_stat)
739                 count_utf_len += sizeof(utf) + length + 1;
740 #endif
741
742         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
743         hashtable_utf->entries++;           /* update number of entries           */
744
745         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
746
747         /* reorganization of hashtable, average length of the external
748            chains is approx. 2 */
749
750                 hashtable *newhash;                              /* the new hashtable */
751                 u4         i;
752                 utf       *u;
753                 utf       *nextu;
754                 u4         slot;
755
756                 /* create new hashtable, double the size */
757
758                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
759
760 #if defined(ENABLE_STATISTICS)
761                 if (opt_stat)
762                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
763 #endif
764
765                 /* transfer elements to new hashtable */
766
767                 for (i = 0; i < hashtable_utf->size; i++) {
768                         u = hashtable_utf->ptr[i];
769
770                         while (u) {
771                                 nextu = u->hashlink;
772                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
773                                                 
774                                 u->hashlink = (utf *) newhash->ptr[slot];
775                                 newhash->ptr[slot] = u;
776
777                                 /* follow link in external hash chain */
778
779                                 u = nextu;
780                         }
781                 }
782         
783                 /* dispose old table */
784
785                 hashtable_free(hashtable_utf);
786
787                 hashtable_utf = newhash;
788         }
789
790         LOCK_MONITOR_EXIT(hashtable_utf->header);
791
792         return u;
793 }
794
795
796 /* utf_new_u2 ******************************************************************
797
798    Make utf symbol from u2 array, if isclassname is true '.' is
799    replaced by '/'.
800
801 *******************************************************************************/
802
803 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
804 {
805         char *buffer;                   /* memory buffer for  unicode characters  */
806         char *pos;                      /* pointer to current position in buffer  */
807         u4 left;                        /* unicode characters left                */
808         u4 buflength;                   /* utf length in bytes of the u2 array    */
809         utf *result;                    /* resulting utf-string                   */
810         int i;          
811
812         /* determine utf length in bytes and allocate memory */
813
814         buflength = u2_utflength(unicode_pos, unicode_length); 
815         buffer    = MNEW(char, buflength);
816  
817         left = buflength;
818         pos  = buffer;
819
820         for (i = 0; i++ < unicode_length; unicode_pos++) {
821                 /* next unicode character */
822                 u2 c = *unicode_pos;
823                 
824                 if ((c != 0) && (c < 0x80)) {
825                         /* 1 character */       
826                         left--;
827                 if ((int) left < 0) break;
828                         /* convert classname */
829                         if (isclassname && c == '.')
830                                 *pos++ = '/';
831                         else
832                                 *pos++ = (char) c;
833
834                 } else if (c < 0x800) {             
835                         /* 2 characters */                              
836                 unsigned char high = c >> 6;
837                 unsigned char low  = c & 0x3F;
838                         left = left - 2;
839                 if ((int) left < 0) break;
840                 *pos++ = high | 0xC0; 
841                 *pos++ = low  | 0x80;     
842
843                 } else {         
844                 /* 3 characters */                              
845                 char low  = c & 0x3f;
846                 char mid  = (c >> 6) & 0x3F;
847                 char high = c >> 12;
848                         left = left - 3;
849                 if ((int) left < 0) break;
850                 *pos++ = high | 0xE0; 
851                 *pos++ = mid  | 0x80;  
852                 *pos++ = low  | 0x80;   
853                 }
854         }
855         
856         /* insert utf-string into symbol-table */
857         result = utf_new(buffer,buflength);
858
859         MFREE(buffer, char, buflength);
860
861         return result;
862 }
863
864
865 /* utf_new_char ****************************************************************
866
867    Creates a new utf symbol, the text for this symbol is passed as a
868    c-string ( = char* ).
869
870 *******************************************************************************/
871
872 utf *utf_new_char(const char *text)
873 {
874         return utf_new(text, strlen(text));
875 }
876
877
878 /* utf_new_char_classname ******************************************************
879
880    Creates a new utf symbol, the text for this symbol is passed as a
881    c-string ( = char* ) "." characters are going to be replaced by
882    "/". Since the above function is used often, this is a separte
883    function, instead of an if.
884
885 *******************************************************************************/
886
887 utf *utf_new_char_classname(const char *text)
888 {
889         if (strchr(text, '.')) {
890                 char *txt = strdup(text);
891                 char *end = txt + strlen(txt);
892                 char *c;
893                 utf *tmpRes;
894
895                 for (c = txt; c < end; c++)
896                         if (*c == '.') *c = '/';
897
898                 tmpRes = utf_new(txt, strlen(txt));
899                 FREE(txt, 0);
900
901                 return tmpRes;
902
903         } else
904                 return utf_new(text, strlen(text));
905 }
906
907
908 /* utf_nextu2 ******************************************************************
909
910    Read the next unicode character from the utf string and increment
911    the utf-string pointer accordingly.
912
913    CAUTION: This function is unsafe for input that was not checked 
914             by is_valid_utf!
915
916 *******************************************************************************/
917
918 u2 utf_nextu2(char **utf_ptr)
919 {
920     /* uncompressed unicode character */
921     u2 unicode_char = 0;
922     /* current position in utf text */  
923     unsigned char *utf = (unsigned char *) (*utf_ptr);
924     /* bytes representing the unicode character */
925     unsigned char ch1, ch2, ch3;
926     /* number of bytes used to represent the unicode character */
927     int len = 0;
928         
929     switch ((ch1 = utf[0]) >> 4) {
930         default: /* 1 byte */
931                 (*utf_ptr)++;
932                 return (u2) ch1;
933         case 0xC: 
934         case 0xD: /* 2 bytes */
935                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
936                         unsigned char high = ch1 & 0x1F;
937                         unsigned char low  = ch2 & 0x3F;
938                         unicode_char = (high << 6) + low;
939                         len = 2;
940                 }
941                 break;
942
943         case 0xE: /* 2 or 3 bytes */
944                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
945                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
946                                 unsigned char low  = ch3 & 0x3f;
947                                 unsigned char mid  = ch2 & 0x3f;
948                                 unsigned char high = ch1 & 0x0f;
949                                 unicode_char = (((high << 6) + mid) << 6) + low;
950                                 len = 3;
951                         } else
952                                 len = 2;                                           
953                 }
954                 break;
955     }
956
957     /* update position in utf-text */
958     *utf_ptr = (char *) (utf + len);
959
960     return unicode_char;
961 }
962
963
964 /* utf_bytes *******************************************************************
965
966    Determine number of bytes (aka. octets) in the utf string.
967
968    IN:
969       u............utf string
970
971    OUT:
972       The number of octets of this utf string.
973           There is _no_ terminating zero included in this count.
974
975 *******************************************************************************/
976
977 u4 utf_bytes(utf *u)
978 {
979         return u->blength;
980 }
981
982
983 /* utf_get_number_of_u2s_for_buffer ********************************************
984
985    Determine number of UTF-16 u2s in the given UTF-8 buffer
986
987    CAUTION: This function is unsafe for input that was not checked 
988             by is_valid_utf!
989
990    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
991    to an array of u2s (UTF-16) and want to know how many of them you will get.
992    All other uses of this function are probably wrong.
993
994    IN:
995       buffer........points to first char in buffer
996           blength.......number of _bytes_ in the buffer
997
998    OUT:
999       the number of u2s needed to hold this string in UTF-16 encoding.
1000           There is _no_ terminating zero included in this count.
1001
1002    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
1003    exception.
1004
1005 *******************************************************************************/
1006
1007 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
1008 {
1009         const char *endpos;                 /* points behind utf string           */
1010         const char *utf_ptr;                /* current position in utf text       */
1011         u4 len = 0;                         /* number of unicode characters       */
1012
1013         utf_ptr = buffer;
1014         endpos = utf_ptr + blength;
1015
1016         while (utf_ptr < endpos) {
1017                 len++;
1018                 /* next unicode character */
1019                 utf_nextu2((char **)&utf_ptr);
1020         }
1021
1022         assert(utf_ptr == endpos);
1023
1024         return len;
1025 }
1026
1027
1028 /* utf_get_number_of_u2s *******************************************************
1029
1030    Determine number of UTF-16 u2s in the utf string.
1031
1032    CAUTION: This function is unsafe for input that was not checked 
1033             by is_valid_utf!
1034
1035    CAUTION: Use this function *only* when you want to convert a utf string
1036    to an array of u2s and want to know how many of them you will get.
1037    All other uses of this function are probably wrong.
1038
1039    IN:
1040       u............utf string
1041
1042    OUT:
1043       the number of u2s needed to hold this string in UTF-16 encoding.
1044           There is _no_ terminating zero included in this count.
1045           XXX 0 if a NullPointerException has been thrown (see below)
1046
1047 *******************************************************************************/
1048
1049 u4 utf_get_number_of_u2s(utf *u)
1050 {
1051         char *endpos;                       /* points behind utf string           */
1052         char *utf_ptr;                      /* current position in utf text       */
1053         u4 len = 0;                         /* number of unicode characters       */
1054
1055         /* XXX this is probably not checked by most callers! Review this after */
1056         /* the invalid uses of this function have been eliminated */
1057         if (u == NULL) {
1058                 exceptions_throw_nullpointerexception();
1059                 return 0;
1060         }
1061
1062         endpos = UTF_END(u);
1063         utf_ptr = u->text;
1064
1065         while (utf_ptr < endpos) {
1066                 len++;
1067                 /* next unicode character */
1068                 utf_nextu2(&utf_ptr);
1069         }
1070
1071         if (utf_ptr != endpos) {
1072                 /* string ended abruptly */
1073                 exceptions_throw_internalerror("Illegal utf8 string");
1074                 return 0;
1075         }
1076
1077         return len;
1078 }
1079
1080
1081 /* utf8_safe_number_of_u2s *****************************************************
1082
1083    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1084    (For invalid UTF-8 the U+fffd replacement character will be counted.)
1085
1086    This function is safe even for invalid UTF-8 strings.
1087
1088    IN:
1089       text..........zero-terminated(!) UTF-8 string (may be invalid)
1090                         must NOT be NULL
1091           nbytes........strlen(text). (This is needed to completely emulate
1092                         the RI).
1093
1094    OUT:
1095       the number of u2s needed to hold this string in UTF-16 encoding.
1096           There is _no_ terminating zero included in this count.
1097
1098 *******************************************************************************/
1099
1100 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1101         register const unsigned char *t;
1102         register s4 byte;
1103         register s4 len;
1104         register const unsigned char *tlimit;
1105         s4 byte1;
1106         s4 byte2;
1107         s4 byte3;
1108         s4 value;
1109         s4 skip;
1110
1111         assert(text);
1112         assert(nbytes >= 0);
1113
1114         len = 0;
1115         t = (const unsigned char *) text;
1116         tlimit = t + nbytes;
1117
1118         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1119
1120         while (1) {
1121                 byte = *t++;
1122
1123                 if (byte & 0x80) {
1124                         /* highest bit set, non-ASCII character */
1125
1126                         if ((byte & 0xe0) == 0xc0) {
1127                                 /* 2-byte: should be 110..... 10...... ? */
1128
1129                                 if ((*t++ & 0xc0) == 0x80)
1130                                         ; /* valid 2-byte */
1131                                 else
1132                                         t--; /* invalid */
1133                         }
1134                         else if ((byte & 0xf0) == 0xe0) {
1135                                 /* 3-byte: should be 1110.... 10...... 10...... */
1136                                 /*                            ^t                */
1137
1138                                 if (t + 2 > tlimit)
1139                                         return len + 1; /* invalid, stop here */
1140
1141                                 if ((*t++ & 0xc0) == 0x80) {
1142                                         if ((*t++ & 0xc0) == 0x80)
1143                                                 ; /* valid 3-byte */
1144                                         else
1145                                                 t--; /* invalid */
1146                                 }
1147                                 else
1148                                         t--; /* invalid */
1149                         }
1150                         else if ((byte & 0xf8) == 0xf0) {
1151                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1152                                 /*                            ^t                         */
1153
1154                                 if (t + 3 > tlimit)
1155                                         return len + 1; /* invalid, stop here */
1156
1157                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1158                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1159                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1160                                                         /* valid 4-byte UTF-8? */
1161                                                         value = ((byte  & 0x07) << 18)
1162                                                                   | ((byte1 & 0x3f) << 12)
1163                                                                   | ((byte2 & 0x3f) <<  6)
1164                                                                   | ((byte3 & 0x3f)      );
1165
1166                                                         if (value > 0x10FFFF)
1167                                                                 ; /* invalid */
1168                                                         else if (value > 0xFFFF)
1169                                                                 len += 1; /* we need surrogates */
1170                                                         else
1171                                                                 ; /* 16bit suffice */
1172                                                 }
1173                                                 else
1174                                                         t--; /* invalid */
1175                                         }
1176                                         else
1177                                                 t--; /* invalid */
1178                                 }
1179                                 else
1180                                         t--; /* invalid */
1181                         }
1182                         else if ((byte & 0xfc) == 0xf8) {
1183                                 /* invalid 5-byte */
1184                                 if (t + 4 > tlimit)
1185                                         return len + 1; /* invalid, stop here */
1186
1187                                 skip = 4;
1188                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1189                                         t++;
1190                         }
1191                         else if ((byte & 0xfe) == 0xfc) {
1192                                 /* invalid 6-byte */
1193                                 if (t + 5 > tlimit)
1194                                         return len + 1; /* invalid, stop here */
1195
1196                                 skip = 5;
1197                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1198                                         t++;
1199                         }
1200                         else
1201                                 ; /* invalid */
1202                 }
1203                 else {
1204                         /* NUL */
1205
1206                         if (byte == 0)
1207                                 break;
1208
1209                         /* ASCII character, common case */
1210                 }
1211
1212                 len++;
1213         }
1214
1215         return len;
1216 }
1217
1218
1219 /* utf8_safe_convert_to_u2s ****************************************************
1220
1221    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1222    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1223    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1224
1225    This function is safe even for invalid UTF-8 strings.
1226
1227    IN:
1228       text..........zero-terminated(!) UTF-8 string (may be invalid)
1229                         must NOT be NULL
1230           nbytes........strlen(text). (This is needed to completely emulate
1231                                         the RI).
1232           buffer........a preallocated array of u2s to receive the decoded
1233                         string. Use utf8_safe_number_of_u2s to get the
1234                                         required number of u2s for allocating this.
1235
1236 *******************************************************************************/
1237
1238 #define UNICODE_REPLACEMENT  0xfffd
1239
1240 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1241         register const unsigned char *t;
1242         register s4 byte;
1243         register const unsigned char *tlimit;
1244         s4 byte1;
1245         s4 byte2;
1246         s4 byte3;
1247         s4 value;
1248         s4 skip;
1249
1250         assert(text);
1251         assert(nbytes >= 0);
1252
1253         t = (const unsigned char *) text;
1254         tlimit = t + nbytes;
1255
1256         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1257
1258         while (1) {
1259                 byte = *t++;
1260
1261                 if (byte & 0x80) {
1262                         /* highest bit set, non-ASCII character */
1263
1264                         if ((byte & 0xe0) == 0xc0) {
1265                                 /* 2-byte: should be 110..... 10...... */
1266
1267                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1268                                         /* valid 2-byte UTF-8 */
1269                                         *buffer++ = ((byte  & 0x1f) << 6)
1270                                                           | ((byte1 & 0x3f)     );
1271                                 }
1272                                 else {
1273                                         *buffer++ = UNICODE_REPLACEMENT;
1274                                         t--;
1275                                 }
1276                         }
1277                         else if ((byte & 0xf0) == 0xe0) {
1278                                 /* 3-byte: should be 1110.... 10...... 10...... */
1279
1280                                 if (t + 2 > tlimit) {
1281                                         *buffer++ = UNICODE_REPLACEMENT;
1282                                         return;
1283                                 }
1284
1285                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1286                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1287                                                 /* valid 3-byte UTF-8 */
1288                                                 *buffer++ = ((byte  & 0x0f) << 12)
1289                                                                   | ((byte1 & 0x3f) <<  6)
1290                                                                   | ((byte2 & 0x3f)      );
1291                                         }
1292                                         else {
1293                                                 *buffer++ = UNICODE_REPLACEMENT;
1294                                                 t--;
1295                                         }
1296                                 }
1297                                 else {
1298                                         *buffer++ = UNICODE_REPLACEMENT;
1299                                         t--;
1300                                 }
1301                         }
1302                         else if ((byte & 0xf8) == 0xf0) {
1303                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1304
1305                                 if (t + 3 > tlimit) {
1306                                         *buffer++ = UNICODE_REPLACEMENT;
1307                                         return;
1308                                 }
1309
1310                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1311                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1312                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1313                                                         /* valid 4-byte UTF-8? */
1314                                                         value = ((byte  & 0x07) << 18)
1315                                                                   | ((byte1 & 0x3f) << 12)
1316                                                                   | ((byte2 & 0x3f) <<  6)
1317                                                                   | ((byte3 & 0x3f)      );
1318
1319                                                         if (value > 0x10FFFF) {
1320                                                                 *buffer++ = UNICODE_REPLACEMENT;
1321                                                         }
1322                                                         else if (value > 0xFFFF) {
1323                                                                 /* we need surrogates */
1324                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1325                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1326                                                         }
1327                                                         else
1328                                                                 *buffer++ = value; /* 16bit suffice */
1329                                                 }
1330                                                 else {
1331                                                         *buffer++ = UNICODE_REPLACEMENT;
1332                                                         t--;
1333                                                 }
1334                                         }
1335                                         else {
1336                                                 *buffer++ = UNICODE_REPLACEMENT;
1337                                                 t--;
1338                                         }
1339                                 }
1340                                 else {
1341                                         *buffer++ = UNICODE_REPLACEMENT;
1342                                         t--;
1343                                 }
1344                         }
1345                         else if ((byte & 0xfc) == 0xf8) {
1346                                 if (t + 4 > tlimit) {
1347                                         *buffer++ = UNICODE_REPLACEMENT;
1348                                         return;
1349                                 }
1350
1351                                 skip = 4;
1352                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1353                                         t++;
1354                                 *buffer++ = UNICODE_REPLACEMENT;
1355                         }
1356                         else if ((byte & 0xfe) == 0xfc) {
1357                                 if (t + 5 > tlimit) {
1358                                         *buffer++ = UNICODE_REPLACEMENT;
1359                                         return;
1360                                 }
1361
1362                                 skip = 5;
1363                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1364                                         t++;
1365                                 *buffer++ = UNICODE_REPLACEMENT;
1366                         }
1367                         else
1368                                 *buffer++ = UNICODE_REPLACEMENT;
1369                 }
1370                 else {
1371                         /* NUL */
1372
1373                         if (byte == 0)
1374                                 break;
1375
1376                         /* ASCII character, common case */
1377
1378                         *buffer++ = byte;
1379                 }
1380         }
1381 }
1382
1383
1384 /* u2_utflength ****************************************************************
1385
1386    Returns the utf length in bytes of a u2 array.
1387
1388 *******************************************************************************/
1389
1390 u4 u2_utflength(u2 *text, u4 u2_length)
1391 {
1392         u4 result_len = 0;                  /* utf length in bytes                */
1393         u2 ch;                              /* current unicode character          */
1394         u4 len;
1395         
1396         for (len = 0; len < u2_length; len++) {
1397                 /* next unicode character */
1398                 ch = *text++;
1399           
1400                 /* determine bytes required to store unicode character as utf */
1401                 if (ch && (ch < 0x80)) 
1402                         result_len++;
1403                 else if (ch < 0x800)
1404                         result_len += 2;        
1405                 else 
1406                         result_len += 3;        
1407         }
1408
1409     return result_len;
1410 }
1411
1412
1413 /* utf_copy ********************************************************************
1414
1415    Copy the given utf string byte-for-byte to a buffer.
1416
1417    IN:
1418       buffer.......the buffer
1419           u............the utf string
1420
1421 *******************************************************************************/
1422
1423 void utf_copy(char *buffer, utf *u)
1424 {
1425         /* our utf strings are zero-terminated (done by utf_new) */
1426         MCOPY(buffer, u->text, char, u->blength + 1);
1427 }
1428
1429
1430 /* utf_cat *********************************************************************
1431
1432    Append the given utf string byte-for-byte to a buffer.
1433
1434    IN:
1435       buffer.......the buffer
1436           u............the utf string
1437
1438 *******************************************************************************/
1439
1440 void utf_cat(char *buffer, utf *u)
1441 {
1442         /* our utf strings are zero-terminated (done by utf_new) */
1443         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1444 }
1445
1446
1447 /* utf_copy_classname **********************************************************
1448
1449    Copy the given utf classname byte-for-byte to a buffer.
1450    '/' is replaced by '.'
1451
1452    IN:
1453       buffer.......the buffer
1454           u............the utf string
1455
1456 *******************************************************************************/
1457
1458 void utf_copy_classname(char *buffer, utf *u)
1459 {
1460         char *bufptr;
1461         char *srcptr;
1462         char *endptr;
1463         char ch;
1464
1465         bufptr = buffer;
1466         srcptr = u->text;
1467         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1468
1469         while (srcptr != endptr) {
1470                 ch = *srcptr++;
1471                 if (ch == '/')
1472                         ch = '.';
1473                 *bufptr++ = ch;
1474         }
1475 }
1476
1477
1478 /* utf_cat *********************************************************************
1479
1480    Append the given utf classname byte-for-byte to a buffer.
1481    '/' is replaced by '.'
1482
1483    IN:
1484       buffer.......the buffer
1485           u............the utf string
1486
1487 *******************************************************************************/
1488
1489 void utf_cat_classname(char *buffer, utf *u)
1490 {
1491         utf_copy_classname(buffer + strlen(buffer), u);
1492 }
1493
1494 /* utf_display_printable_ascii *************************************************
1495
1496    Write utf symbol to stdout (for debugging purposes).
1497    Non-printable and non-ASCII characters are printed as '?'.
1498
1499 *******************************************************************************/
1500
1501 void utf_display_printable_ascii(utf *u)
1502 {
1503         char *endpos;                       /* points behind utf string           */
1504         char *utf_ptr;                      /* current position in utf text       */
1505
1506         if (u == NULL) {
1507                 printf("NULL");
1508                 fflush(stdout);
1509                 return;
1510         }
1511
1512         endpos = UTF_END(u);
1513         utf_ptr = u->text;
1514
1515         while (utf_ptr < endpos) {
1516                 /* read next unicode character */
1517
1518                 u2 c = utf_nextu2(&utf_ptr);
1519
1520                 if ((c >= 32) && (c <= 127))
1521                         printf("%c", c);
1522                 else
1523                         printf("?");
1524         }
1525
1526         fflush(stdout);
1527 }
1528
1529
1530 /* utf_display_printable_ascii_classname ***************************************
1531
1532    Write utf symbol to stdout with `/' converted to `.' (for debugging
1533    purposes).
1534    Non-printable and non-ASCII characters are printed as '?'.
1535
1536 *******************************************************************************/
1537
1538 void utf_display_printable_ascii_classname(utf *u)
1539 {
1540         char *endpos;                       /* points behind utf string           */
1541         char *utf_ptr;                      /* current position in utf text       */
1542
1543         if (u == NULL) {
1544                 printf("NULL");
1545                 fflush(stdout);
1546                 return;
1547         }
1548
1549         endpos = UTF_END(u);
1550         utf_ptr = u->text;
1551
1552         while (utf_ptr < endpos) {
1553                 /* read next unicode character */
1554
1555                 u2 c = utf_nextu2(&utf_ptr);
1556
1557                 if (c == '/')
1558                         c = '.';
1559
1560                 if ((c >= 32) && (c <= 127))
1561                         printf("%c", c);
1562                 else
1563                         printf("?");
1564         }
1565
1566         fflush(stdout);
1567 }
1568
1569
1570 /* utf_sprint_convert_to_latin1 ************************************************
1571         
1572    Write utf symbol into c-string (for debugging purposes).
1573    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1574    invalid results.
1575
1576 *******************************************************************************/
1577
1578 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1579 {
1580         char *endpos;                       /* points behind utf string           */
1581         char *utf_ptr;                      /* current position in utf text       */
1582         u2 pos = 0;                         /* position in c-string               */
1583
1584         if (!u) {
1585                 strcpy(buffer, "NULL");
1586                 return;
1587         }
1588
1589         endpos = UTF_END(u);
1590         utf_ptr = u->text;
1591
1592         while (utf_ptr < endpos) 
1593                 /* copy next unicode character */       
1594                 buffer[pos++] = utf_nextu2(&utf_ptr);
1595
1596         /* terminate string */
1597         buffer[pos] = '\0';
1598 }
1599
1600
1601 /* utf_sprint_convert_to_latin1_classname **************************************
1602         
1603    Write utf symbol into c-string with `/' converted to `.' (for debugging
1604    purposes).
1605    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1606    invalid results.
1607
1608 *******************************************************************************/
1609
1610 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1611 {
1612         char *endpos;                       /* points behind utf string           */
1613         char *utf_ptr;                      /* current position in utf text       */
1614         u2 pos = 0;                         /* position in c-string               */
1615
1616         if (!u) {
1617                 strcpy(buffer, "NULL");
1618                 return;
1619         }
1620
1621         endpos = UTF_END(u);
1622         utf_ptr = u->text;
1623
1624         while (utf_ptr < endpos) {
1625                 /* copy next unicode character */       
1626                 u2 c = utf_nextu2(&utf_ptr);
1627                 if (c == '/') c = '.';
1628                 buffer[pos++] = c;
1629         }
1630
1631         /* terminate string */
1632         buffer[pos] = '\0';
1633 }
1634
1635
1636 /* utf_strcat_convert_to_latin1 ************************************************
1637         
1638    Like libc strcat, but uses an utf8 string.
1639    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1640    invalid results.
1641
1642 *******************************************************************************/
1643
1644 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1645 {
1646         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1647 }
1648
1649
1650 /* utf_strcat_convert_to_latin1_classname **************************************
1651         
1652    Like libc strcat, but uses an utf8 string.
1653    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1654    invalid results.
1655
1656 *******************************************************************************/
1657
1658 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1659 {
1660         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1661 }
1662
1663
1664 /* utf_fprint_printable_ascii **************************************************
1665         
1666    Write utf symbol into file.
1667    Non-printable and non-ASCII characters are printed as '?'.
1668
1669 *******************************************************************************/
1670
1671 void utf_fprint_printable_ascii(FILE *file, utf *u)
1672 {
1673         char *endpos;                       /* points behind utf string           */
1674         char *utf_ptr;                      /* current position in utf text       */
1675
1676         if (!u)
1677                 return;
1678
1679         endpos = UTF_END(u);
1680         utf_ptr = u->text;
1681
1682         while (utf_ptr < endpos) { 
1683                 /* read next unicode character */                
1684                 u2 c = utf_nextu2(&utf_ptr);                            
1685
1686                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1687                 else fprintf(file, "?");
1688         }
1689 }
1690
1691
1692 /* utf_fprint_printable_ascii_classname ****************************************
1693         
1694    Write utf symbol into file with `/' converted to `.'.
1695    Non-printable and non-ASCII characters are printed as '?'.
1696
1697 *******************************************************************************/
1698
1699 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1700 {
1701         char *endpos;                       /* points behind utf string           */
1702         char *utf_ptr;                      /* current position in utf text       */
1703
1704     if (!u)
1705                 return;
1706
1707         endpos = UTF_END(u);
1708         utf_ptr = u->text;
1709
1710         while (utf_ptr < endpos) { 
1711                 /* read next unicode character */                
1712                 u2 c = utf_nextu2(&utf_ptr);                            
1713                 if (c == '/') c = '.';
1714
1715                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1716                 else fprintf(file, "?");
1717         }
1718 }
1719
1720
/* is_valid_utf ****************************************************************

   Checks whether the bytes in [utf_ptr, end_pos) form a UTF-8 string
   as it is accepted in Java class files. Returns true if so.

   Accepted are ASCII bytes 0x01..0x7f, 2-byte sequences and 3-byte
   sequences. Rejected are: a raw 0x00 byte, stray continuation bytes,
   lead bytes of sequences longer than 3 bytes, truncated sequences,
   and a zero code point in anything but the 2-byte form 0xc0 0x80.
   An empty range is valid; end_pos < utf_ptr is not.

   Overlong encodings, surrogates (0xd800..0xdfff) and the code points
   0xfffe/0xffff are deliberately NOT rejected, as Sun Java seems to
   allow them in class files. Should overlong forms ever have to be
   detected, the smallest code point per sequence length (1..6 bytes)
   is 0, 1<<7, 1<<11, 1<<16, 1<<21, 1<<26.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int           remaining;            /* bytes not yet consumed             */
	int           trailing;             /* continuation bytes of a sequence   */
	int           k;
	unsigned char byte;                 /* byte under inspection              */
	unsigned long value;                /* decoded code point                 */

	if (end_pos < utf_ptr)
		return false;

	remaining = end_pos - utf_ptr;

	while (remaining != 0) {
		byte = (unsigned char) *utf_ptr++;
		remaining--;

		/* a raw 0x00 byte is never allowed */

		if (byte == 0)
			return false;

		/* plain ASCII needs no further checks */

		if (byte < 0x80)
			continue;

		/* Only the lead bytes 110x xxxx and 1110 xxxx are accepted.
		   Everything else is refused here: stray continuation bytes,
		   0xfe/0xff, and the lead bytes of longer sequences (Java
		   limitation). */

		if ((byte & 0xe0) == 0xc0)
			trailing = 1;
		else if ((byte & 0xf0) == 0xe0)
			trailing = 2;
		else
			return false;

		/* the sequence must be complete */

		if (remaining < trailing)
			return false;

		remaining -= trailing;

		/* payload bits of the lead byte */

		value = byte & (0x3f >> trailing);

		for (k = 0; k < trailing; k++) {
			byte = (unsigned char) *utf_ptr++;

			if ((byte & 0xc0) != 0x80)      /* 10xx xxxx expected */
				return false;

			value = (value << 6) | (byte & 0x3f);
		}

		/* Java special: zero is only valid as 0xc0 0x80 */

		if ((value == 0) && (trailing != 1))
			return false;
	}

	return true;
}
1786
1787
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, bytes below
   0x20 and the encoded zero character 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Only bytes inside [utf_ptr, end_pos) are examined; a 0xc0 that is
   the last byte of the range is not compared with whatever follows
   the string.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr) return false; /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20) return false; /* disallow control characters */

		/* disallow zero (0xc0 0x80); look at the second byte only if it
		   still belongs to the string, never at *end_pos */

		if (c == 0xc0 && utf_ptr < end_pos
			&& (unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
1815
1816 bool is_valid_name_utf(utf *u)
1817 {
1818         return is_valid_name(u->text, UTF_END(u));
1819 }
1820
1821
1822 /* utf_show ********************************************************************
1823
1824    Writes the utf symbols in the utfhash to stdout and displays the
1825    number of external hash chains grouped according to the chainlength
1826    (for debugging purposes).
1827
1828 *******************************************************************************/
1829
1830 #if !defined(NDEBUG)
1831 void utf_show(void)
1832 {
1833
1834 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1835
1836         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1837         u4 max_chainlength = 0;      /* maximum length of the chains */
1838         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1839         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1840         u4 i;
1841
1842         printf("UTF-HASH:\n");
1843
1844         /* show element of utf-hashtable */
1845
1846         for (i = 0; i < hashtable_utf->size; i++) {
1847                 utf *u = hashtable_utf->ptr[i];
1848
1849                 if (u) {
1850                         printf("SLOT %d: ", (int) i);
1851
1852                         while (u) {
1853                                 printf("'");
1854                                 utf_display_printable_ascii(u);
1855                                 printf("' ");
1856                                 u = u->hashlink;
1857                         }       
1858                         printf("\n");
1859                 }
1860         }
1861
1862         printf("UTF-HASH: %d slots for %d entries\n", 
1863                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1864
1865         if (hashtable_utf->entries == 0)
1866                 return;
1867
1868         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1869
1870         for (i=0;i<CHAIN_LIMIT;i++)
1871                 chain_count[i]=0;
1872
1873         /* count numbers of hashchains according to their length */
1874         for (i=0; i<hashtable_utf->size; i++) {
1875                   
1876                 utf *u = (utf*) hashtable_utf->ptr[i];
1877                 u4 chain_length = 0;
1878
1879                 /* determine chainlength */
1880                 while (u) {
1881                         u = u->hashlink;
1882                         chain_length++;
1883                 }
1884
1885                 /* update sum of all chainlengths */
1886                 sum_chainlength+=chain_length;
1887
1888                 /* determine the maximum length of the chains */
1889                 if (chain_length>max_chainlength)
1890                         max_chainlength = chain_length;
1891
1892                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1893                 if (chain_length>=CHAIN_LIMIT) {
1894                         beyond_limit+=chain_length;
1895                         chain_length=CHAIN_LIMIT-1;
1896                 }
1897
1898                 /* update number of hashchains of current length */
1899                 chain_count[chain_length]++;
1900         }
1901
1902         /* display results */  
1903         for (i=1;i<CHAIN_LIMIT-1;i++) 
1904                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1905           
1906         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1907
1908
1909         printf("max. chainlength:%5d\n",max_chainlength);
1910
1911         /* avg. chainlength = sum of chainlengths / number of chains */
1912         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1913 }
1914 #endif /* !defined(NDEBUG) */
1915
1916
1917 /*
1918  * These are local overrides for various environment variables in Emacs.
1919  * Please do not remove this and leave it at the end of the file, where
1920  * Emacs will automagically detect them.
1921  * ---------------------------------------------------------------------
1922  * Local variables:
1923  * mode: c
1924  * indent-tabs-mode: t
1925  * c-basic-offset: 4
1926  * tab-width: 4
1927  * End:
1928  * vim:noexpandtab:sw=4:ts=4:
1929  */