9cb1f1109dbb7d292c32b3f63efb6d1cdf7e2abd
[cacao.git] / src / vmcore / utf8.c
1 /* src/vmcore/utf8.c - utf8 string functions
2
3    Copyright (C) 1996-2005, 2006, 2007, 2008
4    CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
5
6    This file is part of CACAO.
7
8    This program is free software; you can redistribute it and/or
9    modify it under the terms of the GNU General Public License as
10    published by the Free Software Foundation; either version 2, or (at
11    your option) any later version.
12
13    This program is distributed in the hope that it will be useful, but
14    WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16    General Public License for more details.
17
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, write to the Free Software
20    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21    02110-1301, USA.
22
23 */
24
25
26 #include "config.h"
27
28 #include <string.h>
29 #include <assert.h>
30
31 #include "vm/types.h"
32
33 #include "mm/memory.h"
34
35 #include "threads/lock-common.h"
36
37 #include "toolbox/hashtable.h"
38
39 #include "vm/exceptions.h"
40
41 #include "vmcore/options.h"
42
43 #if defined(ENABLE_STATISTICS)
44 # include "vmcore/statistics.h"
45 #endif
46
47 #include "vmcore/utf8.h"
48
49
50 /* global variables ***********************************************************/
51
/* The number of slots must be a power of 2: utf_new() selects the slot
   with `key & (size - 1)`, which is only a valid reduction for such
   sizes. */

#define HASHTABLE_UTF_SIZE    16384     /* initial number of slots, passed to
                                           hashtable_create() by utf8_init()  */

hashtable *hashtable_utf;               /* global hashtable of all utf8
                                           symbols; created by utf8_init(),
                                           replaced by utf_new() when it grows
                                           the table                          */
57
58
/* utf-symbols for pointer comparison of frequently used strings **************/

/* All symbols below are assigned by utf8_init() via utf_new_char().  Unless
   a comment says otherwise, the interned text is the variable name without
   the `utf_` prefix (for class names with '_' standing for '/'). */

/* class names */

utf *utf_java_lang_Object;

utf *utf_java_lang_Class;
utf *utf_java_lang_ClassLoader;
utf *utf_java_lang_Cloneable;
utf *utf_java_lang_SecurityManager;
utf *utf_java_lang_String;
utf *utf_java_lang_ThreadGroup;
utf *utf_java_lang_ref_SoftReference;
utf *utf_java_lang_ref_WeakReference;
utf *utf_java_lang_ref_PhantomReference;
utf *utf_java_io_Serializable;

utf *utf_java_lang_Throwable;
utf *utf_java_lang_Error;

/* error class names; AbstractMethodError, NoSuchFieldError and
   NoSuchMethodError are defined unconditionally here but are only assigned
   by utf8_init() when ENABLE_JAVASE is defined */

utf *utf_java_lang_AbstractMethodError;
utf *utf_java_lang_ClassCircularityError;
utf *utf_java_lang_ClassFormatError;
utf *utf_java_lang_ExceptionInInitializerError;
utf *utf_java_lang_IncompatibleClassChangeError;
utf *utf_java_lang_InstantiationError;
utf *utf_java_lang_InternalError;
utf *utf_java_lang_LinkageError;
utf *utf_java_lang_NoClassDefFoundError;
utf *utf_java_lang_NoSuchFieldError;
utf *utf_java_lang_NoSuchMethodError;
utf *utf_java_lang_OutOfMemoryError;
utf *utf_java_lang_UnsatisfiedLinkError;
utf *utf_java_lang_UnsupportedClassVersionError;
utf *utf_java_lang_VerifyError;
utf *utf_java_lang_VirtualMachineError;

/* exception class names */

utf *utf_java_lang_Exception;

utf *utf_java_lang_ArithmeticException;
utf *utf_java_lang_ArrayIndexOutOfBoundsException;
utf *utf_java_lang_ArrayStoreException;
utf *utf_java_lang_ClassCastException;
utf *utf_java_lang_ClassNotFoundException;
utf *utf_java_lang_CloneNotSupportedException;
utf *utf_java_lang_IllegalAccessException;
utf *utf_java_lang_IllegalArgumentException;
utf *utf_java_lang_IllegalMonitorStateException;
utf *utf_java_lang_InstantiationException;
utf *utf_java_lang_InterruptedException;
utf *utf_java_lang_NegativeArraySizeException;
utf *utf_java_lang_NullPointerException;
utf *utf_java_lang_RuntimeException;
utf *utf_java_lang_StringIndexOutOfBoundsException;

utf *utf_java_lang_reflect_InvocationTargetException;

utf *utf_java_security_PrivilegedActionException;

/* wrapper class names */

#if defined(ENABLE_JAVASE)
utf* utf_java_lang_Void;
#endif

utf* utf_java_lang_Boolean;
utf* utf_java_lang_Byte;
utf* utf_java_lang_Character;
utf* utf_java_lang_Short;
utf* utf_java_lang_Integer;
utf* utf_java_lang_Long;
utf* utf_java_lang_Float;
utf* utf_java_lang_Double;

/* class names only available with ENABLE_JAVASE */

#if defined(ENABLE_JAVASE)
utf *utf_java_lang_StackTraceElement;
utf *utf_java_lang_reflect_Constructor;
utf *utf_java_lang_reflect_Field;
utf *utf_java_lang_reflect_Method;

# if defined(WITH_CLASSPATH_GNU)
utf *utf_java_lang_reflect_VMConstructor;
utf *utf_java_lang_reflect_VMField;
utf *utf_java_lang_reflect_VMMethod;
# endif

utf *utf_java_util_Vector;
#endif

/* attribute names */

utf *utf_InnerClasses;                  /* InnerClasses                       */
utf *utf_ConstantValue;                 /* ConstantValue                      */
utf *utf_Code;                          /* Code                               */
utf *utf_Exceptions;                    /* Exceptions                         */
utf *utf_LineNumberTable;               /* LineNumberTable                    */
utf *utf_SourceFile;                    /* SourceFile                         */

#if defined(ENABLE_JAVASE)
utf *utf_EnclosingMethod;               /* EnclosingMethod                    */
utf *utf_Signature;                     /* Signature                          */
utf *utf_StackMapTable;                 /* StackMapTable                      */

#if defined(ENABLE_ANNOTATIONS)
utf *utf_RuntimeVisibleAnnotations;            /* RuntimeVisibleAnnotations            */
utf *utf_RuntimeInvisibleAnnotations;          /* RuntimeInvisibleAnnotations          */
utf *utf_RuntimeVisibleParameterAnnotations;   /* RuntimeVisibleParameterAnnotations   */
utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
utf *utf_AnnotationDefault;                    /* AnnotationDefault                    */
#endif
#endif

/* method and field names */

utf *utf_init;                          /* <init>                             */
utf *utf_clinit;                        /* <clinit>                           */
utf *utf_clone;                         /* clone                              */
utf *utf_finalize;                      /* finalize                           */
utf *utf_invoke;                        /* invoke                             */
utf *utf_main;                          /* main                               */
utf *utf_run;                           /* run                                */

utf *utf_add;                           /* add                                */
utf *utf_remove;                        /* remove                             */
utf *utf_addThread;                     /* addThread                          */
utf *utf_removeThread;                  /* removeThread                       */
utf *utf_put;                           /* put                                */
utf *utf_get;                           /* get                                */
utf *utf_uncaughtException;             /* uncaughtException                  */
utf *utf_value;                         /* value                              */

utf *utf_fillInStackTrace;              /* fillInStackTrace                   */
utf *utf_findNative;                    /* findNative                         */
utf *utf_getSystemClassLoader;          /* getSystemClassLoader               */
utf *utf_initCause;                     /* initCause                          */
utf *utf_loadClass;                     /* loadClass                          */
utf *utf_loadClassInternal;             /* loadClassInternal                  */
utf *utf_printStackTrace;               /* printStackTrace                    */

utf *utf_division_by_zero;              /* "/ by zero"                        */

/* primitive type descriptors */

utf *utf_Z;                             /* Z                                  */
utf *utf_B;                             /* B                                  */
utf *utf_C;                             /* C                                  */
utf *utf_S;                             /* S                                  */
utf *utf_I;                             /* I                                  */
utf *utf_J;                             /* J                                  */
utf *utf_F;                             /* F                                  */
utf *utf_D;                             /* D                                  */

/* method descriptors */

utf *utf_void__void;                    /* ()V                                */
utf *utf_boolean__void;                 /* (Z)V                               */
utf *utf_byte__void;                    /* (B)V                               */
utf *utf_char__void;                    /* (C)V                               */
utf *utf_short__void;                   /* (S)V                               */
utf *utf_int__void;                     /* (I)V                               */
utf *utf_long__void;                    /* (J)V                               */
utf *utf_float__void;                   /* (F)V                               */
utf *utf_double__void;                  /* (D)V                               */

utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */

/* (Ljava/lang/ClassLoader;Ljava/lang/String;)J */
utf *utf_java_lang_ClassLoader_java_lang_String__J;
utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */

/* (Ljava/lang/Object;)Ljava/lang/Object; */
utf *utf_java_lang_Object__java_lang_Object;
utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */

/* (Ljava/lang/String;)Ljava/lang/Class; */
utf *utf_java_lang_String__java_lang_Class;
utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */

/* (Ljava/lang/Thread;Ljava/lang/Throwable;)V */
utf *utf_java_lang_Thread_java_lang_Throwable__V;

/* (Ljava/lang/ThreadGroup;Ljava/lang/String;)V */
utf *utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V;
utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */

/* (Ljava/lang/Throwable;)Ljava/lang/Throwable; */
utf *utf_java_lang_Throwable__java_lang_Throwable;

utf *utf_not_named_yet;                 /* special name for unnamed classes:
                                           "\t<not_named_yet>"                */
utf *utf_null;                          /* null                               */
utf *array_packagename;                 /* "\t<the array package>"            */
228
229
230 /* utf_init ********************************************************************
231
232    Initializes the utf8 subsystem.
233
234 *******************************************************************************/
235
236 void utf8_init(void)
237 {
238         TRACESUBSYSTEMINITIALIZATION("utf8_init");
239
240         /* create utf8 hashtable */
241
242         hashtable_utf = NEW(hashtable);
243
244         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
245
246 #if defined(ENABLE_STATISTICS)
247         if (opt_stat)
248                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
249 #endif
250
251         /* create utf-symbols for pointer comparison of frequently used strings */
252
253         utf_java_lang_Object           = utf_new_char("java/lang/Object");
254
255         utf_java_lang_Class            = utf_new_char("java/lang/Class");
256         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
257         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
258         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
259         utf_java_lang_String           = utf_new_char("java/lang/String");
260         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
261
262         utf_java_lang_ref_SoftReference =
263                 utf_new_char("java/lang/ref/SoftReference");
264
265         utf_java_lang_ref_WeakReference =
266                 utf_new_char("java/lang/ref/WeakReference");
267
268         utf_java_lang_ref_PhantomReference =
269                 utf_new_char("java/lang/ref/PhantomReference");
270
271         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
272
273         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
274         utf_java_lang_Error            = utf_new_char("java/lang/Error");
275
276         utf_java_lang_ClassCircularityError =
277                 utf_new_char("java/lang/ClassCircularityError");
278
279         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
280
281         utf_java_lang_ExceptionInInitializerError =
282                 utf_new_char("java/lang/ExceptionInInitializerError");
283
284         utf_java_lang_IncompatibleClassChangeError =
285                 utf_new_char("java/lang/IncompatibleClassChangeError");
286
287         utf_java_lang_InstantiationError =
288                 utf_new_char("java/lang/InstantiationError");
289
290         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
291         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
292
293         utf_java_lang_NoClassDefFoundError =
294                 utf_new_char("java/lang/NoClassDefFoundError");
295
296         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
297
298         utf_java_lang_UnsatisfiedLinkError =
299                 utf_new_char("java/lang/UnsatisfiedLinkError");
300
301         utf_java_lang_UnsupportedClassVersionError =
302                 utf_new_char("java/lang/UnsupportedClassVersionError");
303
304         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
305
306         utf_java_lang_VirtualMachineError =
307                 utf_new_char("java/lang/VirtualMachineError");
308
309 #if defined(ENABLE_JAVASE)
310         utf_java_lang_AbstractMethodError =
311                 utf_new_char("java/lang/AbstractMethodError");
312
313         utf_java_lang_NoSuchFieldError =
314                 utf_new_char("java/lang/NoSuchFieldError");
315
316         utf_java_lang_NoSuchMethodError =
317                 utf_new_char("java/lang/NoSuchMethodError");
318 #endif
319
320         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
321
322         utf_java_lang_ArithmeticException =
323                 utf_new_char("java/lang/ArithmeticException");
324
325         utf_java_lang_ArrayIndexOutOfBoundsException =
326                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
327
328         utf_java_lang_ArrayStoreException =
329                 utf_new_char("java/lang/ArrayStoreException");
330
331         utf_java_lang_ClassCastException =
332                 utf_new_char("java/lang/ClassCastException");
333
334         utf_java_lang_ClassNotFoundException =
335                 utf_new_char("java/lang/ClassNotFoundException");
336
337         utf_java_lang_CloneNotSupportedException =
338                 utf_new_char("java/lang/CloneNotSupportedException");
339
340         utf_java_lang_IllegalAccessException =
341                 utf_new_char("java/lang/IllegalAccessException");
342
343         utf_java_lang_IllegalArgumentException =
344                 utf_new_char("java/lang/IllegalArgumentException");
345
346         utf_java_lang_IllegalMonitorStateException =
347                 utf_new_char("java/lang/IllegalMonitorStateException");
348
349         utf_java_lang_InstantiationException =
350                 utf_new_char("java/lang/InstantiationException");
351
352         utf_java_lang_InterruptedException =
353                 utf_new_char("java/lang/InterruptedException");
354
355         utf_java_lang_NegativeArraySizeException =
356                 utf_new_char("java/lang/NegativeArraySizeException");
357
358         utf_java_lang_NullPointerException =
359                 utf_new_char("java/lang/NullPointerException");
360
361         utf_java_lang_RuntimeException =
362                 utf_new_char("java/lang/RuntimeException");
363
364         utf_java_lang_StringIndexOutOfBoundsException =
365                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
366
367         utf_java_lang_reflect_InvocationTargetException =
368                 utf_new_char("java/lang/reflect/InvocationTargetException");
369
370         utf_java_security_PrivilegedActionException =
371                 utf_new_char("java/security/PrivilegedActionException");
372  
373 #if defined(ENABLE_JAVASE)
374         utf_java_lang_Void             = utf_new_char("java/lang/Void");
375 #endif
376
377         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
378         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
379         utf_java_lang_Character        = utf_new_char("java/lang/Character");
380         utf_java_lang_Short            = utf_new_char("java/lang/Short");
381         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
382         utf_java_lang_Long             = utf_new_char("java/lang/Long");
383         utf_java_lang_Float            = utf_new_char("java/lang/Float");
384         utf_java_lang_Double           = utf_new_char("java/lang/Double");
385
386 #if defined(ENABLE_JAVASE)
387         utf_java_lang_StackTraceElement =
388                 utf_new_char("java/lang/StackTraceElement");
389
390         utf_java_lang_reflect_Constructor =
391                 utf_new_char("java/lang/reflect/Constructor");
392
393         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
394         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
395
396 # if defined(WITH_CLASSPATH_GNU)
397         utf_java_lang_reflect_VMConstructor = utf_new_char("java/lang/reflect/VMConstructor");
398         utf_java_lang_reflect_VMField       = utf_new_char("java/lang/reflect/VMField");
399         utf_java_lang_reflect_VMMethod      = utf_new_char("java/lang/reflect/VMMethod");
400 # endif
401
402         utf_java_util_Vector           = utf_new_char("java/util/Vector");
403 #endif
404
405         utf_InnerClasses               = utf_new_char("InnerClasses");
406         utf_ConstantValue              = utf_new_char("ConstantValue");
407         utf_Code                       = utf_new_char("Code");
408         utf_Exceptions                 = utf_new_char("Exceptions");
409         utf_LineNumberTable            = utf_new_char("LineNumberTable");
410         utf_SourceFile                 = utf_new_char("SourceFile");
411
412 #if defined(ENABLE_JAVASE)
413         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
414         utf_Signature                  = utf_new_char("Signature");
415         utf_StackMapTable              = utf_new_char("StackMapTable");
416
417 # if defined(ENABLE_ANNOTATIONS)
418         utf_RuntimeVisibleAnnotations            = utf_new_char("RuntimeVisibleAnnotations");
419         utf_RuntimeInvisibleAnnotations          = utf_new_char("RuntimeInvisibleAnnotations");
420         utf_RuntimeVisibleParameterAnnotations   = utf_new_char("RuntimeVisibleParameterAnnotations");
421         utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
422         utf_AnnotationDefault                    = utf_new_char("AnnotationDefault");
423 # endif
424 #endif
425
426         utf_init                           = utf_new_char("<init>");
427         utf_clinit                         = utf_new_char("<clinit>");
428         utf_clone                      = utf_new_char("clone");
429         utf_finalize                   = utf_new_char("finalize");
430         utf_invoke                     = utf_new_char("invoke");
431         utf_main                       = utf_new_char("main");
432         utf_run                        = utf_new_char("run");
433
434         utf_add                        = utf_new_char("add");
435         utf_remove                     = utf_new_char("remove");
436         utf_addThread                  = utf_new_char("addThread");
437         utf_removeThread               = utf_new_char("removeThread");
438         utf_put                        = utf_new_char("put");
439         utf_get                        = utf_new_char("get");
440         utf_uncaughtException          = utf_new_char("uncaughtException");
441         utf_value                      = utf_new_char("value");
442
443         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
444         utf_findNative                 = utf_new_char("findNative");
445         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
446         utf_initCause                  = utf_new_char("initCause");
447         utf_loadClass                  = utf_new_char("loadClass");
448         utf_loadClassInternal          = utf_new_char("loadClassInternal");
449         utf_printStackTrace            = utf_new_char("printStackTrace");
450
451         utf_division_by_zero           = utf_new_char("/ by zero");
452
453         utf_Z                          = utf_new_char("Z");
454         utf_B                          = utf_new_char("B");
455         utf_C                          = utf_new_char("C");
456         utf_S                          = utf_new_char("S");
457         utf_I                          = utf_new_char("I");
458         utf_J                          = utf_new_char("J");
459         utf_F                          = utf_new_char("F");
460         utf_D                          = utf_new_char("D");
461
462         utf_void__void                 = utf_new_char("()V");
463         utf_boolean__void              = utf_new_char("(Z)V");
464         utf_byte__void                 = utf_new_char("(B)V");
465         utf_char__void                 = utf_new_char("(C)V");
466         utf_short__void                = utf_new_char("(S)V");
467         utf_int__void                  = utf_new_char("(I)V");
468         utf_long__void                 = utf_new_char("(J)V");
469         utf_float__void                = utf_new_char("(F)V");
470         utf_double__void               = utf_new_char("(D)V");
471         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
472         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
473
474         utf_void__java_lang_ClassLoader =
475                 utf_new_char("()Ljava/lang/ClassLoader;");
476
477         utf_java_lang_ClassLoader_java_lang_String__J =
478                 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
479
480         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
481
482         utf_java_lang_Object__java_lang_Object =
483                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
484
485         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
486
487         utf_java_lang_String__java_lang_Class =
488                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
489
490         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
491
492         utf_java_lang_Thread_java_lang_Throwable__V =
493                 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
494
495         utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V =
496                 utf_new_char("(Ljava/lang/ThreadGroup;Ljava/lang/String;)V");
497
498         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
499
500         utf_java_lang_Throwable__java_lang_Throwable =
501                 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
502
503         utf_null                       = utf_new_char("null");
504         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
505         array_packagename              = utf_new_char("\t<the array package>");
506 }
507
508
/* utf_hashkey *****************************************************************

   Computes a cheap hashkey from a small, length-dependent sample of
   the bytes of the utf-text.  Texts of up to 8 bytes contribute every
   byte; longer texts contribute at most 9 selected bytes.  For texts
   longer than 15 bytes 3 bytes are taken from the beginning, 2 from
   the middle and 3 from the end.

   Each sampled byte is converted to u4, shifted left by a fixed amount
   and XORed into the key.  The bytes are read through a plain `char`
   pointer, so where `char` is signed a byte >= 0x80 is sign-extended by
   the conversion.

   IN:
       text.....pointer to the utf-text (at least `length` bytes)
       length...length of the text in bytes

   RETURN VALUE:
       the hashkey (0 for the empty text)

   The previous implementation built the key from expressions such as
   `fbs(0) ^ nbs(3) ^ nbs(5)`, which increment `text` several times
   without an intervening sequence point.  That is undefined behaviour
   in C.  The explicit indices used below produce the key that a
   left-to-right evaluation of those expressions yields.

*******************************************************************************/

/* No longer used by utf_hashkey(); kept unchanged only in case code outside
   this function still refers to them. */

#define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
#define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */

/* byte at index `idx` of the text, left shifted by `shift` */

#define UTF_HASHKEY_BYTE(idx, shift)    ((u4) text[(idx)] << (shift))

u4 utf_hashkey(const char *text, u4 length)
{
        u4 mid;                             /* index of the middle sample     */

        switch (length) {
        case 0: /* empty string */
                return 0;

        case 1:
                return UTF_HASHKEY_BYTE(0, 0);

        case 2:
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 3);

        case 3:
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 3)
                         ^ UTF_HASHKEY_BYTE(2, 5);

        case 4:
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 2)
                         ^ UTF_HASHKEY_BYTE(2, 4) ^ UTF_HASHKEY_BYTE(3, 6);

        case 5:
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 2)
                         ^ UTF_HASHKEY_BYTE(2, 3) ^ UTF_HASHKEY_BYTE(3, 4)
                         ^ UTF_HASHKEY_BYTE(4, 6);

        case 6:
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
                         ^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(3, 3)
                         ^ UTF_HASHKEY_BYTE(4, 5) ^ UTF_HASHKEY_BYTE(5, 6);

        case 7:
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
                         ^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(3, 3)
                         ^ UTF_HASHKEY_BYTE(4, 4) ^ UTF_HASHKEY_BYTE(5, 5)
                         ^ UTF_HASHKEY_BYTE(6, 6);

        case 8:
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
                         ^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(3, 3)
                         ^ UTF_HASHKEY_BYTE(4, 4) ^ UTF_HASHKEY_BYTE(5, 5)
                         ^ UTF_HASHKEY_BYTE(6, 6) ^ UTF_HASHKEY_BYTE(7, 7);

        case 9: /* byte 3 is skipped */
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
                         ^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(4, 4)
                         ^ UTF_HASHKEY_BYTE(5, 5) ^ UTF_HASHKEY_BYTE(6, 6)
                         ^ UTF_HASHKEY_BYTE(7, 7) ^ UTF_HASHKEY_BYTE(8, 8);

        case 10: /* bytes 1 and 5 are skipped */
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(2, 2)
                         ^ UTF_HASHKEY_BYTE(3, 3) ^ UTF_HASHKEY_BYTE(4, 4)
                         ^ UTF_HASHKEY_BYTE(6, 6) ^ UTF_HASHKEY_BYTE(7, 7)
                         ^ UTF_HASHKEY_BYTE(8, 8) ^ UTF_HASHKEY_BYTE(9, 9);

        case 11: /* bytes 1 and 5 are skipped */
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(2, 2)
                         ^ UTF_HASHKEY_BYTE(3, 3) ^ UTF_HASHKEY_BYTE(4, 4)
                         ^ UTF_HASHKEY_BYTE(6, 6) ^ UTF_HASHKEY_BYTE(7, 7)
                         ^ UTF_HASHKEY_BYTE(8, 8) ^ UTF_HASHKEY_BYTE(9, 9)
                         ^ UTF_HASHKEY_BYTE(10, 10);

        case 12: /* bytes 1, 2, 5 and 9 are skipped */
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(3, 2)
                         ^ UTF_HASHKEY_BYTE(4, 3) ^ UTF_HASHKEY_BYTE(6, 5)
                         ^ UTF_HASHKEY_BYTE(7, 6) ^ UTF_HASHKEY_BYTE(8, 7)
                         ^ UTF_HASHKEY_BYTE(10, 9) ^ UTF_HASHKEY_BYTE(11, 10);

        case 13: /* bytes 2, 5, 6, 9 and 10 are skipped */
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
                         ^ UTF_HASHKEY_BYTE(3, 3) ^ UTF_HASHKEY_BYTE(4, 4)
                         ^ UTF_HASHKEY_BYTE(7, 7) ^ UTF_HASHKEY_BYTE(8, 8)
                         ^ UTF_HASHKEY_BYTE(11, 9) ^ UTF_HASHKEY_BYTE(12, 10);

        case 14:
        case 15: /* both lengths sample the same eight bytes */
                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(3, 3)
                         ^ UTF_HASHKEY_BYTE(4, 4) ^ UTF_HASHKEY_BYTE(7, 7)
                         ^ UTF_HASHKEY_BYTE(8, 8) ^ UTF_HASHKEY_BYTE(11, 9)
                         ^ UTF_HASHKEY_BYTE(12, 10) ^ UTF_HASHKEY_BYTE(13, 11);

        default:
                mid = length / 2;

                /* 3 bytes from the beginning, 2 from the middle and 3 from
                   the end; length >= 16 keeps every index in range */

                return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(3, 3)
                         ^ UTF_HASHKEY_BYTE(4, 4)
                         ^ UTF_HASHKEY_BYTE(mid, 5)
                         ^ UTF_HASHKEY_BYTE(mid + 3, 6)
                         ^ UTF_HASHKEY_BYTE(length - 4, 7)
                         ^ UTF_HASHKEY_BYTE(length - 2, 10)
                         ^ UTF_HASHKEY_BYTE(length - 1, 11);
        }
}

#undef UTF_HASHKEY_BYTE
631
/* utf_full_hashkey ************************************************************

   Computes a hash value over every byte of the text (unlike
   utf_hashkey, which only samples a few bytes).

   The algorithm is the "One-at-a-time" algorithm as published
   by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.

   IN:
       text.....pointer to the text (read as unsigned bytes)
       length...number of bytes to hash

   RETURN VALUE:
       the hash value (0 for length 0)

*******************************************************************************/

u4 utf_full_hashkey(const char *text, u4 length)
{
        const unsigned char *bytes = (const unsigned char *) text;
        u4                   acc   = 0;
        u4                   idx;

        /* mix in one byte per round */

        for (idx = 0; idx < length; idx++) {
                acc += bytes[idx];
                acc += (acc << 10);
                acc ^= (acc >> 6);
        }

        /* final avalanche */

        acc += (acc << 3);
        acc ^= (acc >> 11);
        acc += (acc << 15);

        return acc;
}
660
661 /* unicode_hashkey *************************************************************
662
663    Compute the hashkey of a unicode string.
664
665 *******************************************************************************/
666
667 u4 unicode_hashkey(u2 *text, u2 len)
668 {
669         return utf_hashkey((char *) text, len);
670 }
671
672
673 /* utf_new *********************************************************************
674
675    Creates a new utf-symbol, the text of the symbol is passed as a
676    u1-array. The function searches the utf-hashtable for a utf-symbol
677    with this text. On success the element returned, otherwise a new
678    hashtable element is created.
679
680    If the number of entries in the hashtable exceeds twice the size of
681    the hashtable slots a reorganization of the hashtable is done and
682    the utf symbols are copied to a new hashtable with doubled size.
683
684 *******************************************************************************/
685
686 utf *utf_new(const char *text, u2 length)
687 {
688         u4 key;                             /* hashkey computed from utf-text     */
689         u4 slot;                            /* slot in hashtable                  */
690         utf *u;                             /* hashtable element                  */
691         u2 i;
692
693         LOCK_MONITOR_ENTER(hashtable_utf->header);
694
695 #if defined(ENABLE_STATISTICS)
696         if (opt_stat)
697                 count_utf_new++;
698 #endif
699
700         key  = utf_hashkey(text, length);
701         slot = key & (hashtable_utf->size - 1);
702         u    = hashtable_utf->ptr[slot];
703
704         /* search external hash chain for utf-symbol */
705
706         while (u) {
707                 if (u->blength == length) {
708                         /* compare text of hashtable elements */
709
710                         for (i = 0; i < length; i++)
711                                 if (text[i] != u->text[i])
712                                         goto nomatch;
713                         
714 #if defined(ENABLE_STATISTICS)
715                         if (opt_stat)
716                                 count_utf_new_found++;
717 #endif
718
719                         /* symbol found in hashtable */
720
721                         LOCK_MONITOR_EXIT(hashtable_utf->header);
722
723                         return u;
724                 }
725
726         nomatch:
727                 u = u->hashlink; /* next element in external chain */
728         }
729
730         /* location in hashtable found, create new utf element */
731
732         u = NEW(utf);
733
734         u->blength  = length;               /* length in bytes of utfstring       */
735         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
736         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
737
738         memcpy(u->text, text, length);      /* copy utf-text                      */
739         u->text[length] = '\0';
740
741 #if defined(ENABLE_STATISTICS)
742         if (opt_stat)
743                 count_utf_len += sizeof(utf) + length + 1;
744 #endif
745
746         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
747         hashtable_utf->entries++;           /* update number of entries           */
748
749         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
750
751         /* reorganization of hashtable, average length of the external
752            chains is approx. 2 */
753
754                 hashtable *newhash;                              /* the new hashtable */
755                 u4         i;
756                 utf       *u;
757                 utf       *nextu;
758                 u4         slot;
759
760                 /* create new hashtable, double the size */
761
762                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
763
764 #if defined(ENABLE_STATISTICS)
765                 if (opt_stat)
766                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
767 #endif
768
769                 /* transfer elements to new hashtable */
770
771                 for (i = 0; i < hashtable_utf->size; i++) {
772                         u = hashtable_utf->ptr[i];
773
774                         while (u) {
775                                 nextu = u->hashlink;
776                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
777                                                 
778                                 u->hashlink = (utf *) newhash->ptr[slot];
779                                 newhash->ptr[slot] = u;
780
781                                 /* follow link in external hash chain */
782
783                                 u = nextu;
784                         }
785                 }
786         
787                 /* dispose old table */
788
789                 hashtable_free(hashtable_utf);
790
791                 hashtable_utf = newhash;
792         }
793
794         LOCK_MONITOR_EXIT(hashtable_utf->header);
795
796         return u;
797 }
798
799
800 /* utf_new_u2 ******************************************************************
801
802    Make utf symbol from u2 array, if isclassname is true '.' is
803    replaced by '/'.
804
805 *******************************************************************************/
806
807 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
808 {
809         char *buffer;                   /* memory buffer for  unicode characters  */
810         char *pos;                      /* pointer to current position in buffer  */
811         u4 left;                        /* unicode characters left                */
812         u4 buflength;                   /* utf length in bytes of the u2 array    */
813         utf *result;                    /* resulting utf-string                   */
814         int i;          
815
816         /* determine utf length in bytes and allocate memory */
817
818         buflength = u2_utflength(unicode_pos, unicode_length); 
819         buffer    = MNEW(char, buflength);
820  
821         left = buflength;
822         pos  = buffer;
823
824         for (i = 0; i++ < unicode_length; unicode_pos++) {
825                 /* next unicode character */
826                 u2 c = *unicode_pos;
827                 
828                 if ((c != 0) && (c < 0x80)) {
829                         /* 1 character */       
830                         left--;
831                 if ((int) left < 0) break;
832                         /* convert classname */
833                         if (isclassname && c == '.')
834                                 *pos++ = '/';
835                         else
836                                 *pos++ = (char) c;
837
838                 } else if (c < 0x800) {             
839                         /* 2 characters */                              
840                 unsigned char high = c >> 6;
841                 unsigned char low  = c & 0x3F;
842                         left = left - 2;
843                 if ((int) left < 0) break;
844                 *pos++ = high | 0xC0; 
845                 *pos++ = low  | 0x80;     
846
847                 } else {         
848                 /* 3 characters */                              
849                 char low  = c & 0x3f;
850                 char mid  = (c >> 6) & 0x3F;
851                 char high = c >> 12;
852                         left = left - 3;
853                 if ((int) left < 0) break;
854                 *pos++ = high | 0xE0; 
855                 *pos++ = mid  | 0x80;  
856                 *pos++ = low  | 0x80;   
857                 }
858         }
859         
860         /* insert utf-string into symbol-table */
861         result = utf_new(buffer,buflength);
862
863         MFREE(buffer, char, buflength);
864
865         return result;
866 }
867
868
869 /* utf_new_char ****************************************************************
870
871    Creates a new utf symbol, the text for this symbol is passed as a
872    c-string ( = char* ).
873
874 *******************************************************************************/
875
876 utf *utf_new_char(const char *text)
877 {
878         return utf_new(text, strlen(text));
879 }
880
881
882 /* utf_new_char_classname ******************************************************
883
884    Creates a new utf symbol, the text for this symbol is passed as a
885    c-string ( = char* ) "." characters are going to be replaced by
886    "/". Since the above function is used often, this is a separte
887    function, instead of an if.
888
889 *******************************************************************************/
890
891 utf *utf_new_char_classname(const char *text)
892 {
893         if (strchr(text, '.')) {
894                 char *txt = strdup(text);
895                 char *end = txt + strlen(txt);
896                 char *c;
897                 utf *tmpRes;
898
899                 for (c = txt; c < end; c++)
900                         if (*c == '.') *c = '/';
901
902                 tmpRes = utf_new(txt, strlen(txt));
903                 FREE(txt, 0);
904
905                 return tmpRes;
906
907         } else
908                 return utf_new(text, strlen(text));
909 }
910
911
912 /* utf_nextu2 ******************************************************************
913
914    Read the next unicode character from the utf string and increment
915    the utf-string pointer accordingly.
916
917    CAUTION: This function is unsafe for input that was not checked 
918             by is_valid_utf!
919
920 *******************************************************************************/
921
922 u2 utf_nextu2(char **utf_ptr)
923 {
924     /* uncompressed unicode character */
925     u2 unicode_char = 0;
926     /* current position in utf text */  
927     unsigned char *utf = (unsigned char *) (*utf_ptr);
928     /* bytes representing the unicode character */
929     unsigned char ch1, ch2, ch3;
930     /* number of bytes used to represent the unicode character */
931     int len = 0;
932         
933     switch ((ch1 = utf[0]) >> 4) {
934         default: /* 1 byte */
935                 (*utf_ptr)++;
936                 return (u2) ch1;
937         case 0xC: 
938         case 0xD: /* 2 bytes */
939                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
940                         unsigned char high = ch1 & 0x1F;
941                         unsigned char low  = ch2 & 0x3F;
942                         unicode_char = (high << 6) + low;
943                         len = 2;
944                 }
945                 break;
946
947         case 0xE: /* 2 or 3 bytes */
948                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
949                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
950                                 unsigned char low  = ch3 & 0x3f;
951                                 unsigned char mid  = ch2 & 0x3f;
952                                 unsigned char high = ch1 & 0x0f;
953                                 unicode_char = (((high << 6) + mid) << 6) + low;
954                                 len = 3;
955                         } else
956                                 len = 2;                                           
957                 }
958                 break;
959     }
960
961     /* update position in utf-text */
962     *utf_ptr = (char *) (utf + len);
963
964     return unicode_char;
965 }
966
967
968 /* utf_bytes *******************************************************************
969
970    Determine number of bytes (aka. octets) in the utf string.
971
972    IN:
973       u............utf string
974
975    OUT:
976       The number of octets of this utf string.
977           There is _no_ terminating zero included in this count.
978
979 *******************************************************************************/
980
981 u4 utf_bytes(utf *u)
982 {
983         return u->blength;
984 }
985
986
987 /* utf_get_number_of_u2s_for_buffer ********************************************
988
989    Determine number of UTF-16 u2s in the given UTF-8 buffer
990
991    CAUTION: This function is unsafe for input that was not checked 
992             by is_valid_utf!
993
994    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
995    to an array of u2s (UTF-16) and want to know how many of them you will get.
996    All other uses of this function are probably wrong.
997
998    IN:
999       buffer........points to first char in buffer
1000           blength.......number of _bytes_ in the buffer
1001
1002    OUT:
1003       the number of u2s needed to hold this string in UTF-16 encoding.
1004           There is _no_ terminating zero included in this count.
1005
1006    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
1007    exception.
1008
1009 *******************************************************************************/
1010
1011 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
1012 {
1013         const char *endpos;                 /* points behind utf string           */
1014         const char *utf_ptr;                /* current position in utf text       */
1015         u4 len = 0;                         /* number of unicode characters       */
1016
1017         utf_ptr = buffer;
1018         endpos = utf_ptr + blength;
1019
1020         while (utf_ptr < endpos) {
1021                 len++;
1022                 /* next unicode character */
1023                 utf_nextu2((char **)&utf_ptr);
1024         }
1025
1026         assert(utf_ptr == endpos);
1027
1028         return len;
1029 }
1030
1031
1032 /* utf_get_number_of_u2s *******************************************************
1033
1034    Determine number of UTF-16 u2s in the utf string.
1035
1036    CAUTION: This function is unsafe for input that was not checked 
1037             by is_valid_utf!
1038
1039    CAUTION: Use this function *only* when you want to convert a utf string
1040    to an array of u2s and want to know how many of them you will get.
1041    All other uses of this function are probably wrong.
1042
1043    IN:
1044       u............utf string
1045
1046    OUT:
1047       the number of u2s needed to hold this string in UTF-16 encoding.
1048           There is _no_ terminating zero included in this count.
1049           XXX 0 if a NullPointerException has been thrown (see below)
1050
1051 *******************************************************************************/
1052
1053 u4 utf_get_number_of_u2s(utf *u)
1054 {
1055         char *endpos;                       /* points behind utf string           */
1056         char *utf_ptr;                      /* current position in utf text       */
1057         u4 len = 0;                         /* number of unicode characters       */
1058
1059         /* XXX this is probably not checked by most callers! Review this after */
1060         /* the invalid uses of this function have been eliminated */
1061         if (u == NULL) {
1062                 exceptions_throw_nullpointerexception();
1063                 return 0;
1064         }
1065
1066         endpos = UTF_END(u);
1067         utf_ptr = u->text;
1068
1069         while (utf_ptr < endpos) {
1070                 len++;
1071                 /* next unicode character */
1072                 utf_nextu2(&utf_ptr);
1073         }
1074
1075         if (utf_ptr != endpos) {
1076                 /* string ended abruptly */
1077                 exceptions_throw_internalerror("Illegal utf8 string");
1078                 return 0;
1079         }
1080
1081         return len;
1082 }
1083
1084
1085 /* utf8_safe_number_of_u2s *****************************************************
1086
1087    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1088    (For invalid UTF-8 the U+fffd replacement character will be counted.)
1089
1090    This function is safe even for invalid UTF-8 strings.
1091
1092    IN:
1093       text..........zero-terminated(!) UTF-8 string (may be invalid)
1094                         must NOT be NULL
1095           nbytes........strlen(text). (This is needed to completely emulate
1096                         the RI).
1097
1098    OUT:
1099       the number of u2s needed to hold this string in UTF-16 encoding.
1100           There is _no_ terminating zero included in this count.
1101
1102 *******************************************************************************/
1103
1104 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1105         register const unsigned char *t;
1106         register s4 byte;
1107         register s4 len;
1108         register const unsigned char *tlimit;
1109         s4 byte1;
1110         s4 byte2;
1111         s4 byte3;
1112         s4 value;
1113         s4 skip;
1114
1115         assert(text);
1116         assert(nbytes >= 0);
1117
1118         len = 0;
1119         t = (const unsigned char *) text;
1120         tlimit = t + nbytes;
1121
1122         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1123
1124         while (1) {
1125                 byte = *t++;
1126
1127                 if (byte & 0x80) {
1128                         /* highest bit set, non-ASCII character */
1129
1130                         if ((byte & 0xe0) == 0xc0) {
1131                                 /* 2-byte: should be 110..... 10...... ? */
1132
1133                                 if ((*t++ & 0xc0) == 0x80)
1134                                         ; /* valid 2-byte */
1135                                 else
1136                                         t--; /* invalid */
1137                         }
1138                         else if ((byte & 0xf0) == 0xe0) {
1139                                 /* 3-byte: should be 1110.... 10...... 10...... */
1140                                 /*                            ^t                */
1141
1142                                 if (t + 2 > tlimit)
1143                                         return len + 1; /* invalid, stop here */
1144
1145                                 if ((*t++ & 0xc0) == 0x80) {
1146                                         if ((*t++ & 0xc0) == 0x80)
1147                                                 ; /* valid 3-byte */
1148                                         else
1149                                                 t--; /* invalid */
1150                                 }
1151                                 else
1152                                         t--; /* invalid */
1153                         }
1154                         else if ((byte & 0xf8) == 0xf0) {
1155                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1156                                 /*                            ^t                         */
1157
1158                                 if (t + 3 > tlimit)
1159                                         return len + 1; /* invalid, stop here */
1160
1161                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1162                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1163                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1164                                                         /* valid 4-byte UTF-8? */
1165                                                         value = ((byte  & 0x07) << 18)
1166                                                                   | ((byte1 & 0x3f) << 12)
1167                                                                   | ((byte2 & 0x3f) <<  6)
1168                                                                   | ((byte3 & 0x3f)      );
1169
1170                                                         if (value > 0x10FFFF)
1171                                                                 ; /* invalid */
1172                                                         else if (value > 0xFFFF)
1173                                                                 len += 1; /* we need surrogates */
1174                                                         else
1175                                                                 ; /* 16bit suffice */
1176                                                 }
1177                                                 else
1178                                                         t--; /* invalid */
1179                                         }
1180                                         else
1181                                                 t--; /* invalid */
1182                                 }
1183                                 else
1184                                         t--; /* invalid */
1185                         }
1186                         else if ((byte & 0xfc) == 0xf8) {
1187                                 /* invalid 5-byte */
1188                                 if (t + 4 > tlimit)
1189                                         return len + 1; /* invalid, stop here */
1190
1191                                 skip = 4;
1192                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1193                                         t++;
1194                         }
1195                         else if ((byte & 0xfe) == 0xfc) {
1196                                 /* invalid 6-byte */
1197                                 if (t + 5 > tlimit)
1198                                         return len + 1; /* invalid, stop here */
1199
1200                                 skip = 5;
1201                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1202                                         t++;
1203                         }
1204                         else
1205                                 ; /* invalid */
1206                 }
1207                 else {
1208                         /* NUL */
1209
1210                         if (byte == 0)
1211                                 break;
1212
1213                         /* ASCII character, common case */
1214                 }
1215
1216                 len++;
1217         }
1218
1219         return len;
1220 }
1221
1222
1223 /* utf8_safe_convert_to_u2s ****************************************************
1224
1225    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1226    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1227    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1228
1229    This function is safe even for invalid UTF-8 strings.
1230
1231    IN:
1232       text..........zero-terminated(!) UTF-8 string (may be invalid)
1233                         must NOT be NULL
1234           nbytes........strlen(text). (This is needed to completely emulate
1235                                         the RI).
1236           buffer........a preallocated array of u2s to receive the decoded
1237                         string. Use utf8_safe_number_of_u2s to get the
1238                                         required number of u2s for allocating this.
1239
1240 *******************************************************************************/
1241
1242 #define UNICODE_REPLACEMENT  0xfffd
1243
1244 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1245         register const unsigned char *t;
1246         register s4 byte;
1247         register const unsigned char *tlimit;
1248         s4 byte1;
1249         s4 byte2;
1250         s4 byte3;
1251         s4 value;
1252         s4 skip;
1253
1254         assert(text);
1255         assert(nbytes >= 0);
1256
1257         t = (const unsigned char *) text;
1258         tlimit = t + nbytes;
1259
1260         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1261
1262         while (1) {
1263                 byte = *t++;
1264
1265                 if (byte & 0x80) {
1266                         /* highest bit set, non-ASCII character */
1267
1268                         if ((byte & 0xe0) == 0xc0) {
1269                                 /* 2-byte: should be 110..... 10...... */
1270
1271                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1272                                         /* valid 2-byte UTF-8 */
1273                                         *buffer++ = ((byte  & 0x1f) << 6)
1274                                                           | ((byte1 & 0x3f)     );
1275                                 }
1276                                 else {
1277                                         *buffer++ = UNICODE_REPLACEMENT;
1278                                         t--;
1279                                 }
1280                         }
1281                         else if ((byte & 0xf0) == 0xe0) {
1282                                 /* 3-byte: should be 1110.... 10...... 10...... */
1283
1284                                 if (t + 2 > tlimit) {
1285                                         *buffer++ = UNICODE_REPLACEMENT;
1286                                         return;
1287                                 }
1288
1289                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1290                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1291                                                 /* valid 3-byte UTF-8 */
1292                                                 *buffer++ = ((byte  & 0x0f) << 12)
1293                                                                   | ((byte1 & 0x3f) <<  6)
1294                                                                   | ((byte2 & 0x3f)      );
1295                                         }
1296                                         else {
1297                                                 *buffer++ = UNICODE_REPLACEMENT;
1298                                                 t--;
1299                                         }
1300                                 }
1301                                 else {
1302                                         *buffer++ = UNICODE_REPLACEMENT;
1303                                         t--;
1304                                 }
1305                         }
1306                         else if ((byte & 0xf8) == 0xf0) {
1307                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1308
1309                                 if (t + 3 > tlimit) {
1310                                         *buffer++ = UNICODE_REPLACEMENT;
1311                                         return;
1312                                 }
1313
1314                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1315                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1316                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1317                                                         /* valid 4-byte UTF-8? */
1318                                                         value = ((byte  & 0x07) << 18)
1319                                                                   | ((byte1 & 0x3f) << 12)
1320                                                                   | ((byte2 & 0x3f) <<  6)
1321                                                                   | ((byte3 & 0x3f)      );
1322
1323                                                         if (value > 0x10FFFF) {
1324                                                                 *buffer++ = UNICODE_REPLACEMENT;
1325                                                         }
1326                                                         else if (value > 0xFFFF) {
1327                                                                 /* we need surrogates */
1328                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1329                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1330                                                         }
1331                                                         else
1332                                                                 *buffer++ = value; /* 16bit suffice */
1333                                                 }
1334                                                 else {
1335                                                         *buffer++ = UNICODE_REPLACEMENT;
1336                                                         t--;
1337                                                 }
1338                                         }
1339                                         else {
1340                                                 *buffer++ = UNICODE_REPLACEMENT;
1341                                                 t--;
1342                                         }
1343                                 }
1344                                 else {
1345                                         *buffer++ = UNICODE_REPLACEMENT;
1346                                         t--;
1347                                 }
1348                         }
1349                         else if ((byte & 0xfc) == 0xf8) {
1350                                 if (t + 4 > tlimit) {
1351                                         *buffer++ = UNICODE_REPLACEMENT;
1352                                         return;
1353                                 }
1354
1355                                 skip = 4;
1356                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1357                                         t++;
1358                                 *buffer++ = UNICODE_REPLACEMENT;
1359                         }
1360                         else if ((byte & 0xfe) == 0xfc) {
1361                                 if (t + 5 > tlimit) {
1362                                         *buffer++ = UNICODE_REPLACEMENT;
1363                                         return;
1364                                 }
1365
1366                                 skip = 5;
1367                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1368                                         t++;
1369                                 *buffer++ = UNICODE_REPLACEMENT;
1370                         }
1371                         else
1372                                 *buffer++ = UNICODE_REPLACEMENT;
1373                 }
1374                 else {
1375                         /* NUL */
1376
1377                         if (byte == 0)
1378                                 break;
1379
1380                         /* ASCII character, common case */
1381
1382                         *buffer++ = byte;
1383                 }
1384         }
1385 }
1386
1387
1388 /* u2_utflength ****************************************************************
1389
1390    Returns the utf length in bytes of a u2 array.
1391
1392 *******************************************************************************/
1393
1394 u4 u2_utflength(u2 *text, u4 u2_length)
1395 {
1396         u4 result_len = 0;                  /* utf length in bytes                */
1397         u2 ch;                              /* current unicode character          */
1398         u4 len;
1399         
1400         for (len = 0; len < u2_length; len++) {
1401                 /* next unicode character */
1402                 ch = *text++;
1403           
1404                 /* determine bytes required to store unicode character as utf */
1405                 if (ch && (ch < 0x80)) 
1406                         result_len++;
1407                 else if (ch < 0x800)
1408                         result_len += 2;        
1409                 else 
1410                         result_len += 3;        
1411         }
1412
1413     return result_len;
1414 }
1415
1416
1417 /* utf_copy ********************************************************************
1418
1419    Copy the given utf string byte-for-byte to a buffer.
1420
1421    IN:
1422       buffer.......the buffer
1423           u............the utf string
1424
1425 *******************************************************************************/
1426
1427 void utf_copy(char *buffer, utf *u)
1428 {
1429         /* our utf strings are zero-terminated (done by utf_new) */
1430         MCOPY(buffer, u->text, char, u->blength + 1);
1431 }
1432
1433
1434 /* utf_cat *********************************************************************
1435
1436    Append the given utf string byte-for-byte to a buffer.
1437
1438    IN:
1439       buffer.......the buffer
1440           u............the utf string
1441
1442 *******************************************************************************/
1443
1444 void utf_cat(char *buffer, utf *u)
1445 {
1446         /* our utf strings are zero-terminated (done by utf_new) */
1447         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1448 }
1449
1450
1451 /* utf_copy_classname **********************************************************
1452
1453    Copy the given utf classname byte-for-byte to a buffer.
1454    '/' is replaced by '.'
1455
1456    IN:
1457       buffer.......the buffer
1458           u............the utf string
1459
1460 *******************************************************************************/
1461
1462 void utf_copy_classname(char *buffer, utf *u)
1463 {
1464         char *bufptr;
1465         char *srcptr;
1466         char *endptr;
1467         char ch;
1468
1469         bufptr = buffer;
1470         srcptr = u->text;
1471         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1472
1473         while (srcptr != endptr) {
1474                 ch = *srcptr++;
1475                 if (ch == '/')
1476                         ch = '.';
1477                 *bufptr++ = ch;
1478         }
1479 }
1480
1481
1482 /* utf_cat *********************************************************************
1483
1484    Append the given utf classname byte-for-byte to a buffer.
1485    '/' is replaced by '.'
1486
1487    IN:
1488       buffer.......the buffer
1489           u............the utf string
1490
1491 *******************************************************************************/
1492
1493 void utf_cat_classname(char *buffer, utf *u)
1494 {
1495         utf_copy_classname(buffer + strlen(buffer), u);
1496 }
1497
1498 /* utf_display_printable_ascii *************************************************
1499
1500    Write utf symbol to stdout (for debugging purposes).
1501    Non-printable and non-ASCII characters are printed as '?'.
1502
1503 *******************************************************************************/
1504
1505 void utf_display_printable_ascii(utf *u)
1506 {
1507         char *endpos;                       /* points behind utf string           */
1508         char *utf_ptr;                      /* current position in utf text       */
1509
1510         if (u == NULL) {
1511                 printf("NULL");
1512                 fflush(stdout);
1513                 return;
1514         }
1515
1516         endpos = UTF_END(u);
1517         utf_ptr = u->text;
1518
1519         while (utf_ptr < endpos) {
1520                 /* read next unicode character */
1521
1522                 u2 c = utf_nextu2(&utf_ptr);
1523
1524                 if ((c >= 32) && (c <= 127))
1525                         printf("%c", c);
1526                 else
1527                         printf("?");
1528         }
1529
1530         fflush(stdout);
1531 }
1532
1533
1534 /* utf_display_printable_ascii_classname ***************************************
1535
1536    Write utf symbol to stdout with `/' converted to `.' (for debugging
1537    purposes).
1538    Non-printable and non-ASCII characters are printed as '?'.
1539
1540 *******************************************************************************/
1541
1542 void utf_display_printable_ascii_classname(utf *u)
1543 {
1544         char *endpos;                       /* points behind utf string           */
1545         char *utf_ptr;                      /* current position in utf text       */
1546
1547         if (u == NULL) {
1548                 printf("NULL");
1549                 fflush(stdout);
1550                 return;
1551         }
1552
1553         endpos = UTF_END(u);
1554         utf_ptr = u->text;
1555
1556         while (utf_ptr < endpos) {
1557                 /* read next unicode character */
1558
1559                 u2 c = utf_nextu2(&utf_ptr);
1560
1561                 if (c == '/')
1562                         c = '.';
1563
1564                 if ((c >= 32) && (c <= 127))
1565                         printf("%c", c);
1566                 else
1567                         printf("?");
1568         }
1569
1570         fflush(stdout);
1571 }
1572
1573
1574 /* utf_sprint_convert_to_latin1 ************************************************
1575         
1576    Write utf symbol into c-string (for debugging purposes).
1577    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1578    invalid results.
1579
1580 *******************************************************************************/
1581
1582 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1583 {
1584         char *endpos;                       /* points behind utf string           */
1585         char *utf_ptr;                      /* current position in utf text       */
1586         u2 pos = 0;                         /* position in c-string               */
1587
1588         if (!u) {
1589                 strcpy(buffer, "NULL");
1590                 return;
1591         }
1592
1593         endpos = UTF_END(u);
1594         utf_ptr = u->text;
1595
1596         while (utf_ptr < endpos) 
1597                 /* copy next unicode character */       
1598                 buffer[pos++] = utf_nextu2(&utf_ptr);
1599
1600         /* terminate string */
1601         buffer[pos] = '\0';
1602 }
1603
1604
1605 /* utf_sprint_convert_to_latin1_classname **************************************
1606         
1607    Write utf symbol into c-string with `/' converted to `.' (for debugging
1608    purposes).
1609    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1610    invalid results.
1611
1612 *******************************************************************************/
1613
1614 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1615 {
1616         char *endpos;                       /* points behind utf string           */
1617         char *utf_ptr;                      /* current position in utf text       */
1618         u2 pos = 0;                         /* position in c-string               */
1619
1620         if (!u) {
1621                 strcpy(buffer, "NULL");
1622                 return;
1623         }
1624
1625         endpos = UTF_END(u);
1626         utf_ptr = u->text;
1627
1628         while (utf_ptr < endpos) {
1629                 /* copy next unicode character */       
1630                 u2 c = utf_nextu2(&utf_ptr);
1631                 if (c == '/') c = '.';
1632                 buffer[pos++] = c;
1633         }
1634
1635         /* terminate string */
1636         buffer[pos] = '\0';
1637 }
1638
1639
1640 /* utf_strcat_convert_to_latin1 ************************************************
1641         
1642    Like libc strcat, but uses an utf8 string.
1643    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1644    invalid results.
1645
1646 *******************************************************************************/
1647
1648 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1649 {
1650         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1651 }
1652
1653
1654 /* utf_strcat_convert_to_latin1_classname **************************************
1655         
1656    Like libc strcat, but uses an utf8 string.
1657    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1658    invalid results.
1659
1660 *******************************************************************************/
1661
1662 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1663 {
1664         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1665 }
1666
1667
1668 /* utf_fprint_printable_ascii **************************************************
1669         
1670    Write utf symbol into file.
1671    Non-printable and non-ASCII characters are printed as '?'.
1672
1673 *******************************************************************************/
1674
1675 void utf_fprint_printable_ascii(FILE *file, utf *u)
1676 {
1677         char *endpos;                       /* points behind utf string           */
1678         char *utf_ptr;                      /* current position in utf text       */
1679
1680         if (!u)
1681                 return;
1682
1683         endpos = UTF_END(u);
1684         utf_ptr = u->text;
1685
1686         while (utf_ptr < endpos) { 
1687                 /* read next unicode character */                
1688                 u2 c = utf_nextu2(&utf_ptr);                            
1689
1690                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1691                 else fprintf(file, "?");
1692         }
1693 }
1694
1695
1696 /* utf_fprint_printable_ascii_classname ****************************************
1697         
1698    Write utf symbol into file with `/' converted to `.'.
1699    Non-printable and non-ASCII characters are printed as '?'.
1700
1701 *******************************************************************************/
1702
1703 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1704 {
1705         char *endpos;                       /* points behind utf string           */
1706         char *utf_ptr;                      /* current position in utf text       */
1707
1708     if (!u)
1709                 return;
1710
1711         endpos = UTF_END(u);
1712         utf_ptr = u->text;
1713
1714         while (utf_ptr < endpos) { 
1715                 /* read next unicode character */                
1716                 u2 c = utf_nextu2(&utf_ptr);                            
1717                 if (c == '/') c = '.';
1718
1719                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1720                 else fprintf(file, "?");
1721         }
1722 }
1723
1724
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string in the form
   accepted for Java:

   - a plain 0x00 byte is rejected,
   - only one-, two- and three-byte sequences are accepted (Java
     limitation), longer sequences and stray continuation bytes are
     rejected,
   - a decoded value of zero is only accepted in its two-byte form
     (Java special),
   - overlong encodings, surrogates and the code points 0xfffe/0xffff
     are NOT rejected, as Sun Java seems to allow them in classfiles.

   An empty string is valid.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int           remaining;            /* bytes not yet consumed             */
	int           follow;               /* continuation bytes of a sequence   */
	int           k;
	char          b;                    /* byte under inspection              */
	unsigned long codepoint;            /* decoded value of a sequence        */

	if (end_pos < utf_ptr)
		return false;

	remaining = end_pos - utf_ptr;

	while (remaining-- != 0) {
		b = *utf_ptr++;

		/* 0x00 is not allowed */

		if (b == 0)
			return false;

		/* plain ASCII needs no further checks */

		if (!(b & 0x80))
			continue;

		/* Only 110x xxxx and 1110 xxxx may start a sequence: longer
		   forms exceed the Java limitation and everything else is an
		   invalid leading byte. */

		if ((b & 0xe0) == 0xc0)
			follow = 1;
		else if ((b & 0xf0) == 0xe0)
			follow = 2;
		else
			return false;

		/* payload bits of the leading byte */

		codepoint = (unsigned long) b & (0x3f >> follow);

		/* all continuation bytes must lie inside the string */

		remaining -= follow;

		if (remaining < 0)
			return false;

		for (k = 0; k < follow; k++) {
			b = *utf_ptr++;

			if ((b & 0xc0) != 0x80)         /* 10xx xxxx */
				return false;

			codepoint = (codepoint << 6) | (b & 0x3f);
		}

		/* zero may only be written as the two-byte sequence */

		if ((codepoint == 0) && (follow != 1))
			return false;
	}

	return true;
}
1790
1791
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters below 0x20 and the two-byte encoding of zero.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Only bytes inside [utf_ptr, end_pos) are inspected; the result never
   depends on the byte stored at end_pos.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	/* disallow empty names */

	if (end_pos <= utf_ptr)
		return false;

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		/* disallow control characters */

		if (c < 0x20)
			return false;

		/* Disallow zero, which is encoded as 0xc0 0x80.  The second
		   byte is only read if it still belongs to the string, so a
		   trailing 0xc0 does not cause a read behind the end. */

		if ((c == 0xc0) && (utf_ptr < end_pos) &&
			((unsigned char) *utf_ptr == 0x80))
			return false;
	}

	return true;
}
1819
1820 bool is_valid_name_utf(utf *u)
1821 {
1822         return is_valid_name(u->text, UTF_END(u));
1823 }
1824
1825
1826 /* utf_show ********************************************************************
1827
1828    Writes the utf symbols in the utfhash to stdout and displays the
1829    number of external hash chains grouped according to the chainlength
1830    (for debugging purposes).
1831
1832 *******************************************************************************/
1833
1834 #if !defined(NDEBUG)
1835 void utf_show(void)
1836 {
1837
1838 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1839
1840         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1841         u4 max_chainlength = 0;      /* maximum length of the chains */
1842         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1843         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1844         u4 i;
1845
1846         printf("UTF-HASH:\n");
1847
1848         /* show element of utf-hashtable */
1849
1850         for (i = 0; i < hashtable_utf->size; i++) {
1851                 utf *u = hashtable_utf->ptr[i];
1852
1853                 if (u) {
1854                         printf("SLOT %d: ", (int) i);
1855
1856                         while (u) {
1857                                 printf("'");
1858                                 utf_display_printable_ascii(u);
1859                                 printf("' ");
1860                                 u = u->hashlink;
1861                         }       
1862                         printf("\n");
1863                 }
1864         }
1865
1866         printf("UTF-HASH: %d slots for %d entries\n", 
1867                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1868
1869         if (hashtable_utf->entries == 0)
1870                 return;
1871
1872         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1873
1874         for (i=0;i<CHAIN_LIMIT;i++)
1875                 chain_count[i]=0;
1876
1877         /* count numbers of hashchains according to their length */
1878         for (i=0; i<hashtable_utf->size; i++) {
1879                   
1880                 utf *u = (utf*) hashtable_utf->ptr[i];
1881                 u4 chain_length = 0;
1882
1883                 /* determine chainlength */
1884                 while (u) {
1885                         u = u->hashlink;
1886                         chain_length++;
1887                 }
1888
1889                 /* update sum of all chainlengths */
1890                 sum_chainlength+=chain_length;
1891
1892                 /* determine the maximum length of the chains */
1893                 if (chain_length>max_chainlength)
1894                         max_chainlength = chain_length;
1895
1896                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1897                 if (chain_length>=CHAIN_LIMIT) {
1898                         beyond_limit+=chain_length;
1899                         chain_length=CHAIN_LIMIT-1;
1900                 }
1901
1902                 /* update number of hashchains of current length */
1903                 chain_count[chain_length]++;
1904         }
1905
1906         /* display results */  
1907         for (i=1;i<CHAIN_LIMIT-1;i++) 
1908                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1909           
1910         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1911
1912
1913         printf("max. chainlength:%5d\n",max_chainlength);
1914
1915         /* avg. chainlength = sum of chainlengths / number of chains */
1916         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1917 }
1918 #endif /* !defined(NDEBUG) */
1919
1920
1921 /*
1922  * These are local overrides for various environment variables in Emacs.
1923  * Please do not remove this and leave it at the end of the file, where
1924  * Emacs will automagically detect them.
1925  * ---------------------------------------------------------------------
1926  * Local variables:
1927  * mode: c
1928  * indent-tabs-mode: t
1929  * c-basic-offset: 4
1930  * tab-width: 4
1931  * End:
1932  * vim:noexpandtab:sw=4:ts=4:
1933  */