09d691aed748ce3456591cae66015eee3ec9cb90
[cacao.git] / src / vmcore / utf8.c
1 /* src/vmcore/utf8.c - utf8 string functions
2
3    Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6    J. Wenninger, Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23    02110-1301, USA.
24
25 */
26
27
28 #include "config.h"
29
30 #include <string.h>
31 #include <assert.h>
32
33 #include "vm/types.h"
34
35 #include "mm/memory.h"
36
37 #include "threads/lock-common.h"
38
39 #include "toolbox/hashtable.h"
40
41 #include "vm/exceptions.h"
42
43 #include "vmcore/options.h"
44
45 #if defined(ENABLE_STATISTICS)
46 # include "vmcore/statistics.h"
47 #endif
48
49 #include "vmcore/utf8.h"
50
51
52 /* global variables ***********************************************************/
53
54 /* hashsize must be power of 2 */
55
56 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
57
58 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
59
60
61 /* utf-symbols for pointer comparison of frequently used strings **************/
62
63 utf *utf_java_lang_Object;
64
65 utf *utf_java_lang_Class;
66 utf *utf_java_lang_ClassLoader;
67 utf *utf_java_lang_Cloneable;
68 utf *utf_java_lang_SecurityManager;
69 utf *utf_java_lang_String;
70 utf *utf_java_lang_System;
71 utf *utf_java_lang_ThreadGroup;
72 utf *utf_java_lang_ref_SoftReference;
73 utf *utf_java_lang_ref_WeakReference;
74 utf *utf_java_lang_ref_PhantomReference;
75 utf *utf_java_io_Serializable;
76
77 utf *utf_java_lang_Throwable;
78 utf *utf_java_lang_Error;
79
80 utf *utf_java_lang_AbstractMethodError;
81 utf *utf_java_lang_ClassCircularityError;
82 utf *utf_java_lang_ClassFormatError;
83 utf *utf_java_lang_ExceptionInInitializerError;
84 utf *utf_java_lang_IncompatibleClassChangeError;
85 utf *utf_java_lang_InstantiationError;
86 utf *utf_java_lang_InternalError;
87 utf *utf_java_lang_LinkageError;
88 utf *utf_java_lang_NoClassDefFoundError;
89 utf *utf_java_lang_NoSuchFieldError;
90 utf *utf_java_lang_NoSuchMethodError;
91 utf *utf_java_lang_OutOfMemoryError;
92 utf *utf_java_lang_UnsatisfiedLinkError;
93 utf *utf_java_lang_UnsupportedClassVersionError;
94 utf *utf_java_lang_VerifyError;
95 utf *utf_java_lang_VirtualMachineError;
96
97 #if defined(WITH_CLASSPATH_GNU)
98 utf *utf_java_lang_VMThrowable;
99 #endif
100
101 utf *utf_java_lang_Exception;
102
103 utf *utf_java_lang_ArithmeticException;
104 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
105 utf *utf_java_lang_ArrayStoreException;
106 utf *utf_java_lang_ClassCastException;
107 utf *utf_java_lang_ClassNotFoundException;
108 utf *utf_java_lang_CloneNotSupportedException;
109 utf *utf_java_lang_IllegalAccessException;
110 utf *utf_java_lang_IllegalArgumentException;
111 utf *utf_java_lang_IllegalMonitorStateException;
112 utf *utf_java_lang_InstantiationException;
113 utf *utf_java_lang_InterruptedException;
114 utf *utf_java_lang_NegativeArraySizeException;
115 utf *utf_java_lang_NullPointerException;
116 utf *utf_java_lang_StringIndexOutOfBoundsException;
117
118 utf *utf_java_lang_reflect_InvocationTargetException;
119
120 utf *utf_java_security_PrivilegedActionException;
121
122 #if defined(ENABLE_JAVASE)
123 utf* utf_java_lang_Void;
124 #endif
125
126 utf* utf_java_lang_Boolean;
127 utf* utf_java_lang_Byte;
128 utf* utf_java_lang_Character;
129 utf* utf_java_lang_Short;
130 utf* utf_java_lang_Integer;
131 utf* utf_java_lang_Long;
132 utf* utf_java_lang_Float;
133 utf* utf_java_lang_Double;
134
135 #if defined(ENABLE_JAVASE)
136 utf *utf_java_lang_StackTraceElement;
137 utf *utf_java_lang_reflect_Constructor;
138 utf *utf_java_lang_reflect_Field;
139 utf *utf_java_lang_reflect_Method;
140 utf *utf_java_util_Vector;
141 #endif
142
143 utf *utf_InnerClasses;                  /* InnerClasses                       */
144 utf *utf_ConstantValue;                 /* ConstantValue                      */
145 utf *utf_Code;                          /* Code                               */
146 utf *utf_Exceptions;                    /* Exceptions                         */
147 utf *utf_LineNumberTable;               /* LineNumberTable                    */
148 utf *utf_SourceFile;                    /* SourceFile                         */
149
150 #if defined(ENABLE_JAVASE)
151 utf *utf_EnclosingMethod;
152 utf *utf_Signature;
153 utf *utf_StackMapTable;
154
155 #if defined(ENABLE_ANNOTATIONS)
156 utf *utf_RuntimeVisibleAnnotations;            /* RuntimeVisibleAnnotations            */
157 utf *utf_RuntimeInvisibleAnnotations;          /* RuntimeInvisibleAnnotations          */
158 utf *utf_RuntimeVisibleParameterAnnotations;   /* RuntimeVisibleParameterAnnotations   */
159 utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
160 utf *utf_AnnotationDefault;                    /* AnnotationDefault                    */
161 #endif
162 #endif
163
164 utf *utf_init;                          /* <init>                             */
165 utf *utf_clinit;                        /* <clinit>                           */
166 utf *utf_clone;                         /* clone                              */
167 utf *utf_finalize;                      /* finalize                           */
168 utf *utf_run;                           /* run                                */
169
170 utf *utf_add;
171 utf *utf_remove;
172 utf *utf_addThread;
173 utf *utf_removeThread;
174 utf *utf_put;
175 utf *utf_get;
176 utf *utf_uncaughtException;
177 utf *utf_value;
178
179 utf *utf_fillInStackTrace;
180 utf *utf_findNative;
181 utf *utf_getSystemClassLoader;
182 utf *utf_initCause;
183 utf *utf_loadClass;
184 utf *utf_loadClassInternal;
185 utf *utf_printStackTrace;
186
187 utf *utf_division_by_zero;
188
189 utf *utf_Z;                             /* Z                                  */
190 utf *utf_B;                             /* B                                  */
191 utf *utf_C;                             /* C                                  */
192 utf *utf_S;                             /* S                                  */
193 utf *utf_I;                             /* I                                  */
194 utf *utf_J;                             /* J                                  */
195 utf *utf_F;                             /* F                                  */
196 utf *utf_D;                             /* D                                  */
197
198 utf *utf_void__void;                    /* ()V                                */
199 utf *utf_boolean__void;                 /* (Z)V                               */
200 utf *utf_byte__void;                    /* (B)V                               */
201 utf *utf_char__void;                    /* (C)V                               */
202 utf *utf_short__void;                   /* (S)V                               */
203 utf *utf_int__void;                     /* (I)V                               */
204 utf *utf_long__void;                    /* (J)V                               */
205 utf *utf_float__void;                   /* (F)V                               */
206 utf *utf_double__void;                  /* (D)V                               */
207
208 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
209 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
210 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
211 utf *utf_java_lang_ClassLoader_java_lang_String__J;
212 utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
213 utf *utf_java_lang_Object__java_lang_Object;
214 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
215 utf *utf_java_lang_String__java_lang_Class;
216 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
217 utf *utf_java_lang_Thread_java_lang_Throwable__V;
218 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
219 utf *utf_java_lang_Throwable__java_lang_Throwable;
220
221 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
222 utf *utf_null;
223 utf *array_packagename;
224
225
226 /* utf_init ********************************************************************
227
228    Initializes the utf8 subsystem.
229
230 *******************************************************************************/
231
232 bool utf8_init(void)
233 {
234         /* create utf8 hashtable */
235
236         hashtable_utf = NEW(hashtable);
237
238         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
239
240 #if defined(ENABLE_STATISTICS)
241         if (opt_stat)
242                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
243 #endif
244
245         /* create utf-symbols for pointer comparison of frequently used strings */
246
247         utf_java_lang_Object           = utf_new_char("java/lang/Object");
248
249         utf_java_lang_Class            = utf_new_char("java/lang/Class");
250         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
251         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
252         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
253         utf_java_lang_String           = utf_new_char("java/lang/String");
254         utf_java_lang_System           = utf_new_char("java/lang/System");
255         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
256
257         utf_java_lang_ref_SoftReference =
258                 utf_new_char("java/lang/ref/SoftReference");
259
260         utf_java_lang_ref_WeakReference =
261                 utf_new_char("java/lang/ref/WeakReference");
262
263         utf_java_lang_ref_PhantomReference =
264                 utf_new_char("java/lang/ref/PhantomReference");
265
266         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
267
268         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
269         utf_java_lang_Error            = utf_new_char("java/lang/Error");
270
271         utf_java_lang_ClassCircularityError =
272                 utf_new_char("java/lang/ClassCircularityError");
273
274         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
275
276         utf_java_lang_ExceptionInInitializerError =
277                 utf_new_char("java/lang/ExceptionInInitializerError");
278
279         utf_java_lang_IncompatibleClassChangeError =
280                 utf_new_char("java/lang/IncompatibleClassChangeError");
281
282         utf_java_lang_InstantiationError =
283                 utf_new_char("java/lang/InstantiationError");
284
285         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
286         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
287
288         utf_java_lang_NoClassDefFoundError =
289                 utf_new_char("java/lang/NoClassDefFoundError");
290
291         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
292
293         utf_java_lang_UnsatisfiedLinkError =
294                 utf_new_char("java/lang/UnsatisfiedLinkError");
295
296         utf_java_lang_UnsupportedClassVersionError =
297                 utf_new_char("java/lang/UnsupportedClassVersionError");
298
299         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
300
301         utf_java_lang_VirtualMachineError =
302                 utf_new_char("java/lang/VirtualMachineError");
303
304 #if defined(ENABLE_JAVASE)
305         utf_java_lang_AbstractMethodError =
306                 utf_new_char("java/lang/AbstractMethodError");
307
308         utf_java_lang_NoSuchFieldError =
309                 utf_new_char("java/lang/NoSuchFieldError");
310
311         utf_java_lang_NoSuchMethodError =
312                 utf_new_char("java/lang/NoSuchMethodError");
313 #endif
314
315 #if defined(WITH_CLASSPATH_GNU)
316         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
317 #endif
318
319         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
320
321         utf_java_lang_ArithmeticException =
322                 utf_new_char("java/lang/ArithmeticException");
323
324         utf_java_lang_ArrayIndexOutOfBoundsException =
325                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
326
327         utf_java_lang_ArrayStoreException =
328                 utf_new_char("java/lang/ArrayStoreException");
329
330         utf_java_lang_ClassCastException =
331                 utf_new_char("java/lang/ClassCastException");
332
333         utf_java_lang_ClassNotFoundException =
334                 utf_new_char("java/lang/ClassNotFoundException");
335
336         utf_java_lang_CloneNotSupportedException =
337                 utf_new_char("java/lang/CloneNotSupportedException");
338
339         utf_java_lang_IllegalAccessException =
340                 utf_new_char("java/lang/IllegalAccessException");
341
342         utf_java_lang_IllegalArgumentException =
343                 utf_new_char("java/lang/IllegalArgumentException");
344
345         utf_java_lang_IllegalMonitorStateException =
346                 utf_new_char("java/lang/IllegalMonitorStateException");
347
348         utf_java_lang_InstantiationException =
349                 utf_new_char("java/lang/InstantiationException");
350
351         utf_java_lang_InterruptedException =
352                 utf_new_char("java/lang/InterruptedException");
353
354         utf_java_lang_NegativeArraySizeException =
355                 utf_new_char("java/lang/NegativeArraySizeException");
356
357         utf_java_lang_NullPointerException =
358                 utf_new_char("java/lang/NullPointerException");
359
360         utf_java_lang_StringIndexOutOfBoundsException =
361                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
362
363         utf_java_lang_reflect_InvocationTargetException =
364                 utf_new_char("java/lang/reflect/InvocationTargetException");
365
366         utf_java_security_PrivilegedActionException =
367                 utf_new_char("java/security/PrivilegedActionException");
368  
369 #if defined(ENABLE_JAVASE)
370         utf_java_lang_Void             = utf_new_char("java/lang/Void");
371 #endif
372
373         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
374         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
375         utf_java_lang_Character        = utf_new_char("java/lang/Character");
376         utf_java_lang_Short            = utf_new_char("java/lang/Short");
377         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
378         utf_java_lang_Long             = utf_new_char("java/lang/Long");
379         utf_java_lang_Float            = utf_new_char("java/lang/Float");
380         utf_java_lang_Double           = utf_new_char("java/lang/Double");
381
382 #if defined(ENABLE_JAVASE)
383         utf_java_lang_StackTraceElement =
384                 utf_new_char("java/lang/StackTraceElement");
385
386         utf_java_lang_reflect_Constructor =
387                 utf_new_char("java/lang/reflect/Constructor");
388
389         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
390         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
391         utf_java_util_Vector           = utf_new_char("java/util/Vector");
392 #endif
393
394         utf_InnerClasses               = utf_new_char("InnerClasses");
395         utf_ConstantValue              = utf_new_char("ConstantValue");
396         utf_Code                       = utf_new_char("Code");
397         utf_Exceptions                 = utf_new_char("Exceptions");
398         utf_LineNumberTable            = utf_new_char("LineNumberTable");
399         utf_SourceFile                 = utf_new_char("SourceFile");
400
401 #if defined(ENABLE_JAVASE)
402         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
403         utf_Signature                  = utf_new_char("Signature");
404         utf_StackMapTable              = utf_new_char("StackMapTable");
405
406 #if defined(ENABLE_ANNOTATIONS)
407         utf_RuntimeVisibleAnnotations            = utf_new_char("RuntimeVisibleAnnotations");
408         utf_RuntimeInvisibleAnnotations          = utf_new_char("RuntimeInvisibleAnnotations");
409         utf_RuntimeVisibleParameterAnnotations   = utf_new_char("RuntimeVisibleParameterAnnotations");
410         utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
411         utf_AnnotationDefault                    = utf_new_char("AnnotationDefault");
412 #endif
413 #endif
414
415         utf_init                           = utf_new_char("<init>");
416         utf_clinit                         = utf_new_char("<clinit>");
417         utf_clone                      = utf_new_char("clone");
418         utf_finalize                   = utf_new_char("finalize");
419         utf_run                        = utf_new_char("run");
420
421         utf_add                        = utf_new_char("add");
422         utf_remove                     = utf_new_char("remove");
423         utf_addThread                  = utf_new_char("addThread");
424         utf_removeThread               = utf_new_char("removeThread");
425         utf_put                        = utf_new_char("put");
426         utf_get                        = utf_new_char("get");
427         utf_uncaughtException          = utf_new_char("uncaughtException");
428         utf_value                      = utf_new_char("value");
429
430         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
431         utf_findNative                 = utf_new_char("findNative");
432         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
433         utf_initCause                  = utf_new_char("initCause");
434         utf_loadClass                  = utf_new_char("loadClass");
435         utf_loadClassInternal          = utf_new_char("loadClassInternal");
436         utf_printStackTrace            = utf_new_char("printStackTrace");
437
438         utf_division_by_zero           = utf_new_char("/ by zero");
439
440         utf_Z                          = utf_new_char("Z");
441         utf_B                          = utf_new_char("B");
442         utf_C                          = utf_new_char("C");
443         utf_S                          = utf_new_char("S");
444         utf_I                          = utf_new_char("I");
445         utf_J                          = utf_new_char("J");
446         utf_F                          = utf_new_char("F");
447         utf_D                          = utf_new_char("D");
448
449         utf_void__void                 = utf_new_char("()V");
450         utf_boolean__void              = utf_new_char("(Z)V");
451         utf_byte__void                 = utf_new_char("(B)V");
452         utf_char__void                 = utf_new_char("(C)V");
453         utf_short__void                = utf_new_char("(S)V");
454         utf_int__void                  = utf_new_char("(I)V");
455         utf_long__void                 = utf_new_char("(J)V");
456         utf_float__void                = utf_new_char("(F)V");
457         utf_double__void               = utf_new_char("(D)V");
458         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
459         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
460
461         utf_void__java_lang_ClassLoader =
462                 utf_new_char("()Ljava/lang/ClassLoader;");
463
464         utf_java_lang_ClassLoader_java_lang_String__J =
465                 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
466
467         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
468
469         utf_java_lang_Object__java_lang_Object =
470                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
471
472         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
473
474         utf_java_lang_String__java_lang_Class =
475                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
476
477         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
478
479         utf_java_lang_Thread_java_lang_Throwable__V =
480                 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
481
482         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
483
484         utf_java_lang_Throwable__java_lang_Throwable =
485                 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
486
487         utf_null                       = utf_new_char("null");
488         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
489         array_packagename              = utf_new_char("\t<the array package>");
490
491         /* everything's ok */
492
493         return true;
494 }
495
496
/* utf_hashkey *****************************************************************

   Computes the hashkey of a utf-text from a fixed selection of its
   bytes (at most 9 of them, depending on the length).  Each selected
   byte is shifted left by a position-specific amount and the results
   are XORed together.

   For texts longer than 15 bytes, 3 bytes are taken from the
   beginning (indices 0, 3, 4), 2 from the middle (length/2 and
   length/2 + 3) and 3 from the end (length-4, length-2, length-1).

   IN:
       text.....pointer to the utf text; must provide `length` bytes
       length...length of the text in bytes

   RETURN VALUE:
       the hashkey; 0 for an empty text

   NOTE: Earlier versions built each key from expressions such as
   `fbs(0) ^ nbs(3) ^ nbs(5)`, which read `text` and apply several
   `++text` in one expression without any sequencing between them.
   That is undefined behaviour in C.  The function now addresses every
   byte by an explicit index; the indices and shift amounts reproduce
   the left-to-right reading of the old expressions.

*******************************************************************************/

/* Legacy byte-fetch macros.  utf_hashkey() no longer uses them; they are kept
   only so that any other code in this file that refers to them still builds.
   Never combine two of them (or nbs with any other use of `text`) in a single
   expression: the modification of `text` is unsequenced there. */

#define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
#define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */

/* Returns the byte text[index], converted to u4 with the same cast the legacy
   macros use, shifted left by `shift` bits. */

static u4 utf_hashkey_byte(const char *text, u4 index, u4 shift)
{
	return (u4) text[index] << shift;
}

u4 utf_hashkey(const char *text, u4 length)
{
	u4 mid;                             /* index of the first "middle" byte   */

	switch (length) {
	case 0: /* empty string */
		return 0;

	case 1:
		return utf_hashkey_byte(text, 0, 0);

	case 2:
		return utf_hashkey_byte(text, 0, 0)
			^ utf_hashkey_byte(text, 1, 3);

	case 3:
		return utf_hashkey_byte(text, 0, 0)
			^ utf_hashkey_byte(text, 1, 3)
			^ utf_hashkey_byte(text, 2, 5);

	case 4:
		return utf_hashkey_byte(text, 0, 0)
			^ utf_hashkey_byte(text, 1, 2)
			^ utf_hashkey_byte(text, 2, 4)
			^ utf_hashkey_byte(text, 3, 6);

	case 5:
		return utf_hashkey_byte(text, 0, 0)
			^ utf_hashkey_byte(text, 1, 2)
			^ utf_hashkey_byte(text, 2, 3)
			^ utf_hashkey_byte(text, 3, 4)
			^ utf_hashkey_byte(text, 4, 6);

	case 6:
		return utf_hashkey_byte(text, 0, 0)
			^ utf_hashkey_byte(text, 1, 1)
			^ utf_hashkey_byte(text, 2, 2)
			^ utf_hashkey_byte(text, 3, 3)
			^ utf_hashkey_byte(text, 4, 5)
			^ utf_hashkey_byte(text, 5, 6);

	case 7:
		return utf_hashkey_byte(text, 0, 0)
			^ utf_hashkey_byte(text, 1, 1)
			^ utf_hashkey_byte(text, 2, 2)
			^ utf_hashkey_byte(text, 3, 3)
			^ utf_hashkey_byte(text, 4, 4)
			^ utf_hashkey_byte(text, 5, 5)
			^ utf_hashkey_byte(text, 6, 6);

	case 8:
		return utf_hashkey_byte(text, 0, 0)
			^ utf_hashkey_byte(text, 1, 1)
			^ utf_hashkey_byte(text, 2, 2)
			^ utf_hashkey_byte(text, 3, 3)
			^ utf_hashkey_byte(text, 4, 4)
			^ utf_hashkey_byte(text, 5, 5)
			^ utf_hashkey_byte(text, 6, 6)
			^ utf_hashkey_byte(text, 7, 7);

	case 9: /* byte 3 is skipped */
		return utf_hashkey_byte(text, 0, 0)
			^ utf_hashkey_byte(text, 1, 1)
			^ utf_hashkey_byte(text, 2, 2)
			^ utf_hashkey_byte(text, 4, 4)
			^ utf_hashkey_byte(text, 5, 5)
			^ utf_hashkey_byte(text, 6, 6)
			^ utf_hashkey_byte(text, 7, 7)
			^ utf_hashkey_byte(text, 8, 8);

	case 10: /* bytes 1 and 5 are skipped */
		return utf_hashkey_byte(text, 0, 0)
			^ utf_hashkey_byte(text, 2, 2)
			^ utf_hashkey_byte(text, 3, 3)
			^ utf_hashkey_byte(text, 4, 4)
			^ utf_hashkey_byte(text, 6, 6)
			^ utf_hashkey_byte(text, 7, 7)
			^ utf_hashkey_byte(text, 8, 8)
			^ utf_hashkey_byte(text, 9, 9);

	case 11: /* bytes 1 and 5 are skipped */
		return utf_hashkey_byte(text, 0, 0)
			^ utf_hashkey_byte(text, 2, 2)
			^ utf_hashkey_byte(text, 3, 3)
			^ utf_hashkey_byte(text, 4, 4)
			^ utf_hashkey_byte(text, 6, 6)
			^ utf_hashkey_byte(text, 7, 7)
			^ utf_hashkey_byte(text, 8, 8)
			^ utf_hashkey_byte(text, 9, 9)
			^ utf_hashkey_byte(text, 10, 10);

	case 12: /* here the shift amount is one less than the byte index */
		return utf_hashkey_byte(text, 0, 0)
			^ utf_hashkey_byte(text, 3, 2)
			^ utf_hashkey_byte(text, 4, 3)
			^ utf_hashkey_byte(text, 6, 5)
			^ utf_hashkey_byte(text, 7, 6)
			^ utf_hashkey_byte(text, 8, 7)
			^ utf_hashkey_byte(text, 10, 9)
			^ utf_hashkey_byte(text, 11, 10);

	case 13:
		return utf_hashkey_byte(text, 0, 0)
			^ utf_hashkey_byte(text, 1, 1)
			^ utf_hashkey_byte(text, 3, 3)
			^ utf_hashkey_byte(text, 4, 4)
			^ utf_hashkey_byte(text, 7, 7)
			^ utf_hashkey_byte(text, 8, 8)
			^ utf_hashkey_byte(text, 11, 9)
			^ utf_hashkey_byte(text, 12, 10);

	case 14:
	case 15: /* same selection as length 14; byte 14 is not used */
		return utf_hashkey_byte(text, 0, 0)
			^ utf_hashkey_byte(text, 3, 3)
			^ utf_hashkey_byte(text, 4, 4)
			^ utf_hashkey_byte(text, 7, 7)
			^ utf_hashkey_byte(text, 8, 8)
			^ utf_hashkey_byte(text, 11, 9)
			^ utf_hashkey_byte(text, 12, 10)
			^ utf_hashkey_byte(text, 13, 11);

	default:
		mid = length / 2;

		/* 3 bytes from the beginning, 2 from the middle, 3 from the end */

		return utf_hashkey_byte(text, 0, 0)
			^ utf_hashkey_byte(text, 3, 3)
			^ utf_hashkey_byte(text, 4, 4)
			^ utf_hashkey_byte(text, mid, 5)
			^ utf_hashkey_byte(text, mid + 3, 6)
			^ utf_hashkey_byte(text, length - 4, 7)
			^ utf_hashkey_byte(text, length - 2, 10)
			^ utf_hashkey_byte(text, length - 1, 11);
	}
}
619
/* utf_full_hashkey ************************************************************

   Computes a hash value in which every byte of the text takes part.

   The algorithm is the "One-at-a-time" algorithm as published
   by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.

   IN:
       text.....pointer to the text; its bytes are read as unsigned
       length...number of bytes to hash

   RETURN VALUE:
       the hash value; 0 for an empty text

*******************************************************************************/

u4 utf_full_hashkey(const char *text, u4 length)
{
	const unsigned char *bytes = (const unsigned char *) text;
	u4 acc = 0;
	u4 pos;

	/* mixing step, once per input byte */

	for (pos = 0; pos < length; pos++) {
		acc += bytes[pos];
		acc += acc << 10;
		acc ^= acc >> 6;
	}

	/* final avalanche */

	acc += acc << 3;
	acc ^= acc >> 11;
	acc += acc << 15;

	return acc;
}
648
649 /* unicode_hashkey *************************************************************
650
651    Compute the hashkey of a unicode string.
652
653 *******************************************************************************/
654
655 u4 unicode_hashkey(u2 *text, u2 len)
656 {
657         return utf_hashkey((char *) text, len);
658 }
659
660
661 /* utf_new *********************************************************************
662
663    Creates a new utf-symbol, the text of the symbol is passed as a
664    u1-array. The function searches the utf-hashtable for a utf-symbol
665    with this text. On success the element returned, otherwise a new
666    hashtable element is created.
667
668    If the number of entries in the hashtable exceeds twice the size of
669    the hashtable slots a reorganization of the hashtable is done and
670    the utf symbols are copied to a new hashtable with doubled size.
671
672 *******************************************************************************/
673
674 utf *utf_new(const char *text, u2 length)
675 {
676         u4 key;                             /* hashkey computed from utf-text     */
677         u4 slot;                            /* slot in hashtable                  */
678         utf *u;                             /* hashtable element                  */
679         u2 i;
680
681         LOCK_MONITOR_ENTER(hashtable_utf->header);
682
683 #if defined(ENABLE_STATISTICS)
684         if (opt_stat)
685                 count_utf_new++;
686 #endif
687
688         key  = utf_hashkey(text, length);
689         slot = key & (hashtable_utf->size - 1);
690         u    = hashtable_utf->ptr[slot];
691
692         /* search external hash chain for utf-symbol */
693
694         while (u) {
695                 if (u->blength == length) {
696                         /* compare text of hashtable elements */
697
698                         for (i = 0; i < length; i++)
699                                 if (text[i] != u->text[i])
700                                         goto nomatch;
701                         
702 #if defined(ENABLE_STATISTICS)
703                         if (opt_stat)
704                                 count_utf_new_found++;
705 #endif
706
707                         /* symbol found in hashtable */
708
709                         LOCK_MONITOR_EXIT(hashtable_utf->header);
710
711                         return u;
712                 }
713
714         nomatch:
715                 u = u->hashlink; /* next element in external chain */
716         }
717
718         /* location in hashtable found, create new utf element */
719
720         u = NEW(utf);
721
722         u->blength  = length;               /* length in bytes of utfstring       */
723         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
724         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
725
726         memcpy(u->text, text, length);      /* copy utf-text                      */
727         u->text[length] = '\0';
728
729 #if defined(ENABLE_STATISTICS)
730         if (opt_stat)
731                 count_utf_len += sizeof(utf) + length + 1;
732 #endif
733
734         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
735         hashtable_utf->entries++;           /* update number of entries           */
736
737         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
738
739         /* reorganization of hashtable, average length of the external
740            chains is approx. 2 */
741
742                 hashtable *newhash;                              /* the new hashtable */
743                 u4         i;
744                 utf       *u;
745                 utf       *nextu;
746                 u4         slot;
747
748                 /* create new hashtable, double the size */
749
750                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
751
752 #if defined(ENABLE_STATISTICS)
753                 if (opt_stat)
754                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
755 #endif
756
757                 /* transfer elements to new hashtable */
758
759                 for (i = 0; i < hashtable_utf->size; i++) {
760                         u = hashtable_utf->ptr[i];
761
762                         while (u) {
763                                 nextu = u->hashlink;
764                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
765                                                 
766                                 u->hashlink = (utf *) newhash->ptr[slot];
767                                 newhash->ptr[slot] = u;
768
769                                 /* follow link in external hash chain */
770
771                                 u = nextu;
772                         }
773                 }
774         
775                 /* dispose old table */
776
777                 hashtable_free(hashtable_utf);
778
779                 hashtable_utf = newhash;
780         }
781
782         LOCK_MONITOR_EXIT(hashtable_utf->header);
783
784         return u;
785 }
786
787
788 /* utf_new_u2 ******************************************************************
789
790    Make utf symbol from u2 array, if isclassname is true '.' is
791    replaced by '/'.
792
793 *******************************************************************************/
794
795 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
796 {
797         char *buffer;                   /* memory buffer for  unicode characters  */
798         char *pos;                      /* pointer to current position in buffer  */
799         u4 left;                        /* unicode characters left                */
800         u4 buflength;                   /* utf length in bytes of the u2 array    */
801         utf *result;                    /* resulting utf-string                   */
802         int i;          
803
804         /* determine utf length in bytes and allocate memory */
805
806         buflength = u2_utflength(unicode_pos, unicode_length); 
807         buffer    = MNEW(char, buflength);
808  
809         left = buflength;
810         pos  = buffer;
811
812         for (i = 0; i++ < unicode_length; unicode_pos++) {
813                 /* next unicode character */
814                 u2 c = *unicode_pos;
815                 
816                 if ((c != 0) && (c < 0x80)) {
817                         /* 1 character */       
818                         left--;
819                 if ((int) left < 0) break;
820                         /* convert classname */
821                         if (isclassname && c == '.')
822                                 *pos++ = '/';
823                         else
824                                 *pos++ = (char) c;
825
826                 } else if (c < 0x800) {             
827                         /* 2 characters */                              
828                 unsigned char high = c >> 6;
829                 unsigned char low  = c & 0x3F;
830                         left = left - 2;
831                 if ((int) left < 0) break;
832                 *pos++ = high | 0xC0; 
833                 *pos++ = low  | 0x80;     
834
835                 } else {         
836                 /* 3 characters */                              
837                 char low  = c & 0x3f;
838                 char mid  = (c >> 6) & 0x3F;
839                 char high = c >> 12;
840                         left = left - 3;
841                 if ((int) left < 0) break;
842                 *pos++ = high | 0xE0; 
843                 *pos++ = mid  | 0x80;  
844                 *pos++ = low  | 0x80;   
845                 }
846         }
847         
848         /* insert utf-string into symbol-table */
849         result = utf_new(buffer,buflength);
850
851         MFREE(buffer, char, buflength);
852
853         return result;
854 }
855
856
857 /* utf_new_char ****************************************************************
858
859    Creates a new utf symbol, the text for this symbol is passed as a
860    c-string ( = char* ).
861
862 *******************************************************************************/
863
864 utf *utf_new_char(const char *text)
865 {
866         return utf_new(text, strlen(text));
867 }
868
869
870 /* utf_new_char_classname ******************************************************
871
872    Creates a new utf symbol, the text for this symbol is passed as a
873    c-string ( = char* ) "." characters are going to be replaced by
874    "/". Since the above function is used often, this is a separte
875    function, instead of an if.
876
877 *******************************************************************************/
878
879 utf *utf_new_char_classname(const char *text)
880 {
881         if (strchr(text, '.')) {
882                 char *txt = strdup(text);
883                 char *end = txt + strlen(txt);
884                 char *c;
885                 utf *tmpRes;
886
887                 for (c = txt; c < end; c++)
888                         if (*c == '.') *c = '/';
889
890                 tmpRes = utf_new(txt, strlen(txt));
891                 FREE(txt, 0);
892
893                 return tmpRes;
894
895         } else
896                 return utf_new(text, strlen(text));
897 }
898
899
900 /* utf_nextu2 ******************************************************************
901
902    Read the next unicode character from the utf string and increment
903    the utf-string pointer accordingly.
904
905    CAUTION: This function is unsafe for input that was not checked 
906             by is_valid_utf!
907
908 *******************************************************************************/
909
910 u2 utf_nextu2(char **utf_ptr)
911 {
912     /* uncompressed unicode character */
913     u2 unicode_char = 0;
914     /* current position in utf text */  
915     unsigned char *utf = (unsigned char *) (*utf_ptr);
916     /* bytes representing the unicode character */
917     unsigned char ch1, ch2, ch3;
918     /* number of bytes used to represent the unicode character */
919     int len = 0;
920         
921     switch ((ch1 = utf[0]) >> 4) {
922         default: /* 1 byte */
923                 (*utf_ptr)++;
924                 return (u2) ch1;
925         case 0xC: 
926         case 0xD: /* 2 bytes */
927                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
928                         unsigned char high = ch1 & 0x1F;
929                         unsigned char low  = ch2 & 0x3F;
930                         unicode_char = (high << 6) + low;
931                         len = 2;
932                 }
933                 break;
934
935         case 0xE: /* 2 or 3 bytes */
936                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
937                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
938                                 unsigned char low  = ch3 & 0x3f;
939                                 unsigned char mid  = ch2 & 0x3f;
940                                 unsigned char high = ch1 & 0x0f;
941                                 unicode_char = (((high << 6) + mid) << 6) + low;
942                                 len = 3;
943                         } else
944                                 len = 2;                                           
945                 }
946                 break;
947     }
948
949     /* update position in utf-text */
950     *utf_ptr = (char *) (utf + len);
951
952     return unicode_char;
953 }
954
955
956 /* utf_bytes *******************************************************************
957
958    Determine number of bytes (aka. octets) in the utf string.
959
960    IN:
961       u............utf string
962
963    OUT:
964       The number of octets of this utf string.
965           There is _no_ terminating zero included in this count.
966
967 *******************************************************************************/
968
969 u4 utf_bytes(utf *u)
970 {
971         return u->blength;
972 }
973
974
975 /* utf_get_number_of_u2s_for_buffer ********************************************
976
977    Determine number of UTF-16 u2s in the given UTF-8 buffer
978
979    CAUTION: This function is unsafe for input that was not checked 
980             by is_valid_utf!
981
982    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
983    to an array of u2s (UTF-16) and want to know how many of them you will get.
984    All other uses of this function are probably wrong.
985
986    IN:
987       buffer........points to first char in buffer
988           blength.......number of _bytes_ in the buffer
989
990    OUT:
991       the number of u2s needed to hold this string in UTF-16 encoding.
992           There is _no_ terminating zero included in this count.
993
994    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
995    exception.
996
997 *******************************************************************************/
998
999 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
1000 {
1001         const char *endpos;                 /* points behind utf string           */
1002         const char *utf_ptr;                /* current position in utf text       */
1003         u4 len = 0;                         /* number of unicode characters       */
1004
1005         utf_ptr = buffer;
1006         endpos = utf_ptr + blength;
1007
1008         while (utf_ptr < endpos) {
1009                 len++;
1010                 /* next unicode character */
1011                 utf_nextu2((char **)&utf_ptr);
1012         }
1013
1014         assert(utf_ptr == endpos);
1015
1016         return len;
1017 }
1018
1019
1020 /* utf_get_number_of_u2s *******************************************************
1021
1022    Determine number of UTF-16 u2s in the utf string.
1023
1024    CAUTION: This function is unsafe for input that was not checked 
1025             by is_valid_utf!
1026
1027    CAUTION: Use this function *only* when you want to convert a utf string
1028    to an array of u2s and want to know how many of them you will get.
1029    All other uses of this function are probably wrong.
1030
1031    IN:
1032       u............utf string
1033
1034    OUT:
1035       the number of u2s needed to hold this string in UTF-16 encoding.
1036           There is _no_ terminating zero included in this count.
1037           XXX 0 if a NullPointerException has been thrown (see below)
1038
1039 *******************************************************************************/
1040
1041 u4 utf_get_number_of_u2s(utf *u)
1042 {
1043         char *endpos;                       /* points behind utf string           */
1044         char *utf_ptr;                      /* current position in utf text       */
1045         u4 len = 0;                         /* number of unicode characters       */
1046
1047         /* XXX this is probably not checked by most callers! Review this after */
1048         /* the invalid uses of this function have been eliminated */
1049         if (u == NULL) {
1050                 exceptions_throw_nullpointerexception();
1051                 return 0;
1052         }
1053
1054         endpos = UTF_END(u);
1055         utf_ptr = u->text;
1056
1057         while (utf_ptr < endpos) {
1058                 len++;
1059                 /* next unicode character */
1060                 utf_nextu2(&utf_ptr);
1061         }
1062
1063         if (utf_ptr != endpos) {
1064                 /* string ended abruptly */
1065                 exceptions_throw_internalerror("Illegal utf8 string");
1066                 return 0;
1067         }
1068
1069         return len;
1070 }
1071
1072
1073 /* utf8_safe_number_of_u2s *****************************************************
1074
1075    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1076    (For invalid UTF-8 the U+fffd replacement character will be counted.)
1077
1078    This function is safe even for invalid UTF-8 strings.
1079
1080    IN:
1081       text..........zero-terminated(!) UTF-8 string (may be invalid)
1082                         must NOT be NULL
1083           nbytes........strlen(text). (This is needed to completely emulate
1084                         the RI).
1085
1086    OUT:
1087       the number of u2s needed to hold this string in UTF-16 encoding.
1088           There is _no_ terminating zero included in this count.
1089
1090 *******************************************************************************/
1091
1092 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1093         register const unsigned char *t;
1094         register s4 byte;
1095         register s4 len;
1096         register const unsigned char *tlimit;
1097         s4 byte1;
1098         s4 byte2;
1099         s4 byte3;
1100         s4 value;
1101         s4 skip;
1102
1103         assert(text);
1104         assert(nbytes >= 0);
1105
1106         len = 0;
1107         t = (const unsigned char *) text;
1108         tlimit = t + nbytes;
1109
1110         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1111
1112         while (1) {
1113                 byte = *t++;
1114
1115                 if (byte & 0x80) {
1116                         /* highest bit set, non-ASCII character */
1117
1118                         if ((byte & 0xe0) == 0xc0) {
1119                                 /* 2-byte: should be 110..... 10...... ? */
1120
1121                                 if ((*t++ & 0xc0) == 0x80)
1122                                         ; /* valid 2-byte */
1123                                 else
1124                                         t--; /* invalid */
1125                         }
1126                         else if ((byte & 0xf0) == 0xe0) {
1127                                 /* 3-byte: should be 1110.... 10...... 10...... */
1128                                 /*                            ^t                */
1129
1130                                 if (t + 2 > tlimit)
1131                                         return len + 1; /* invalid, stop here */
1132
1133                                 if ((*t++ & 0xc0) == 0x80) {
1134                                         if ((*t++ & 0xc0) == 0x80)
1135                                                 ; /* valid 3-byte */
1136                                         else
1137                                                 t--; /* invalid */
1138                                 }
1139                                 else
1140                                         t--; /* invalid */
1141                         }
1142                         else if ((byte & 0xf8) == 0xf0) {
1143                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1144                                 /*                            ^t                         */
1145
1146                                 if (t + 3 > tlimit)
1147                                         return len + 1; /* invalid, stop here */
1148
1149                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1150                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1151                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1152                                                         /* valid 4-byte UTF-8? */
1153                                                         value = ((byte  & 0x07) << 18)
1154                                                                   | ((byte1 & 0x3f) << 12)
1155                                                                   | ((byte2 & 0x3f) <<  6)
1156                                                                   | ((byte3 & 0x3f)      );
1157
1158                                                         if (value > 0x10FFFF)
1159                                                                 ; /* invalid */
1160                                                         else if (value > 0xFFFF)
1161                                                                 len += 1; /* we need surrogates */
1162                                                         else
1163                                                                 ; /* 16bit suffice */
1164                                                 }
1165                                                 else
1166                                                         t--; /* invalid */
1167                                         }
1168                                         else
1169                                                 t--; /* invalid */
1170                                 }
1171                                 else
1172                                         t--; /* invalid */
1173                         }
1174                         else if ((byte & 0xfc) == 0xf8) {
1175                                 /* invalid 5-byte */
1176                                 if (t + 4 > tlimit)
1177                                         return len + 1; /* invalid, stop here */
1178
1179                                 skip = 4;
1180                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1181                                         t++;
1182                         }
1183                         else if ((byte & 0xfe) == 0xfc) {
1184                                 /* invalid 6-byte */
1185                                 if (t + 5 > tlimit)
1186                                         return len + 1; /* invalid, stop here */
1187
1188                                 skip = 5;
1189                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1190                                         t++;
1191                         }
1192                         else
1193                                 ; /* invalid */
1194                 }
1195                 else {
1196                         /* NUL */
1197
1198                         if (byte == 0)
1199                                 break;
1200
1201                         /* ASCII character, common case */
1202                 }
1203
1204                 len++;
1205         }
1206
1207         return len;
1208 }
1209
1210
1211 /* utf8_safe_convert_to_u2s ****************************************************
1212
1213    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1214    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1215    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1216
1217    This function is safe even for invalid UTF-8 strings.
1218
1219    IN:
1220       text..........zero-terminated(!) UTF-8 string (may be invalid)
1221                         must NOT be NULL
1222           nbytes........strlen(text). (This is needed to completely emulate
1223                                         the RI).
1224           buffer........a preallocated array of u2s to receive the decoded
1225                         string. Use utf8_safe_number_of_u2s to get the
1226                                         required number of u2s for allocating this.
1227
1228 *******************************************************************************/
1229
1230 #define UNICODE_REPLACEMENT  0xfffd
1231
1232 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1233         register const unsigned char *t;
1234         register s4 byte;
1235         register const unsigned char *tlimit;
1236         s4 byte1;
1237         s4 byte2;
1238         s4 byte3;
1239         s4 value;
1240         s4 skip;
1241
1242         assert(text);
1243         assert(nbytes >= 0);
1244
1245         t = (const unsigned char *) text;
1246         tlimit = t + nbytes;
1247
1248         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1249
1250         while (1) {
1251                 byte = *t++;
1252
1253                 if (byte & 0x80) {
1254                         /* highest bit set, non-ASCII character */
1255
1256                         if ((byte & 0xe0) == 0xc0) {
1257                                 /* 2-byte: should be 110..... 10...... */
1258
1259                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1260                                         /* valid 2-byte UTF-8 */
1261                                         *buffer++ = ((byte  & 0x1f) << 6)
1262                                                           | ((byte1 & 0x3f)     );
1263                                 }
1264                                 else {
1265                                         *buffer++ = UNICODE_REPLACEMENT;
1266                                         t--;
1267                                 }
1268                         }
1269                         else if ((byte & 0xf0) == 0xe0) {
1270                                 /* 3-byte: should be 1110.... 10...... 10...... */
1271
1272                                 if (t + 2 > tlimit) {
1273                                         *buffer++ = UNICODE_REPLACEMENT;
1274                                         return;
1275                                 }
1276
1277                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1278                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1279                                                 /* valid 3-byte UTF-8 */
1280                                                 *buffer++ = ((byte  & 0x0f) << 12)
1281                                                                   | ((byte1 & 0x3f) <<  6)
1282                                                                   | ((byte2 & 0x3f)      );
1283                                         }
1284                                         else {
1285                                                 *buffer++ = UNICODE_REPLACEMENT;
1286                                                 t--;
1287                                         }
1288                                 }
1289                                 else {
1290                                         *buffer++ = UNICODE_REPLACEMENT;
1291                                         t--;
1292                                 }
1293                         }
1294                         else if ((byte & 0xf8) == 0xf0) {
1295                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1296
1297                                 if (t + 3 > tlimit) {
1298                                         *buffer++ = UNICODE_REPLACEMENT;
1299                                         return;
1300                                 }
1301
1302                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1303                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1304                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1305                                                         /* valid 4-byte UTF-8? */
1306                                                         value = ((byte  & 0x07) << 18)
1307                                                                   | ((byte1 & 0x3f) << 12)
1308                                                                   | ((byte2 & 0x3f) <<  6)
1309                                                                   | ((byte3 & 0x3f)      );
1310
1311                                                         if (value > 0x10FFFF) {
1312                                                                 *buffer++ = UNICODE_REPLACEMENT;
1313                                                         }
1314                                                         else if (value > 0xFFFF) {
1315                                                                 /* we need surrogates */
1316                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1317                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1318                                                         }
1319                                                         else
1320                                                                 *buffer++ = value; /* 16bit suffice */
1321                                                 }
1322                                                 else {
1323                                                         *buffer++ = UNICODE_REPLACEMENT;
1324                                                         t--;
1325                                                 }
1326                                         }
1327                                         else {
1328                                                 *buffer++ = UNICODE_REPLACEMENT;
1329                                                 t--;
1330                                         }
1331                                 }
1332                                 else {
1333                                         *buffer++ = UNICODE_REPLACEMENT;
1334                                         t--;
1335                                 }
1336                         }
1337                         else if ((byte & 0xfc) == 0xf8) {
1338                                 if (t + 4 > tlimit) {
1339                                         *buffer++ = UNICODE_REPLACEMENT;
1340                                         return;
1341                                 }
1342
1343                                 skip = 4;
1344                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1345                                         t++;
1346                                 *buffer++ = UNICODE_REPLACEMENT;
1347                         }
1348                         else if ((byte & 0xfe) == 0xfc) {
1349                                 if (t + 5 > tlimit) {
1350                                         *buffer++ = UNICODE_REPLACEMENT;
1351                                         return;
1352                                 }
1353
1354                                 skip = 5;
1355                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1356                                         t++;
1357                                 *buffer++ = UNICODE_REPLACEMENT;
1358                         }
1359                         else
1360                                 *buffer++ = UNICODE_REPLACEMENT;
1361                 }
1362                 else {
1363                         /* NUL */
1364
1365                         if (byte == 0)
1366                                 break;
1367
1368                         /* ASCII character, common case */
1369
1370                         *buffer++ = byte;
1371                 }
1372         }
1373 }
1374
1375
1376 /* u2_utflength ****************************************************************
1377
1378    Returns the utf length in bytes of a u2 array.
1379
1380 *******************************************************************************/
1381
1382 u4 u2_utflength(u2 *text, u4 u2_length)
1383 {
1384         u4 result_len = 0;                  /* utf length in bytes                */
1385         u2 ch;                              /* current unicode character          */
1386         u4 len;
1387         
1388         for (len = 0; len < u2_length; len++) {
1389                 /* next unicode character */
1390                 ch = *text++;
1391           
1392                 /* determine bytes required to store unicode character as utf */
1393                 if (ch && (ch < 0x80)) 
1394                         result_len++;
1395                 else if (ch < 0x800)
1396                         result_len += 2;        
1397                 else 
1398                         result_len += 3;        
1399         }
1400
1401     return result_len;
1402 }
1403
1404
1405 /* utf_copy ********************************************************************
1406
1407    Copy the given utf string byte-for-byte to a buffer.
1408
1409    IN:
1410       buffer.......the buffer
1411           u............the utf string
1412
1413 *******************************************************************************/
1414
1415 void utf_copy(char *buffer, utf *u)
1416 {
1417         /* our utf strings are zero-terminated (done by utf_new) */
1418         MCOPY(buffer, u->text, char, u->blength + 1);
1419 }
1420
1421
1422 /* utf_cat *********************************************************************
1423
1424    Append the given utf string byte-for-byte to a buffer.
1425
1426    IN:
1427       buffer.......the buffer
1428           u............the utf string
1429
1430 *******************************************************************************/
1431
1432 void utf_cat(char *buffer, utf *u)
1433 {
1434         /* our utf strings are zero-terminated (done by utf_new) */
1435         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1436 }
1437
1438
1439 /* utf_copy_classname **********************************************************
1440
1441    Copy the given utf classname byte-for-byte to a buffer.
1442    '/' is replaced by '.'
1443
1444    IN:
1445       buffer.......the buffer
1446           u............the utf string
1447
1448 *******************************************************************************/
1449
1450 void utf_copy_classname(char *buffer, utf *u)
1451 {
1452         char *bufptr;
1453         char *srcptr;
1454         char *endptr;
1455         char ch;
1456
1457         bufptr = buffer;
1458         srcptr = u->text;
1459         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1460
1461         while (srcptr != endptr) {
1462                 ch = *srcptr++;
1463                 if (ch == '/')
1464                         ch = '.';
1465                 *bufptr++ = ch;
1466         }
1467 }
1468
1469
1470 /* utf_cat *********************************************************************
1471
1472    Append the given utf classname byte-for-byte to a buffer.
1473    '/' is replaced by '.'
1474
1475    IN:
1476       buffer.......the buffer
1477           u............the utf string
1478
1479 *******************************************************************************/
1480
1481 void utf_cat_classname(char *buffer, utf *u)
1482 {
1483         utf_copy_classname(buffer + strlen(buffer), u);
1484 }
1485
1486 /* utf_display_printable_ascii *************************************************
1487
1488    Write utf symbol to stdout (for debugging purposes).
1489    Non-printable and non-ASCII characters are printed as '?'.
1490
1491 *******************************************************************************/
1492
1493 void utf_display_printable_ascii(utf *u)
1494 {
1495         char *endpos;                       /* points behind utf string           */
1496         char *utf_ptr;                      /* current position in utf text       */
1497
1498         if (u == NULL) {
1499                 printf("NULL");
1500                 fflush(stdout);
1501                 return;
1502         }
1503
1504         endpos = UTF_END(u);
1505         utf_ptr = u->text;
1506
1507         while (utf_ptr < endpos) {
1508                 /* read next unicode character */
1509
1510                 u2 c = utf_nextu2(&utf_ptr);
1511
1512                 if ((c >= 32) && (c <= 127))
1513                         printf("%c", c);
1514                 else
1515                         printf("?");
1516         }
1517
1518         fflush(stdout);
1519 }
1520
1521
1522 /* utf_display_printable_ascii_classname ***************************************
1523
1524    Write utf symbol to stdout with `/' converted to `.' (for debugging
1525    purposes).
1526    Non-printable and non-ASCII characters are printed as '?'.
1527
1528 *******************************************************************************/
1529
1530 void utf_display_printable_ascii_classname(utf *u)
1531 {
1532         char *endpos;                       /* points behind utf string           */
1533         char *utf_ptr;                      /* current position in utf text       */
1534
1535         if (u == NULL) {
1536                 printf("NULL");
1537                 fflush(stdout);
1538                 return;
1539         }
1540
1541         endpos = UTF_END(u);
1542         utf_ptr = u->text;
1543
1544         while (utf_ptr < endpos) {
1545                 /* read next unicode character */
1546
1547                 u2 c = utf_nextu2(&utf_ptr);
1548
1549                 if (c == '/')
1550                         c = '.';
1551
1552                 if ((c >= 32) && (c <= 127))
1553                         printf("%c", c);
1554                 else
1555                         printf("?");
1556         }
1557
1558         fflush(stdout);
1559 }
1560
1561
1562 /* utf_sprint_convert_to_latin1 ************************************************
1563         
1564    Write utf symbol into c-string (for debugging purposes).
1565    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1566    invalid results.
1567
1568 *******************************************************************************/
1569
1570 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1571 {
1572         char *endpos;                       /* points behind utf string           */
1573         char *utf_ptr;                      /* current position in utf text       */
1574         u2 pos = 0;                         /* position in c-string               */
1575
1576         if (!u) {
1577                 strcpy(buffer, "NULL");
1578                 return;
1579         }
1580
1581         endpos = UTF_END(u);
1582         utf_ptr = u->text;
1583
1584         while (utf_ptr < endpos) 
1585                 /* copy next unicode character */       
1586                 buffer[pos++] = utf_nextu2(&utf_ptr);
1587
1588         /* terminate string */
1589         buffer[pos] = '\0';
1590 }
1591
1592
1593 /* utf_sprint_convert_to_latin1_classname **************************************
1594         
1595    Write utf symbol into c-string with `/' converted to `.' (for debugging
1596    purposes).
1597    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1598    invalid results.
1599
1600 *******************************************************************************/
1601
1602 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1603 {
1604         char *endpos;                       /* points behind utf string           */
1605         char *utf_ptr;                      /* current position in utf text       */
1606         u2 pos = 0;                         /* position in c-string               */
1607
1608         if (!u) {
1609                 strcpy(buffer, "NULL");
1610                 return;
1611         }
1612
1613         endpos = UTF_END(u);
1614         utf_ptr = u->text;
1615
1616         while (utf_ptr < endpos) {
1617                 /* copy next unicode character */       
1618                 u2 c = utf_nextu2(&utf_ptr);
1619                 if (c == '/') c = '.';
1620                 buffer[pos++] = c;
1621         }
1622
1623         /* terminate string */
1624         buffer[pos] = '\0';
1625 }
1626
1627
1628 /* utf_strcat_convert_to_latin1 ************************************************
1629         
1630    Like libc strcat, but uses an utf8 string.
1631    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1632    invalid results.
1633
1634 *******************************************************************************/
1635
1636 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1637 {
1638         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1639 }
1640
1641
1642 /* utf_strcat_convert_to_latin1_classname **************************************
1643         
1644    Like libc strcat, but uses an utf8 string.
1645    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1646    invalid results.
1647
1648 *******************************************************************************/
1649
1650 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1651 {
1652         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1653 }
1654
1655
1656 /* utf_fprint_printable_ascii **************************************************
1657         
1658    Write utf symbol into file.
1659    Non-printable and non-ASCII characters are printed as '?'.
1660
1661 *******************************************************************************/
1662
1663 void utf_fprint_printable_ascii(FILE *file, utf *u)
1664 {
1665         char *endpos;                       /* points behind utf string           */
1666         char *utf_ptr;                      /* current position in utf text       */
1667
1668         if (!u)
1669                 return;
1670
1671         endpos = UTF_END(u);
1672         utf_ptr = u->text;
1673
1674         while (utf_ptr < endpos) { 
1675                 /* read next unicode character */                
1676                 u2 c = utf_nextu2(&utf_ptr);                            
1677
1678                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1679                 else fprintf(file, "?");
1680         }
1681 }
1682
1683
1684 /* utf_fprint_printable_ascii_classname ****************************************
1685         
1686    Write utf symbol into file with `/' converted to `.'.
1687    Non-printable and non-ASCII characters are printed as '?'.
1688
1689 *******************************************************************************/
1690
1691 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1692 {
1693         char *endpos;                       /* points behind utf string           */
1694         char *utf_ptr;                      /* current position in utf text       */
1695
1696     if (!u)
1697                 return;
1698
1699         endpos = UTF_END(u);
1700         utf_ptr = u->text;
1701
1702         while (utf_ptr < endpos) { 
1703                 /* read next unicode character */                
1704                 u2 c = utf_nextu2(&utf_ptr);                            
1705                 if (c == '/') c = '.';
1706
1707                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1708                 else fprintf(file, "?");
1709         }
1710 }
1711
1712
/* is_valid_utf ****************************************************************

   Return true if the given byte range is a valid UTF-8 string in the
   sense accepted for Java class files.

   Accepted sequences:
     - single bytes 0x01..0x7f,
     - two-byte sequences   110x xxxx  10xx xxxx,
     - three-byte sequences 1110 xxxx  10xx xxxx  10xx xxxx.

   Rejected:
     - end_pos lying before utf_ptr,
     - a plain 0x00 byte,
     - leading bytes of longer sequences and stray continuation bytes,
     - sequences that are cut off by end_pos,
     - the code point zero in any encoding but the two-byte one
       (Java special).

   Deliberately NOT rejected, because Sun Java seems to allow them:
   overlong encodings of non-zero code points, surrogates
   (0xd800..0xdfff) and the code points 0xfffe and 0xffff.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	const unsigned char *p;             /* current read position              */
	const unsigned char *end;           /* first byte behind the string       */
	unsigned char        lead;          /* first byte of current sequence     */
	unsigned char        cont;          /* current continuation byte          */
	unsigned long        value;         /* decoded code point                 */
	int                  trailing;      /* number of continuation bytes       */
	int                  k;

	if (end_pos < utf_ptr)
		return false;

	p   = (const unsigned char *) utf_ptr;
	end = (const unsigned char *) end_pos;

	while (p < end) {
		lead = *p++;

		if (lead == 0x00)                       /* 0x00 is not allowed        */
			return false;

		if (lead < 0x80)                        /* ASCII                      */
			continue;

		if ((lead & 0xe0) == 0xc0)              /* 110x xxxx                  */
			trailing = 1;
		else if ((lead & 0xf0) == 0xe0)         /* 1110 xxxx                  */
			trailing = 2;
		else                                    /* longer sequences (Java     */
			return false;                       /* limitation) or invalid     */
		                                        /* leading byte               */

		if ((end - p) < trailing)               /* missing bytes              */
			return false;

		/* payload bits of the leading byte */

		value = lead & (0x3f >> trailing);

		for (k = 0; k < trailing; k++) {
			cont = *p++;

			if ((cont & 0xc0) != 0x80)          /* 10xx xxxx                  */
				return false;

			value = (value << 6) | (cont & 0x3f);
		}

		/* Java special: zero must be encoded with exactly two bytes */

		if ((value == 0) && (trailing != 1))
			return false;
	}

	return true;
}
1778
1779
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters, i.e. bytes below 0x20, and the two-byte encoding
   0xc0 0x80 of the zero character.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Only bytes inside [utf_ptr, end_pos) are examined.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                           /* disallow empty names       */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;                       /* disallow control characters */

		/* disallow zero; the bounds check keeps the look-ahead from
		   reading the byte at end_pos when 0xc0 is the last byte */

		if ((c == 0xc0) && (utf_ptr < end_pos) &&
			((unsigned char) *utf_ptr == 0x80))
			return false;
	}

	return true;
}
1807
1808 bool is_valid_name_utf(utf *u)
1809 {
1810         return is_valid_name(u->text, UTF_END(u));
1811 }
1812
1813
1814 /* utf_show ********************************************************************
1815
1816    Writes the utf symbols in the utfhash to stdout and displays the
1817    number of external hash chains grouped according to the chainlength
1818    (for debugging purposes).
1819
1820 *******************************************************************************/
1821
1822 #if !defined(NDEBUG)
1823 void utf_show(void)
1824 {
1825
1826 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1827
1828         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1829         u4 max_chainlength = 0;      /* maximum length of the chains */
1830         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1831         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1832         u4 i;
1833
1834         printf("UTF-HASH:\n");
1835
1836         /* show element of utf-hashtable */
1837
1838         for (i = 0; i < hashtable_utf->size; i++) {
1839                 utf *u = hashtable_utf->ptr[i];
1840
1841                 if (u) {
1842                         printf("SLOT %d: ", (int) i);
1843
1844                         while (u) {
1845                                 printf("'");
1846                                 utf_display_printable_ascii(u);
1847                                 printf("' ");
1848                                 u = u->hashlink;
1849                         }       
1850                         printf("\n");
1851                 }
1852         }
1853
1854         printf("UTF-HASH: %d slots for %d entries\n", 
1855                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1856
1857         if (hashtable_utf->entries == 0)
1858                 return;
1859
1860         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1861
1862         for (i=0;i<CHAIN_LIMIT;i++)
1863                 chain_count[i]=0;
1864
1865         /* count numbers of hashchains according to their length */
1866         for (i=0; i<hashtable_utf->size; i++) {
1867                   
1868                 utf *u = (utf*) hashtable_utf->ptr[i];
1869                 u4 chain_length = 0;
1870
1871                 /* determine chainlength */
1872                 while (u) {
1873                         u = u->hashlink;
1874                         chain_length++;
1875                 }
1876
1877                 /* update sum of all chainlengths */
1878                 sum_chainlength+=chain_length;
1879
1880                 /* determine the maximum length of the chains */
1881                 if (chain_length>max_chainlength)
1882                         max_chainlength = chain_length;
1883
1884                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1885                 if (chain_length>=CHAIN_LIMIT) {
1886                         beyond_limit+=chain_length;
1887                         chain_length=CHAIN_LIMIT-1;
1888                 }
1889
1890                 /* update number of hashchains of current length */
1891                 chain_count[chain_length]++;
1892         }
1893
1894         /* display results */  
1895         for (i=1;i<CHAIN_LIMIT-1;i++) 
1896                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1897           
1898         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1899
1900
1901         printf("max. chainlength:%5d\n",max_chainlength);
1902
1903         /* avg. chainlength = sum of chainlengths / number of chains */
1904         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1905 }
1906 #endif /* !defined(NDEBUG) */
1907
1908
1909 /*
1910  * These are local overrides for various environment variables in Emacs.
1911  * Please do not remove this and leave it at the end of the file, where
1912  * Emacs will automagically detect them.
1913  * ---------------------------------------------------------------------
1914  * Local variables:
1915  * mode: c
1916  * indent-tabs-mode: t
1917  * c-basic-offset: 4
1918  * tab-width: 4
1919  * End:
1920  * vim:noexpandtab:sw=4:ts=4:
1921  */