fcb587d2d184f7a3f73c20a44666fc4ea5a3df3f
[cacao.git] / src / vmcore / utf8.c
1 /* src/vmcore/utf8.c - utf8 string functions
2
3    Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6    J. Wenninger, Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23    02110-1301, USA.
24
25    $Id: utf8.c 8367 2007-08-20 20:26:16Z twisti $
26
27 */
28
29
30 #include "config.h"
31
32 #include <string.h>
33 #include <assert.h>
34
35 #include "vm/types.h"
36
37 #include "mm/memory.h"
38
39 #include "threads/lock-common.h"
40
41 #include "toolbox/hashtable.h"
42
43 #include "vm/exceptions.h"
44
45 #include "vmcore/options.h"
46
47 #if defined(ENABLE_STATISTICS)
48 # include "vmcore/statistics.h"
49 #endif
50
51 #include "vmcore/utf8.h"
52
53
54 /* global variables ***********************************************************/
55
56 /* hashsize must be a power of 2: slots are selected with key & (size - 1) */
57
58 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
59
60 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
61
62
63 /* utf-symbols for pointer comparison of frequently used strings **************/
64
65 utf *utf_java_lang_Object;
66
67 utf *utf_java_lang_Class;
68 utf *utf_java_lang_ClassLoader;
69 utf *utf_java_lang_Cloneable;
70 utf *utf_java_lang_SecurityManager;
71 utf *utf_java_lang_String;
72 utf *utf_java_lang_System;
73 utf *utf_java_lang_ThreadGroup;
74 utf *utf_java_lang_ref_SoftReference;
75 utf *utf_java_lang_ref_WeakReference;
76 utf *utf_java_lang_ref_PhantomReference;
77 utf *utf_java_io_Serializable;
78
79 utf *utf_java_lang_Throwable;
80 utf *utf_java_lang_Error;
81
82 utf *utf_java_lang_AbstractMethodError;
83 utf *utf_java_lang_ClassCircularityError;
84 utf *utf_java_lang_ClassFormatError;
85 utf *utf_java_lang_ExceptionInInitializerError;
86 utf *utf_java_lang_IncompatibleClassChangeError;
87 utf *utf_java_lang_InstantiationError;
88 utf *utf_java_lang_InternalError;
89 utf *utf_java_lang_LinkageError;
90 utf *utf_java_lang_NoClassDefFoundError;
91 utf *utf_java_lang_NoSuchFieldError;
92 utf *utf_java_lang_NoSuchMethodError;
93 utf *utf_java_lang_OutOfMemoryError;
94 utf *utf_java_lang_UnsatisfiedLinkError;
95 utf *utf_java_lang_UnsupportedClassVersionError;
96 utf *utf_java_lang_VerifyError;
97 utf *utf_java_lang_VirtualMachineError;
98
99 #if defined(WITH_CLASSPATH_GNU)
100 utf *utf_java_lang_VMThrowable;
101 #endif
102
103 utf *utf_java_lang_Exception;
104
105 utf *utf_java_lang_ArithmeticException;
106 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
107 utf *utf_java_lang_ArrayStoreException;
108 utf *utf_java_lang_ClassCastException;
109 utf *utf_java_lang_ClassNotFoundException;
110 utf *utf_java_lang_CloneNotSupportedException;
111 utf *utf_java_lang_IllegalAccessException;
112 utf *utf_java_lang_IllegalArgumentException;
113 utf *utf_java_lang_IllegalMonitorStateException;
114 utf *utf_java_lang_InstantiationException;
115 utf *utf_java_lang_InterruptedException;
116 utf *utf_java_lang_NegativeArraySizeException;
117 utf *utf_java_lang_NullPointerException;
118 utf *utf_java_lang_StringIndexOutOfBoundsException;
119
120 utf *utf_java_lang_reflect_InvocationTargetException;
121
122 utf *utf_java_security_PrivilegedActionException;
123
124 #if defined(ENABLE_JAVASE)
125 utf* utf_java_lang_Void;
126 #endif
127
128 utf* utf_java_lang_Boolean;
129 utf* utf_java_lang_Byte;
130 utf* utf_java_lang_Character;
131 utf* utf_java_lang_Short;
132 utf* utf_java_lang_Integer;
133 utf* utf_java_lang_Long;
134 utf* utf_java_lang_Float;
135 utf* utf_java_lang_Double;
136
137 #if defined(ENABLE_JAVASE)
138 utf *utf_java_lang_StackTraceElement;
139 utf *utf_java_lang_reflect_Constructor;
140 utf *utf_java_lang_reflect_Field;
141 utf *utf_java_lang_reflect_Method;
142 utf *utf_java_util_Vector;
143 #endif
144
145 utf *utf_InnerClasses;                  /* InnerClasses                       */
146 utf *utf_ConstantValue;                 /* ConstantValue                      */
147 utf *utf_Code;                          /* Code                               */
148 utf *utf_Exceptions;                    /* Exceptions                         */
149 utf *utf_LineNumberTable;               /* LineNumberTable                    */
150 utf *utf_SourceFile;                    /* SourceFile                         */
151
152 #if defined(ENABLE_JAVASE)
153 utf *utf_EnclosingMethod;               /* EnclosingMethod                    */
154 utf *utf_Signature;                     /* Signature                          */
155 utf *utf_StackMapTable;                 /* StackMapTable                      */
156
157 #if defined(ENABLE_ANNOTATIONS)
158 utf *utf_sun_reflect_ConstantPool;
159 #if defined(WITH_CLASSPATH_GNU)
160 utf *utf_sun_reflect_annotation_AnnotationParser;
161 #endif
162
163 utf *utf_RuntimeVisibleAnnotations;
164 utf *utf_RuntimeInvisibleAnnotations;
165 utf *utf_RuntimeVisibleParameterAnnotations;
166 utf *utf_RuntimeInvisibleParameterAnnotations;
167 utf *utf_AnnotationDefault;
168 #endif
169 #endif
170
171 utf *utf_init;                          /* <init>                             */
172 utf *utf_clinit;                        /* <clinit>                           */
173 utf *utf_clone;                         /* clone                              */
174 utf *utf_finalize;                      /* finalize                           */
175 utf *utf_run;                           /* run                                */
176
177 utf *utf_add;                           /* add                                */
178 utf *utf_remove;                        /* remove                             */
179 utf *utf_addThread;                     /* addThread                          */
180 utf *utf_removeThread;                  /* removeThread                       */
181 utf *utf_put;                           /* put                                */
182 utf *utf_get;                           /* get                                */
183 utf *utf_uncaughtException;             /* uncaughtException                  */
184 utf *utf_value;                         /* value                              */
185
186 utf *utf_fillInStackTrace;              /* fillInStackTrace                   */
187 utf *utf_findNative;                    /* findNative                         */
188 utf *utf_getSystemClassLoader;          /* getSystemClassLoader               */
189 utf *utf_initCause;                     /* initCause                          */
190 utf *utf_loadClass;                     /* loadClass                          */
191 utf *utf_printStackTrace;               /* printStackTrace                    */
192
193 utf *utf_division_by_zero;              /* "/ by zero"                        */
194
195 utf *utf_Z;                             /* Z                                  */
196 utf *utf_B;                             /* B                                  */
197 utf *utf_C;                             /* C                                  */
198 utf *utf_S;                             /* S                                  */
199 utf *utf_I;                             /* I                                  */
200 utf *utf_J;                             /* J                                  */
201 utf *utf_F;                             /* F                                  */
202 utf *utf_D;                             /* D                                  */
203
204 utf *utf_void__void;                    /* ()V                                */
205 utf *utf_boolean__void;                 /* (Z)V                               */
206 utf *utf_byte__void;                    /* (B)V                               */
207 utf *utf_char__void;                    /* (C)V                               */
208 utf *utf_short__void;                   /* (S)V                               */
209 utf *utf_int__void;                     /* (I)V                               */
210 utf *utf_long__void;                    /* (J)V                               */
211 utf *utf_float__void;                   /* (F)V                               */
212 utf *utf_double__void;                  /* (D)V                               */
213
214 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
215 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
216 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
217 utf *utf_java_lang_ClassLoader_java_lang_String__J; /* (Ljava/lang/ClassLoader;Ljava/lang/String;)J */
218 utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
219 utf *utf_java_lang_Object__java_lang_Object; /* (Ljava/lang/Object;)Ljava/lang/Object; */
220 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
221 utf *utf_java_lang_String__java_lang_Class; /* (Ljava/lang/String;)Ljava/lang/Class; */
222 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
223 utf *utf_java_lang_Thread_java_lang_Throwable__V; /* (Ljava/lang/Thread;Ljava/lang/Throwable;)V */
224 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
225 utf *utf_java_lang_Throwable__java_lang_Throwable; /* (Ljava/lang/Throwable;)Ljava/lang/Throwable; */
226
227 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
228 utf *utf_null;                          /* null                               */
229 utf *array_packagename;                 /* "\t<the array package>"            */
230
231
232 /* utf_init ********************************************************************
233
234    Initializes the utf8 subsystem.
235
236 *******************************************************************************/
237
238 bool utf8_init(void)
239 {
240         /* create utf8 hashtable */
241
242         hashtable_utf = NEW(hashtable);
243
244         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
245
246 #if defined(ENABLE_STATISTICS)
247         if (opt_stat)
248                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
249 #endif
250
251         /* create utf-symbols for pointer comparison of frequently used strings */
252
253         utf_java_lang_Object           = utf_new_char("java/lang/Object");
254
255         utf_java_lang_Class            = utf_new_char("java/lang/Class");
256         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
257         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
258         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
259         utf_java_lang_String           = utf_new_char("java/lang/String");
260         utf_java_lang_System           = utf_new_char("java/lang/System");
261         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
262
263         utf_java_lang_ref_SoftReference =
264                 utf_new_char("java/lang/ref/SoftReference");
265
266         utf_java_lang_ref_WeakReference =
267                 utf_new_char("java/lang/ref/WeakReference");
268
269         utf_java_lang_ref_PhantomReference =
270                 utf_new_char("java/lang/ref/PhantomReference");
271
272         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
273
274         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
275         utf_java_lang_Error            = utf_new_char("java/lang/Error");
276
277         utf_java_lang_ClassCircularityError =
278                 utf_new_char("java/lang/ClassCircularityError");
279
280         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
281
282         utf_java_lang_ExceptionInInitializerError =
283                 utf_new_char("java/lang/ExceptionInInitializerError");
284
285         utf_java_lang_IncompatibleClassChangeError =
286                 utf_new_char("java/lang/IncompatibleClassChangeError");
287
288         utf_java_lang_InstantiationError =
289                 utf_new_char("java/lang/InstantiationError");
290
291         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
292         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
293
294         utf_java_lang_NoClassDefFoundError =
295                 utf_new_char("java/lang/NoClassDefFoundError");
296
297         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
298
299         utf_java_lang_UnsatisfiedLinkError =
300                 utf_new_char("java/lang/UnsatisfiedLinkError");
301
302         utf_java_lang_UnsupportedClassVersionError =
303                 utf_new_char("java/lang/UnsupportedClassVersionError");
304
305         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
306
307         utf_java_lang_VirtualMachineError =
308                 utf_new_char("java/lang/VirtualMachineError");
309
310 #if defined(ENABLE_JAVASE)
311         utf_java_lang_AbstractMethodError =
312                 utf_new_char("java/lang/AbstractMethodError");
313
314         utf_java_lang_NoSuchFieldError =
315                 utf_new_char("java/lang/NoSuchFieldError");
316
317         utf_java_lang_NoSuchMethodError =
318                 utf_new_char("java/lang/NoSuchMethodError");
319 #endif
320
321 #if defined(WITH_CLASSPATH_GNU)
322         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
323 #endif
324
325         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
326
327         utf_java_lang_ArithmeticException =
328                 utf_new_char("java/lang/ArithmeticException");
329
330         utf_java_lang_ArrayIndexOutOfBoundsException =
331                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
332
333         utf_java_lang_ArrayStoreException =
334                 utf_new_char("java/lang/ArrayStoreException");
335
336         utf_java_lang_ClassCastException =
337                 utf_new_char("java/lang/ClassCastException");
338
339         utf_java_lang_ClassNotFoundException =
340                 utf_new_char("java/lang/ClassNotFoundException");
341
342         utf_java_lang_CloneNotSupportedException =
343                 utf_new_char("java/lang/CloneNotSupportedException");
344
345         utf_java_lang_IllegalAccessException =
346                 utf_new_char("java/lang/IllegalAccessException");
347
348         utf_java_lang_IllegalArgumentException =
349                 utf_new_char("java/lang/IllegalArgumentException");
350
351         utf_java_lang_IllegalMonitorStateException =
352                 utf_new_char("java/lang/IllegalMonitorStateException");
353
354         utf_java_lang_InstantiationException =
355                 utf_new_char("java/lang/InstantiationException");
356
357         utf_java_lang_InterruptedException =
358                 utf_new_char("java/lang/InterruptedException");
359
360         utf_java_lang_NegativeArraySizeException =
361                 utf_new_char("java/lang/NegativeArraySizeException");
362
363         utf_java_lang_NullPointerException =
364                 utf_new_char("java/lang/NullPointerException");
365
366         utf_java_lang_StringIndexOutOfBoundsException =
367                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
368
369         utf_java_lang_reflect_InvocationTargetException =
370                 utf_new_char("java/lang/reflect/InvocationTargetException");
371
372         utf_java_security_PrivilegedActionException =
373                 utf_new_char("java/security/PrivilegedActionException");
374  
375 #if defined(ENABLE_JAVASE)
376         utf_java_lang_Void             = utf_new_char("java/lang/Void");
377 #endif
378
379         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
380         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
381         utf_java_lang_Character        = utf_new_char("java/lang/Character");
382         utf_java_lang_Short            = utf_new_char("java/lang/Short");
383         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
384         utf_java_lang_Long             = utf_new_char("java/lang/Long");
385         utf_java_lang_Float            = utf_new_char("java/lang/Float");
386         utf_java_lang_Double           = utf_new_char("java/lang/Double");
387
388 #if defined(ENABLE_JAVASE)
389         utf_java_lang_StackTraceElement =
390                 utf_new_char("java/lang/StackTraceElement");
391
392         utf_java_lang_reflect_Constructor =
393                 utf_new_char("java/lang/reflect/Constructor");
394
395         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
396         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
397         utf_java_util_Vector           = utf_new_char("java/util/Vector");
398 #endif
399
400         utf_InnerClasses               = utf_new_char("InnerClasses");
401         utf_ConstantValue              = utf_new_char("ConstantValue");
402         utf_Code                       = utf_new_char("Code");
403         utf_Exceptions                 = utf_new_char("Exceptions");
404         utf_LineNumberTable            = utf_new_char("LineNumberTable");
405         utf_SourceFile                 = utf_new_char("SourceFile");
406
407 #if defined(ENABLE_JAVASE)
408         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
409         utf_Signature                  = utf_new_char("Signature");
410         utf_StackMapTable              = utf_new_char("StackMapTable");
411
412 #if defined(ENABLE_ANNOTATIONS)
413         utf_sun_reflect_ConstantPool                = utf_new_char("sun/reflect/ConstantPool");
414 #if defined(WITH_CLASSPATH_GNU)
415         utf_sun_reflect_annotation_AnnotationParser = utf_new_char("sun/reflect/annotation/AnnotationParser");
416 #endif
417
418         utf_RuntimeVisibleAnnotations            = utf_new_char("RuntimeVisibleAnnotations");
419         utf_RuntimeInvisibleAnnotations          = utf_new_char("RuntimeInvisibleAnnotations");
420         utf_RuntimeVisibleParameterAnnotations   = utf_new_char("RuntimeVisibleParameterAnnotations");
421         utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
422         utf_AnnotationDefault                    = utf_new_char("AnnotationDefault");
423 #endif
424 #endif
425
426         utf_init                           = utf_new_char("<init>");
427         utf_clinit                         = utf_new_char("<clinit>");
428         utf_clone                      = utf_new_char("clone");
429         utf_finalize                   = utf_new_char("finalize");
430         utf_run                        = utf_new_char("run");
431
432         utf_add                        = utf_new_char("add");
433         utf_remove                     = utf_new_char("remove");
434         utf_addThread                  = utf_new_char("addThread");
435         utf_removeThread               = utf_new_char("removeThread");
436         utf_put                        = utf_new_char("put");
437         utf_get                        = utf_new_char("get");
438         utf_uncaughtException          = utf_new_char("uncaughtException");
439         utf_value                      = utf_new_char("value");
440
441         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
442         utf_findNative                 = utf_new_char("findNative");
443         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
444         utf_initCause                  = utf_new_char("initCause");
445         utf_loadClass                  = utf_new_char("loadClass");
446         utf_printStackTrace            = utf_new_char("printStackTrace");
447
448         utf_division_by_zero           = utf_new_char("/ by zero");
449
450         utf_Z                          = utf_new_char("Z");
451         utf_B                          = utf_new_char("B");
452         utf_C                          = utf_new_char("C");
453         utf_S                          = utf_new_char("S");
454         utf_I                          = utf_new_char("I");
455         utf_J                          = utf_new_char("J");
456         utf_F                          = utf_new_char("F");
457         utf_D                          = utf_new_char("D");
458
459         utf_void__void                 = utf_new_char("()V");
460         utf_boolean__void              = utf_new_char("(Z)V");
461         utf_byte__void                 = utf_new_char("(B)V");
462         utf_char__void                 = utf_new_char("(C)V");
463         utf_short__void                = utf_new_char("(S)V");
464         utf_int__void                  = utf_new_char("(I)V");
465         utf_long__void                 = utf_new_char("(J)V");
466         utf_float__void                = utf_new_char("(F)V");
467         utf_double__void               = utf_new_char("(D)V");
468         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
469         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
470
471         utf_void__java_lang_ClassLoader =
472                 utf_new_char("()Ljava/lang/ClassLoader;");
473
474         utf_java_lang_ClassLoader_java_lang_String__J =
475                 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
476
477         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
478
479         utf_java_lang_Object__java_lang_Object =
480                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
481
482         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
483
484         utf_java_lang_String__java_lang_Class =
485                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
486
487         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
488
489         utf_java_lang_Thread_java_lang_Throwable__V =
490                 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
491
492         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
493
494         utf_java_lang_Throwable__java_lang_Throwable =
495                 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
496
497         utf_null                       = utf_new_char("null");
498         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
499         array_packagename              = utf_new_char("\t<the array package>");
500
501         /* everything's ok */
502
503         return true;
504 }
505
506
/* utf_hashkey *****************************************************************

   Computes the hashkey of a utf-text from a length-dependent selection
   of at most 8 of its bytes.  Every selected byte is shifted left by a
   fixed amount and all of them are XORed together.

   Texts of up to 8 bytes use every byte, texts of 9 to 15 bytes use a
   fixed subset per length.  For utf-symbols longer than 15 bytes 3
   bytes are taken from the beginning and the end, 2 bytes are taken
   from the middle.

   All bytes are addressed by explicit index.  An earlier formulation
   combined several `*(++text)' reads in a single expression; the
   increments in such an expression are unsequenced, which is undefined
   behaviour in C.  The indices used below are exactly the ones a
   left-to-right evaluation of that formulation selects.

   The bytes are read through a plain `char', so where char is signed a
   byte >= 0x80 is sign-extended by the conversion to u4.

   IN:
       text.....the bytes to hash (at least `length' bytes readable)
       length...number of bytes in text

   RETURN VALUE:
       the hashkey; 0 for an empty text

*******************************************************************************/

/* NOTE(review): utf_hashkey no longer uses these two macros.  They are
   kept only in case code further down in this file still refers to
   them.  Never use more than one of them in a single expression: each
   nbs() modifies `text' and the evaluation order is unspecified. */

#define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
#define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */

/* byte at index idx of text, left shifted by val */

#define UTF_HASHKEY_BYTE(idx, val) ((u4) text[(idx)] << (val))

u4 utf_hashkey(const char *text, u4 length)
{
#define B(idx, val) UTF_HASHKEY_BYTE(idx, val)

	u4 mid;                             /* index of the middle byte           */

	switch (length) {
	case 0: /* empty string */
		return 0;

	case 1:
		return B(0, 0);

	case 2:
		return B(0, 0) ^ B(1, 3);

	case 3:
		return B(0, 0) ^ B(1, 3) ^ B(2, 5);

	case 4:
		return B(0, 0) ^ B(1, 2) ^ B(2, 4) ^ B(3, 6);

	case 5:
		return B(0, 0) ^ B(1, 2) ^ B(2, 3) ^ B(3, 4) ^ B(4, 6);

	case 6:
		return B(0, 0) ^ B(1, 1) ^ B(2, 2) ^ B(3, 3) ^ B(4, 5) ^ B(5, 6);

	case 7:
		return B(0, 0) ^ B(1, 1) ^ B(2, 2) ^ B(3, 3) ^ B(4, 4) ^ B(5, 5)
			^ B(6, 6);

	case 8:
		return B(0, 0) ^ B(1, 1) ^ B(2, 2) ^ B(3, 3) ^ B(4, 4) ^ B(5, 5)
			^ B(6, 6) ^ B(7, 7);

	case 9: /* byte 3 is skipped */
		return B(0, 0) ^ B(1, 1) ^ B(2, 2)
			^ B(4, 4) ^ B(5, 5) ^ B(6, 6) ^ B(7, 7) ^ B(8, 8);

	case 10: /* bytes 1 and 5 are skipped */
		return B(0, 0)
			^ B(2, 2) ^ B(3, 3) ^ B(4, 4)
			^ B(6, 6) ^ B(7, 7) ^ B(8, 8) ^ B(9, 9);

	case 11: /* bytes 1 and 5 are skipped, 9 bytes are used */
		return B(0, 0)
			^ B(2, 2) ^ B(3, 3) ^ B(4, 4)
			^ B(6, 6) ^ B(7, 7) ^ B(8, 8) ^ B(9, 9) ^ B(10, 10);

	case 12: /* bytes 1, 2, 5 and 9 are skipped */
		return B(0, 0)
			^ B(3, 2) ^ B(4, 3)
			^ B(6, 5) ^ B(7, 6) ^ B(8, 7)
			^ B(10, 9) ^ B(11, 10);

	case 13: /* bytes 2, 5, 6, 9 and 10 are skipped */
		return B(0, 0) ^ B(1, 1)
			^ B(3, 3) ^ B(4, 4)
			^ B(7, 7) ^ B(8, 8)
			^ B(11, 9) ^ B(12, 10);

	case 14: /* bytes 1, 2, 5, 6, 9 and 10 are skipped */
	case 15: /* same selection as for 14; the last byte is not used */
		return B(0, 0)
			^ B(3, 3) ^ B(4, 4)
			^ B(7, 7) ^ B(8, 8)
			^ B(11, 9) ^ B(12, 10) ^ B(13, 11);

	default:
		mid = length / 2;

		return B(0, 0) ^ B(3, 3) ^ B(4, 4)      /* 3 bytes from beginning */
			^ B(mid, 5) ^ B(mid + 3, 6)         /* 2 bytes from middle    */
			^ B(length - 4, 7)                  /* 3 bytes from end       */
			^ B(length - 2, 10) ^ B(length - 1, 11);
	}

#undef B
}
629
/* utf_full_hashkey ************************************************************

   Computes a hash value in which every one of the `length' bytes of
   the text takes part (unlike utf_hashkey, which samples only some of
   them).  The bytes are consumed as unsigned values, front to back.

   The algorithm is the "One-at-a-time" algorithm as published
   by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.

   IN:
       text.....the bytes to hash
       length...number of bytes in text

   RETURN VALUE:
       the hash value; 0 for an empty text

*******************************************************************************/

u4 utf_full_hashkey(const char *text, u4 length)
{
	const unsigned char *cursor = (const unsigned char *) text;
	const unsigned char *end    = cursor + length;
	u4                   acc    = 0;

	/* mixing step, once per input byte */

	while (cursor != end) {
		acc += *cursor;
		acc += acc << 10;
		acc ^= acc >> 6;
		cursor++;
	}

	/* final avalanche */

	acc += acc << 3;
	acc ^= acc >> 11;
	acc += acc << 15;

	return acc;
}
658
659 /* unicode_hashkey *************************************************************
660
661    Compute the hashkey of a unicode string.
662
663 *******************************************************************************/
664
665 u4 unicode_hashkey(u2 *text, u2 len)
666 {
667         return utf_hashkey((char *) text, len);
668 }
669
670
671 /* utf_new *********************************************************************
672
673    Creates a new utf-symbol, the text of the symbol is passed as a
674    u1-array. The function searches the utf-hashtable for a utf-symbol
675    with this text. On success the element returned, otherwise a new
676    hashtable element is created.
677
678    If the number of entries in the hashtable exceeds twice the size of
679    the hashtable slots a reorganization of the hashtable is done and
680    the utf symbols are copied to a new hashtable with doubled size.
681
682 *******************************************************************************/
683
684 utf *utf_new(const char *text, u2 length)
685 {
686         u4 key;                             /* hashkey computed from utf-text     */
687         u4 slot;                            /* slot in hashtable                  */
688         utf *u;                             /* hashtable element                  */
689         u2 i;
690
691         LOCK_MONITOR_ENTER(hashtable_utf->header);
692
693 #if defined(ENABLE_STATISTICS)
694         if (opt_stat)
695                 count_utf_new++;
696 #endif
697
698         key  = utf_hashkey(text, length);
699         slot = key & (hashtable_utf->size - 1);
700         u    = hashtable_utf->ptr[slot];
701
702         /* search external hash chain for utf-symbol */
703
704         while (u) {
705                 if (u->blength == length) {
706                         /* compare text of hashtable elements */
707
708                         for (i = 0; i < length; i++)
709                                 if (text[i] != u->text[i])
710                                         goto nomatch;
711                         
712 #if defined(ENABLE_STATISTICS)
713                         if (opt_stat)
714                                 count_utf_new_found++;
715 #endif
716
717                         /* symbol found in hashtable */
718
719                         LOCK_MONITOR_EXIT(hashtable_utf->header);
720
721                         return u;
722                 }
723
724         nomatch:
725                 u = u->hashlink; /* next element in external chain */
726         }
727
728         /* location in hashtable found, create new utf element */
729
730         u = NEW(utf);
731
732         u->blength  = length;               /* length in bytes of utfstring       */
733         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
734         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
735
736         memcpy(u->text, text, length);      /* copy utf-text                      */
737         u->text[length] = '\0';
738
739 #if defined(ENABLE_STATISTICS)
740         if (opt_stat)
741                 count_utf_len += sizeof(utf) + length + 1;
742 #endif
743
744         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
745         hashtable_utf->entries++;           /* update number of entries           */
746
747         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
748
749         /* reorganization of hashtable, average length of the external
750            chains is approx. 2 */
751
752                 hashtable *newhash;                              /* the new hashtable */
753                 u4         i;
754                 utf       *u;
755                 utf       *nextu;
756                 u4         slot;
757
758                 /* create new hashtable, double the size */
759
760                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
761
762 #if defined(ENABLE_STATISTICS)
763                 if (opt_stat)
764                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
765 #endif
766
767                 /* transfer elements to new hashtable */
768
769                 for (i = 0; i < hashtable_utf->size; i++) {
770                         u = hashtable_utf->ptr[i];
771
772                         while (u) {
773                                 nextu = u->hashlink;
774                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
775                                                 
776                                 u->hashlink = (utf *) newhash->ptr[slot];
777                                 newhash->ptr[slot] = u;
778
779                                 /* follow link in external hash chain */
780
781                                 u = nextu;
782                         }
783                 }
784         
785                 /* dispose old table */
786
787                 hashtable_free(hashtable_utf);
788
789                 hashtable_utf = newhash;
790         }
791
792         LOCK_MONITOR_EXIT(hashtable_utf->header);
793
794         return u;
795 }
796
797
798 /* utf_new_u2 ******************************************************************
799
800    Make utf symbol from u2 array, if isclassname is true '.' is
801    replaced by '/'.
802
803 *******************************************************************************/
804
805 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
806 {
807         char *buffer;                   /* memory buffer for  unicode characters  */
808         char *pos;                      /* pointer to current position in buffer  */
809         u4 left;                        /* unicode characters left                */
810         u4 buflength;                   /* utf length in bytes of the u2 array    */
811         utf *result;                    /* resulting utf-string                   */
812         int i;          
813
814         /* determine utf length in bytes and allocate memory */
815
816         buflength = u2_utflength(unicode_pos, unicode_length); 
817         buffer    = MNEW(char, buflength);
818  
819         left = buflength;
820         pos  = buffer;
821
822         for (i = 0; i++ < unicode_length; unicode_pos++) {
823                 /* next unicode character */
824                 u2 c = *unicode_pos;
825                 
826                 if ((c != 0) && (c < 0x80)) {
827                         /* 1 character */       
828                         left--;
829                 if ((int) left < 0) break;
830                         /* convert classname */
831                         if (isclassname && c == '.')
832                                 *pos++ = '/';
833                         else
834                                 *pos++ = (char) c;
835
836                 } else if (c < 0x800) {             
837                         /* 2 characters */                              
838                 unsigned char high = c >> 6;
839                 unsigned char low  = c & 0x3F;
840                         left = left - 2;
841                 if ((int) left < 0) break;
842                 *pos++ = high | 0xC0; 
843                 *pos++ = low  | 0x80;     
844
845                 } else {         
846                 /* 3 characters */                              
847                 char low  = c & 0x3f;
848                 char mid  = (c >> 6) & 0x3F;
849                 char high = c >> 12;
850                         left = left - 3;
851                 if ((int) left < 0) break;
852                 *pos++ = high | 0xE0; 
853                 *pos++ = mid  | 0x80;  
854                 *pos++ = low  | 0x80;   
855                 }
856         }
857         
858         /* insert utf-string into symbol-table */
859         result = utf_new(buffer,buflength);
860
861         MFREE(buffer, char, buflength);
862
863         return result;
864 }
865
866
867 /* utf_new_char ****************************************************************
868
869    Creates a new utf symbol, the text for this symbol is passed as a
870    c-string ( = char* ).
871
872 *******************************************************************************/
873
874 utf *utf_new_char(const char *text)
875 {
876         return utf_new(text, strlen(text));
877 }
878
879
880 /* utf_new_char_classname ******************************************************
881
882    Creates a new utf symbol, the text for this symbol is passed as a
883    c-string ( = char* ) "." characters are going to be replaced by
884    "/". Since the above function is used often, this is a separte
885    function, instead of an if.
886
887 *******************************************************************************/
888
889 utf *utf_new_char_classname(const char *text)
890 {
891         if (strchr(text, '.')) {
892                 char *txt = strdup(text);
893                 char *end = txt + strlen(txt);
894                 char *c;
895                 utf *tmpRes;
896
897                 for (c = txt; c < end; c++)
898                         if (*c == '.') *c = '/';
899
900                 tmpRes = utf_new(txt, strlen(txt));
901                 FREE(txt, 0);
902
903                 return tmpRes;
904
905         } else
906                 return utf_new(text, strlen(text));
907 }
908
909
910 /* utf_nextu2 ******************************************************************
911
912    Read the next unicode character from the utf string and increment
913    the utf-string pointer accordingly.
914
915    CAUTION: This function is unsafe for input that was not checked 
916             by is_valid_utf!
917
918 *******************************************************************************/
919
920 u2 utf_nextu2(char **utf_ptr)
921 {
922     /* uncompressed unicode character */
923     u2 unicode_char = 0;
924     /* current position in utf text */  
925     unsigned char *utf = (unsigned char *) (*utf_ptr);
926     /* bytes representing the unicode character */
927     unsigned char ch1, ch2, ch3;
928     /* number of bytes used to represent the unicode character */
929     int len = 0;
930         
931     switch ((ch1 = utf[0]) >> 4) {
932         default: /* 1 byte */
933                 (*utf_ptr)++;
934                 return (u2) ch1;
935         case 0xC: 
936         case 0xD: /* 2 bytes */
937                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
938                         unsigned char high = ch1 & 0x1F;
939                         unsigned char low  = ch2 & 0x3F;
940                         unicode_char = (high << 6) + low;
941                         len = 2;
942                 }
943                 break;
944
945         case 0xE: /* 2 or 3 bytes */
946                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
947                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
948                                 unsigned char low  = ch3 & 0x3f;
949                                 unsigned char mid  = ch2 & 0x3f;
950                                 unsigned char high = ch1 & 0x0f;
951                                 unicode_char = (((high << 6) + mid) << 6) + low;
952                                 len = 3;
953                         } else
954                                 len = 2;                                           
955                 }
956                 break;
957     }
958
959     /* update position in utf-text */
960     *utf_ptr = (char *) (utf + len);
961
962     return unicode_char;
963 }
964
965
966 /* utf_bytes *******************************************************************
967
968    Determine number of bytes (aka. octets) in the utf string.
969
970    IN:
971       u............utf string
972
973    OUT:
974       The number of octets of this utf string.
975           There is _no_ terminating zero included in this count.
976
977 *******************************************************************************/
978
979 u4 utf_bytes(utf *u)
980 {
981         return u->blength;
982 }
983
984
985 /* utf_get_number_of_u2s_for_buffer ********************************************
986
987    Determine number of UTF-16 u2s in the given UTF-8 buffer
988
989    CAUTION: This function is unsafe for input that was not checked 
990             by is_valid_utf!
991
992    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
993    to an array of u2s (UTF-16) and want to know how many of them you will get.
994    All other uses of this function are probably wrong.
995
996    IN:
997       buffer........points to first char in buffer
998           blength.......number of _bytes_ in the buffer
999
1000    OUT:
1001       the number of u2s needed to hold this string in UTF-16 encoding.
1002           There is _no_ terminating zero included in this count.
1003
1004    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
1005    exception.
1006
1007 *******************************************************************************/
1008
1009 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
1010 {
1011         const char *endpos;                 /* points behind utf string           */
1012         const char *utf_ptr;                /* current position in utf text       */
1013         u4 len = 0;                         /* number of unicode characters       */
1014
1015         utf_ptr = buffer;
1016         endpos = utf_ptr + blength;
1017
1018         while (utf_ptr < endpos) {
1019                 len++;
1020                 /* next unicode character */
1021                 utf_nextu2((char **)&utf_ptr);
1022         }
1023
1024         assert(utf_ptr == endpos);
1025
1026         return len;
1027 }
1028
1029
1030 /* utf_get_number_of_u2s *******************************************************
1031
1032    Determine number of UTF-16 u2s in the utf string.
1033
1034    CAUTION: This function is unsafe for input that was not checked 
1035             by is_valid_utf!
1036
1037    CAUTION: Use this function *only* when you want to convert a utf string
1038    to an array of u2s and want to know how many of them you will get.
1039    All other uses of this function are probably wrong.
1040
1041    IN:
1042       u............utf string
1043
1044    OUT:
1045       the number of u2s needed to hold this string in UTF-16 encoding.
1046           There is _no_ terminating zero included in this count.
1047           XXX 0 if a NullPointerException has been thrown (see below)
1048
1049 *******************************************************************************/
1050
1051 u4 utf_get_number_of_u2s(utf *u)
1052 {
1053         char *endpos;                       /* points behind utf string           */
1054         char *utf_ptr;                      /* current position in utf text       */
1055         u4 len = 0;                         /* number of unicode characters       */
1056
1057         /* XXX this is probably not checked by most callers! Review this after */
1058         /* the invalid uses of this function have been eliminated */
1059         if (u == NULL) {
1060                 exceptions_throw_nullpointerexception();
1061                 return 0;
1062         }
1063
1064         endpos = UTF_END(u);
1065         utf_ptr = u->text;
1066
1067         while (utf_ptr < endpos) {
1068                 len++;
1069                 /* next unicode character */
1070                 utf_nextu2(&utf_ptr);
1071         }
1072
1073         if (utf_ptr != endpos) {
1074                 /* string ended abruptly */
1075                 exceptions_throw_internalerror("Illegal utf8 string");
1076                 return 0;
1077         }
1078
1079         return len;
1080 }
1081
1082
1083 /* utf8_safe_number_of_u2s *****************************************************
1084
1085    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1086    (For invalid UTF-8 the U+fffd replacement character will be counted.)
1087
1088    This function is safe even for invalid UTF-8 strings.
1089
1090    IN:
1091       text..........zero-terminated(!) UTF-8 string (may be invalid)
1092                         must NOT be NULL
1093           nbytes........strlen(text). (This is needed to completely emulate
1094                         the RI).
1095
1096    OUT:
1097       the number of u2s needed to hold this string in UTF-16 encoding.
1098           There is _no_ terminating zero included in this count.
1099
1100 *******************************************************************************/
1101
1102 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1103         register const unsigned char *t;
1104         register s4 byte;
1105         register s4 len;
1106         register const unsigned char *tlimit;
1107         s4 byte1;
1108         s4 byte2;
1109         s4 byte3;
1110         s4 value;
1111         s4 skip;
1112
1113         assert(text);
1114         assert(nbytes >= 0);
1115
1116         len = 0;
1117         t = (const unsigned char *) text;
1118         tlimit = t + nbytes;
1119
1120         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1121
1122         while (1) {
1123                 byte = *t++;
1124
1125                 if (byte & 0x80) {
1126                         /* highest bit set, non-ASCII character */
1127
1128                         if ((byte & 0xe0) == 0xc0) {
1129                                 /* 2-byte: should be 110..... 10...... ? */
1130
1131                                 if ((*t++ & 0xc0) == 0x80)
1132                                         ; /* valid 2-byte */
1133                                 else
1134                                         t--; /* invalid */
1135                         }
1136                         else if ((byte & 0xf0) == 0xe0) {
1137                                 /* 3-byte: should be 1110.... 10...... 10...... */
1138                                 /*                            ^t                */
1139
1140                                 if (t + 2 > tlimit)
1141                                         return len + 1; /* invalid, stop here */
1142
1143                                 if ((*t++ & 0xc0) == 0x80) {
1144                                         if ((*t++ & 0xc0) == 0x80)
1145                                                 ; /* valid 3-byte */
1146                                         else
1147                                                 t--; /* invalid */
1148                                 }
1149                                 else
1150                                         t--; /* invalid */
1151                         }
1152                         else if ((byte & 0xf8) == 0xf0) {
1153                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1154                                 /*                            ^t                         */
1155
1156                                 if (t + 3 > tlimit)
1157                                         return len + 1; /* invalid, stop here */
1158
1159                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1160                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1161                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1162                                                         /* valid 4-byte UTF-8? */
1163                                                         value = ((byte  & 0x07) << 18)
1164                                                                   | ((byte1 & 0x3f) << 12)
1165                                                                   | ((byte2 & 0x3f) <<  6)
1166                                                                   | ((byte3 & 0x3f)      );
1167
1168                                                         if (value > 0x10FFFF)
1169                                                                 ; /* invalid */
1170                                                         else if (value > 0xFFFF)
1171                                                                 len += 1; /* we need surrogates */
1172                                                         else
1173                                                                 ; /* 16bit suffice */
1174                                                 }
1175                                                 else
1176                                                         t--; /* invalid */
1177                                         }
1178                                         else
1179                                                 t--; /* invalid */
1180                                 }
1181                                 else
1182                                         t--; /* invalid */
1183                         }
1184                         else if ((byte & 0xfc) == 0xf8) {
1185                                 /* invalid 5-byte */
1186                                 if (t + 4 > tlimit)
1187                                         return len + 1; /* invalid, stop here */
1188
1189                                 skip = 4;
1190                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1191                                         t++;
1192                         }
1193                         else if ((byte & 0xfe) == 0xfc) {
1194                                 /* invalid 6-byte */
1195                                 if (t + 5 > tlimit)
1196                                         return len + 1; /* invalid, stop here */
1197
1198                                 skip = 5;
1199                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1200                                         t++;
1201                         }
1202                         else
1203                                 ; /* invalid */
1204                 }
1205                 else {
1206                         /* NUL */
1207
1208                         if (byte == 0)
1209                                 break;
1210
1211                         /* ASCII character, common case */
1212                 }
1213
1214                 len++;
1215         }
1216
1217         return len;
1218 }
1219
1220
1221 /* utf8_safe_convert_to_u2s ****************************************************
1222
1223    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1224    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1225    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1226
1227    This function is safe even for invalid UTF-8 strings.
1228
1229    IN:
1230       text..........zero-terminated(!) UTF-8 string (may be invalid)
1231                         must NOT be NULL
1232           nbytes........strlen(text). (This is needed to completely emulate
1233                                         the RI).
1234           buffer........a preallocated array of u2s to receive the decoded
1235                         string. Use utf8_safe_number_of_u2s to get the
1236                                         required number of u2s for allocating this.
1237
1238 *******************************************************************************/
1239
1240 #define UNICODE_REPLACEMENT  0xfffd
1241
1242 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1243         register const unsigned char *t;
1244         register s4 byte;
1245         register const unsigned char *tlimit;
1246         s4 byte1;
1247         s4 byte2;
1248         s4 byte3;
1249         s4 value;
1250         s4 skip;
1251
1252         assert(text);
1253         assert(nbytes >= 0);
1254
1255         t = (const unsigned char *) text;
1256         tlimit = t + nbytes;
1257
1258         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1259
1260         while (1) {
1261                 byte = *t++;
1262
1263                 if (byte & 0x80) {
1264                         /* highest bit set, non-ASCII character */
1265
1266                         if ((byte & 0xe0) == 0xc0) {
1267                                 /* 2-byte: should be 110..... 10...... */
1268
1269                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1270                                         /* valid 2-byte UTF-8 */
1271                                         *buffer++ = ((byte  & 0x1f) << 6)
1272                                                           | ((byte1 & 0x3f)     );
1273                                 }
1274                                 else {
1275                                         *buffer++ = UNICODE_REPLACEMENT;
1276                                         t--;
1277                                 }
1278                         }
1279                         else if ((byte & 0xf0) == 0xe0) {
1280                                 /* 3-byte: should be 1110.... 10...... 10...... */
1281
1282                                 if (t + 2 > tlimit) {
1283                                         *buffer++ = UNICODE_REPLACEMENT;
1284                                         return;
1285                                 }
1286
1287                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1288                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1289                                                 /* valid 3-byte UTF-8 */
1290                                                 *buffer++ = ((byte  & 0x0f) << 12)
1291                                                                   | ((byte1 & 0x3f) <<  6)
1292                                                                   | ((byte2 & 0x3f)      );
1293                                         }
1294                                         else {
1295                                                 *buffer++ = UNICODE_REPLACEMENT;
1296                                                 t--;
1297                                         }
1298                                 }
1299                                 else {
1300                                         *buffer++ = UNICODE_REPLACEMENT;
1301                                         t--;
1302                                 }
1303                         }
1304                         else if ((byte & 0xf8) == 0xf0) {
1305                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1306
1307                                 if (t + 3 > tlimit) {
1308                                         *buffer++ = UNICODE_REPLACEMENT;
1309                                         return;
1310                                 }
1311
1312                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1313                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1314                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1315                                                         /* valid 4-byte UTF-8? */
1316                                                         value = ((byte  & 0x07) << 18)
1317                                                                   | ((byte1 & 0x3f) << 12)
1318                                                                   | ((byte2 & 0x3f) <<  6)
1319                                                                   | ((byte3 & 0x3f)      );
1320
1321                                                         if (value > 0x10FFFF) {
1322                                                                 *buffer++ = UNICODE_REPLACEMENT;
1323                                                         }
1324                                                         else if (value > 0xFFFF) {
1325                                                                 /* we need surrogates */
1326                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1327                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1328                                                         }
1329                                                         else
1330                                                                 *buffer++ = value; /* 16bit suffice */
1331                                                 }
1332                                                 else {
1333                                                         *buffer++ = UNICODE_REPLACEMENT;
1334                                                         t--;
1335                                                 }
1336                                         }
1337                                         else {
1338                                                 *buffer++ = UNICODE_REPLACEMENT;
1339                                                 t--;
1340                                         }
1341                                 }
1342                                 else {
1343                                         *buffer++ = UNICODE_REPLACEMENT;
1344                                         t--;
1345                                 }
1346                         }
1347                         else if ((byte & 0xfc) == 0xf8) {
1348                                 if (t + 4 > tlimit) {
1349                                         *buffer++ = UNICODE_REPLACEMENT;
1350                                         return;
1351                                 }
1352
1353                                 skip = 4;
1354                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1355                                         t++;
1356                                 *buffer++ = UNICODE_REPLACEMENT;
1357                         }
1358                         else if ((byte & 0xfe) == 0xfc) {
1359                                 if (t + 5 > tlimit) {
1360                                         *buffer++ = UNICODE_REPLACEMENT;
1361                                         return;
1362                                 }
1363
1364                                 skip = 5;
1365                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1366                                         t++;
1367                                 *buffer++ = UNICODE_REPLACEMENT;
1368                         }
1369                         else
1370                                 *buffer++ = UNICODE_REPLACEMENT;
1371                 }
1372                 else {
1373                         /* NUL */
1374
1375                         if (byte == 0)
1376                                 break;
1377
1378                         /* ASCII character, common case */
1379
1380                         *buffer++ = byte;
1381                 }
1382         }
1383 }
1384
1385
1386 /* u2_utflength ****************************************************************
1387
1388    Returns the utf length in bytes of a u2 array.
1389
1390 *******************************************************************************/
1391
1392 u4 u2_utflength(u2 *text, u4 u2_length)
1393 {
1394         u4 result_len = 0;                  /* utf length in bytes                */
1395         u2 ch;                              /* current unicode character          */
1396         u4 len;
1397         
1398         for (len = 0; len < u2_length; len++) {
1399                 /* next unicode character */
1400                 ch = *text++;
1401           
1402                 /* determine bytes required to store unicode character as utf */
1403                 if (ch && (ch < 0x80)) 
1404                         result_len++;
1405                 else if (ch < 0x800)
1406                         result_len += 2;        
1407                 else 
1408                         result_len += 3;        
1409         }
1410
1411     return result_len;
1412 }
1413
1414
1415 /* utf_copy ********************************************************************
1416
1417    Copy the given utf string byte-for-byte to a buffer.
1418
1419    IN:
1420       buffer.......the buffer
1421           u............the utf string
1422
1423 *******************************************************************************/
1424
1425 void utf_copy(char *buffer, utf *u)
1426 {
1427         /* our utf strings are zero-terminated (done by utf_new) */
1428         MCOPY(buffer, u->text, char, u->blength + 1);
1429 }
1430
1431
1432 /* utf_cat *********************************************************************
1433
1434    Append the given utf string byte-for-byte to a buffer.
1435
1436    IN:
1437       buffer.......the buffer
1438           u............the utf string
1439
1440 *******************************************************************************/
1441
1442 void utf_cat(char *buffer, utf *u)
1443 {
1444         /* our utf strings are zero-terminated (done by utf_new) */
1445         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1446 }
1447
1448
1449 /* utf_copy_classname **********************************************************
1450
1451    Copy the given utf classname byte-for-byte to a buffer.
1452    '/' is replaced by '.'
1453
1454    IN:
1455       buffer.......the buffer
1456           u............the utf string
1457
1458 *******************************************************************************/
1459
1460 void utf_copy_classname(char *buffer, utf *u)
1461 {
1462         char *bufptr;
1463         char *srcptr;
1464         char *endptr;
1465         char ch;
1466
1467         bufptr = buffer;
1468         srcptr = u->text;
1469         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1470
1471         while (srcptr != endptr) {
1472                 ch = *srcptr++;
1473                 if (ch == '/')
1474                         ch = '.';
1475                 *bufptr++ = ch;
1476         }
1477 }
1478
1479
1480 /* utf_cat *********************************************************************
1481
1482    Append the given utf classname byte-for-byte to a buffer.
1483    '/' is replaced by '.'
1484
1485    IN:
1486       buffer.......the buffer
1487           u............the utf string
1488
1489 *******************************************************************************/
1490
1491 void utf_cat_classname(char *buffer, utf *u)
1492 {
1493         utf_copy_classname(buffer + strlen(buffer), u);
1494 }
1495
1496 /* utf_display_printable_ascii *************************************************
1497
1498    Write utf symbol to stdout (for debugging purposes).
1499    Non-printable and non-ASCII characters are printed as '?'.
1500
1501 *******************************************************************************/
1502
1503 void utf_display_printable_ascii(utf *u)
1504 {
1505         char *endpos;                       /* points behind utf string           */
1506         char *utf_ptr;                      /* current position in utf text       */
1507
1508         if (u == NULL) {
1509                 printf("NULL");
1510                 fflush(stdout);
1511                 return;
1512         }
1513
1514         endpos = UTF_END(u);
1515         utf_ptr = u->text;
1516
1517         while (utf_ptr < endpos) {
1518                 /* read next unicode character */
1519
1520                 u2 c = utf_nextu2(&utf_ptr);
1521
1522                 if ((c >= 32) && (c <= 127))
1523                         printf("%c", c);
1524                 else
1525                         printf("?");
1526         }
1527
1528         fflush(stdout);
1529 }
1530
1531
1532 /* utf_display_printable_ascii_classname ***************************************
1533
1534    Write utf symbol to stdout with `/' converted to `.' (for debugging
1535    purposes).
1536    Non-printable and non-ASCII characters are printed as '?'.
1537
1538 *******************************************************************************/
1539
1540 void utf_display_printable_ascii_classname(utf *u)
1541 {
1542         char *endpos;                       /* points behind utf string           */
1543         char *utf_ptr;                      /* current position in utf text       */
1544
1545         if (u == NULL) {
1546                 printf("NULL");
1547                 fflush(stdout);
1548                 return;
1549         }
1550
1551         endpos = UTF_END(u);
1552         utf_ptr = u->text;
1553
1554         while (utf_ptr < endpos) {
1555                 /* read next unicode character */
1556
1557                 u2 c = utf_nextu2(&utf_ptr);
1558
1559                 if (c == '/')
1560                         c = '.';
1561
1562                 if ((c >= 32) && (c <= 127))
1563                         printf("%c", c);
1564                 else
1565                         printf("?");
1566         }
1567
1568         fflush(stdout);
1569 }
1570
1571
1572 /* utf_sprint_convert_to_latin1 ************************************************
1573         
1574    Write utf symbol into c-string (for debugging purposes).
1575    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1576    invalid results.
1577
1578 *******************************************************************************/
1579
1580 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1581 {
1582         char *endpos;                       /* points behind utf string           */
1583         char *utf_ptr;                      /* current position in utf text       */
1584         u2 pos = 0;                         /* position in c-string               */
1585
1586         if (!u) {
1587                 strcpy(buffer, "NULL");
1588                 return;
1589         }
1590
1591         endpos = UTF_END(u);
1592         utf_ptr = u->text;
1593
1594         while (utf_ptr < endpos) 
1595                 /* copy next unicode character */       
1596                 buffer[pos++] = utf_nextu2(&utf_ptr);
1597
1598         /* terminate string */
1599         buffer[pos] = '\0';
1600 }
1601
1602
1603 /* utf_sprint_convert_to_latin1_classname **************************************
1604         
1605    Write utf symbol into c-string with `/' converted to `.' (for debugging
1606    purposes).
1607    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1608    invalid results.
1609
1610 *******************************************************************************/
1611
1612 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1613 {
1614         char *endpos;                       /* points behind utf string           */
1615         char *utf_ptr;                      /* current position in utf text       */
1616         u2 pos = 0;                         /* position in c-string               */
1617
1618         if (!u) {
1619                 strcpy(buffer, "NULL");
1620                 return;
1621         }
1622
1623         endpos = UTF_END(u);
1624         utf_ptr = u->text;
1625
1626         while (utf_ptr < endpos) {
1627                 /* copy next unicode character */       
1628                 u2 c = utf_nextu2(&utf_ptr);
1629                 if (c == '/') c = '.';
1630                 buffer[pos++] = c;
1631         }
1632
1633         /* terminate string */
1634         buffer[pos] = '\0';
1635 }
1636
1637
1638 /* utf_strcat_convert_to_latin1 ************************************************
1639         
1640    Like libc strcat, but uses an utf8 string.
1641    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1642    invalid results.
1643
1644 *******************************************************************************/
1645
1646 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1647 {
1648         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1649 }
1650
1651
1652 /* utf_strcat_convert_to_latin1_classname **************************************
1653         
1654    Like libc strcat, but uses an utf8 string.
1655    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1656    invalid results.
1657
1658 *******************************************************************************/
1659
1660 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1661 {
1662         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1663 }
1664
1665
1666 /* utf_fprint_printable_ascii **************************************************
1667         
1668    Write utf symbol into file.
1669    Non-printable and non-ASCII characters are printed as '?'.
1670
1671 *******************************************************************************/
1672
1673 void utf_fprint_printable_ascii(FILE *file, utf *u)
1674 {
1675         char *endpos;                       /* points behind utf string           */
1676         char *utf_ptr;                      /* current position in utf text       */
1677
1678         if (!u)
1679                 return;
1680
1681         endpos = UTF_END(u);
1682         utf_ptr = u->text;
1683
1684         while (utf_ptr < endpos) { 
1685                 /* read next unicode character */                
1686                 u2 c = utf_nextu2(&utf_ptr);                            
1687
1688                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1689                 else fprintf(file, "?");
1690         }
1691 }
1692
1693
1694 /* utf_fprint_printable_ascii_classname ****************************************
1695         
1696    Write utf symbol into file with `/' converted to `.'.
1697    Non-printable and non-ASCII characters are printed as '?'.
1698
1699 *******************************************************************************/
1700
1701 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1702 {
1703         char *endpos;                       /* points behind utf string           */
1704         char *utf_ptr;                      /* current position in utf text       */
1705
1706     if (!u)
1707                 return;
1708
1709         endpos = UTF_END(u);
1710         utf_ptr = u->text;
1711
1712         while (utf_ptr < endpos) { 
1713                 /* read next unicode character */                
1714                 u2 c = utf_nextu2(&utf_ptr);                            
1715                 if (c == '/') c = '.';
1716
1717                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1718                 else fprintf(file, "?");
1719         }
1720 }
1721
1722
/* is_valid_utf ****************************************************************

   Return true if the given byte range is an acceptable UTF-8 string.

   Accepted are 1-byte sequences (except the byte 0x00) as well as 2-byte
   and 3-byte sequences whose trailing bytes all have the form 10xx xxxx.
   Longer sequences are rejected (Java limitation).  The value zero may
   only appear in its 2-byte form (0xc0 0x80); a 3-byte encoding of zero
   is rejected.

   Deliberately NOT rejected, because Java classfiles seem to contain
   them: other overlong encodings, surrogates (0xd800..0xdfff) and the
   codepoints 0xfffe and 0xffff.

   An empty range is valid; a range with end_pos < utf_ptr is not.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	unsigned char *cur;                 /* next byte to examine               */
	unsigned char *limit;               /* points after last byte             */

	if (end_pos < utf_ptr)
		return false;

	cur   = (unsigned char *) utf_ptr;
	limit = (unsigned char *) end_pos;

	while (cur < limit) {
		unsigned char lead = *cur++;    /* first byte of the sequence         */
		unsigned long value;            /* decoded value of the sequence      */
		int           trailing;         /* number of bytes following lead     */
		int           k;

		if (lead == 0x00)
			return false;               /* 0x00 is not allowed                */

		if (lead < 0x80)
			continue;                   /* ASCII                              */

		/* classify the leading byte by its upper four bits */

		switch (lead >> 4) {
		case 0xc:
		case 0xd:                       /* 110x xxxx                          */
			trailing = 1;
			value    = lead & 0x1f;
			break;

		case 0xe:                       /* 1110 xxxx                          */
			trailing = 2;
			value    = lead & 0x0f;
			break;

		default:
			/* stray trailing byte, a sequence of more than 3 bytes
			   (Java limitation), or an invalid leading byte */
			return false;
		}

		if ((limit - cur) < trailing)
			return false;               /* missing bytes                      */

		for (k = 0; k < trailing; k++) {
			unsigned char next = *cur++;

			if ((next & 0xc0) != 0x80)  /* 10xx xxxx                          */
				return false;

			value = (value << 6) | (next & 0x3f);
		}

		/* Java special: zero is only allowed as 0xc0 0x80 */

		if ((value == 0) && (trailing != 1))
			return false;
	}

	return true;
}
1788
1789
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters below 0x20 and the encoded zero 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names               */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters        */

		/* Disallow zero (encoded as 0xc0 0x80).  The second byte is only
		   inspected if it still lies inside the string, so that a 0xc0
		   at the very end never causes a read behind end_pos. */

		if ((c == 0xc0) &&
			(utf_ptr < end_pos) &&
			((unsigned char) *utf_ptr == 0x80))
			return false;
	}

	return true;
}
1817
1818 bool is_valid_name_utf(utf *u)
1819 {
1820         return is_valid_name(u->text, UTF_END(u));
1821 }
1822
1823
1824 /* utf_show ********************************************************************
1825
1826    Writes the utf symbols in the utfhash to stdout and displays the
1827    number of external hash chains grouped according to the chainlength
1828    (for debugging purposes).
1829
1830 *******************************************************************************/
1831
1832 #if !defined(NDEBUG)
1833 void utf_show(void)
1834 {
1835
1836 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1837
1838         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1839         u4 max_chainlength = 0;      /* maximum length of the chains */
1840         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1841         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1842         u4 i;
1843
1844         printf("UTF-HASH:\n");
1845
1846         /* show element of utf-hashtable */
1847
1848         for (i = 0; i < hashtable_utf->size; i++) {
1849                 utf *u = hashtable_utf->ptr[i];
1850
1851                 if (u) {
1852                         printf("SLOT %d: ", (int) i);
1853
1854                         while (u) {
1855                                 printf("'");
1856                                 utf_display_printable_ascii(u);
1857                                 printf("' ");
1858                                 u = u->hashlink;
1859                         }       
1860                         printf("\n");
1861                 }
1862         }
1863
1864         printf("UTF-HASH: %d slots for %d entries\n", 
1865                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1866
1867         if (hashtable_utf->entries == 0)
1868                 return;
1869
1870         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1871
1872         for (i=0;i<CHAIN_LIMIT;i++)
1873                 chain_count[i]=0;
1874
1875         /* count numbers of hashchains according to their length */
1876         for (i=0; i<hashtable_utf->size; i++) {
1877                   
1878                 utf *u = (utf*) hashtable_utf->ptr[i];
1879                 u4 chain_length = 0;
1880
1881                 /* determine chainlength */
1882                 while (u) {
1883                         u = u->hashlink;
1884                         chain_length++;
1885                 }
1886
1887                 /* update sum of all chainlengths */
1888                 sum_chainlength+=chain_length;
1889
1890                 /* determine the maximum length of the chains */
1891                 if (chain_length>max_chainlength)
1892                         max_chainlength = chain_length;
1893
1894                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1895                 if (chain_length>=CHAIN_LIMIT) {
1896                         beyond_limit+=chain_length;
1897                         chain_length=CHAIN_LIMIT-1;
1898                 }
1899
1900                 /* update number of hashchains of current length */
1901                 chain_count[chain_length]++;
1902         }
1903
1904         /* display results */  
1905         for (i=1;i<CHAIN_LIMIT-1;i++) 
1906                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1907           
1908         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1909
1910
1911         printf("max. chainlength:%5d\n",max_chainlength);
1912
1913         /* avg. chainlength = sum of chainlengths / number of chains */
1914         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1915 }
1916 #endif /* !defined(NDEBUG) */
1917
1918
1919 /*
1920  * These are local overrides for various environment variables in Emacs.
1921  * Please do not remove this and leave it at the end of the file, where
1922  * Emacs will automagically detect them.
1923  * ---------------------------------------------------------------------
1924  * Local variables:
1925  * mode: c
1926  * indent-tabs-mode: t
1927  * c-basic-offset: 4
1928  * tab-width: 4
1929  * End:
1930  * vim:noexpandtab:sw=4:ts=4:
1931  */