d9893ef004391c33cac2f2156235644129ffda5d
[cacao.git] / src / vmcore / utf8.c
1 /* src/vmcore/utf8.c - utf8 string functions
2
3    Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6    J. Wenninger, Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23    02110-1301, USA.
24
25    $Id: utf8.c 8132 2007-06-22 11:15:47Z twisti $
26
27 */
28
29
30 #include "config.h"
31
32 #include <string.h>
33 #include <assert.h>
34
35 #include "vm/types.h"
36
37 #include "mm/memory.h"
38
39 #include "threads/lock-common.h"
40
41 #include "toolbox/hashtable.h"
42
43 #include "vm/exceptions.h"
44
45 #include "vmcore/options.h"
46
47 #if defined(ENABLE_STATISTICS)
48 # include "vmcore/statistics.h"
49 #endif
50
51 #include "vmcore/utf8.h"
52
53
54 /* global variables ***********************************************************/
55
56 /* hashsize must be power of 2 */
57
58 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
59
60 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
61
62
63 /* utf-symbols for pointer comparison of frequently used strings **************/
64
65 utf *utf_java_lang_Object;
66
67 utf *utf_java_lang_Class;
68 utf *utf_java_lang_ClassLoader;
69 utf *utf_java_lang_Cloneable;
70 utf *utf_java_lang_SecurityManager;
71 utf *utf_java_lang_String;
72 utf *utf_java_lang_System;
73 utf *utf_java_lang_ThreadGroup;
74 utf *utf_java_lang_ref_SoftReference;
75 utf *utf_java_lang_ref_WeakReference;
76 utf *utf_java_lang_ref_PhantomReference;
77 utf *utf_java_io_Serializable;
78
79 utf *utf_java_lang_Throwable;
80 utf *utf_java_lang_Error;
81
82 utf *utf_java_lang_AbstractMethodError;
83 utf *utf_java_lang_ClassCircularityError;
84 utf *utf_java_lang_ClassFormatError;
85 utf *utf_java_lang_ExceptionInInitializerError;
86 utf *utf_java_lang_IncompatibleClassChangeError;
87 utf *utf_java_lang_InstantiationError;
88 utf *utf_java_lang_InternalError;
89 utf *utf_java_lang_LinkageError;
90 utf *utf_java_lang_NoClassDefFoundError;
91 utf *utf_java_lang_NoSuchFieldError;
92 utf *utf_java_lang_NoSuchMethodError;
93 utf *utf_java_lang_OutOfMemoryError;
94 utf *utf_java_lang_UnsatisfiedLinkError;
95 utf *utf_java_lang_UnsupportedClassVersionError;
96 utf *utf_java_lang_VerifyError;
97 utf *utf_java_lang_VirtualMachineError;
98
99 #if defined(WITH_CLASSPATH_GNU)
100 utf *utf_java_lang_VMThrowable;
101 #endif
102
103 utf *utf_java_lang_Exception;
104
105 utf *utf_java_lang_ArithmeticException;
106 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
107 utf *utf_java_lang_ArrayStoreException;
108 utf *utf_java_lang_ClassCastException;
109 utf *utf_java_lang_ClassNotFoundException;
110 utf *utf_java_lang_CloneNotSupportedException;
111 utf *utf_java_lang_IllegalAccessException;
112 utf *utf_java_lang_IllegalArgumentException;
113 utf *utf_java_lang_IllegalMonitorStateException;
114 utf *utf_java_lang_InstantiationException;
115 utf *utf_java_lang_InterruptedException;
116 utf *utf_java_lang_NegativeArraySizeException;
117 utf *utf_java_lang_NullPointerException;
118 utf *utf_java_lang_StringIndexOutOfBoundsException;
119
120 utf *utf_java_lang_reflect_InvocationTargetException;
121
122 utf *utf_java_security_PrivilegedActionException;
123
124 #if defined(ENABLE_JAVASE)
125 utf* utf_java_lang_Void;
126 #endif
127
128 utf* utf_java_lang_Boolean;
129 utf* utf_java_lang_Byte;
130 utf* utf_java_lang_Character;
131 utf* utf_java_lang_Short;
132 utf* utf_java_lang_Integer;
133 utf* utf_java_lang_Long;
134 utf* utf_java_lang_Float;
135 utf* utf_java_lang_Double;
136
137 #if defined(ENABLE_JAVASE)
138 utf *utf_java_lang_StackTraceElement;
139 utf *utf_java_lang_reflect_Constructor;
140 utf *utf_java_lang_reflect_Field;
141 utf *utf_java_lang_reflect_Method;
142 utf *utf_java_util_Vector;
143 #endif
144
145 utf *utf_InnerClasses;                  /* InnerClasses                       */
146 utf *utf_ConstantValue;                 /* ConstantValue                      */
147 utf *utf_Code;                          /* Code                               */
148 utf *utf_Exceptions;                    /* Exceptions                         */
149 utf *utf_LineNumberTable;               /* LineNumberTable                    */
150 utf *utf_SourceFile;                    /* SourceFile                         */
151
152 #if defined(ENABLE_JAVASE)
153 utf *utf_EnclosingMethod;
154 utf *utf_Signature;
155 utf *utf_RuntimeVisibleAnnotations;
156 utf *utf_StackMapTable;
157 #endif
158
159 utf *utf_init;                          /* <init>                             */
160 utf *utf_clinit;                        /* <clinit>                           */
161 utf *utf_clone;                         /* clone                              */
162 utf *utf_finalize;                      /* finalize                           */
163 utf *utf_run;                           /* run                                */
164
165 utf *utf_add;
166 utf *utf_remove;
167 utf *utf_addThread;
168 utf *utf_removeThread;
169 utf *utf_put;
170 utf *utf_get;
171 utf *utf_value;
172
173 utf *utf_fillInStackTrace;
174 utf *utf_findNative;
175 utf *utf_getSystemClassLoader;
176 utf *utf_initCause;
177 utf *utf_loadClass;
178 utf *utf_printStackTrace;
179
180 utf *utf_division_by_zero;
181
182 utf *utf_Z;                             /* Z                                  */
183 utf *utf_B;                             /* B                                  */
184 utf *utf_C;                             /* C                                  */
185 utf *utf_S;                             /* S                                  */
186 utf *utf_I;                             /* I                                  */
187 utf *utf_J;                             /* J                                  */
188 utf *utf_F;                             /* F                                  */
189 utf *utf_D;                             /* D                                  */
190
191 utf *utf_void__void;                    /* ()V                                */
192 utf *utf_boolean__void;                 /* (Z)V                               */
193 utf *utf_byte__void;                    /* (B)V                               */
194 utf *utf_char__void;                    /* (C)V                               */
195 utf *utf_short__void;                   /* (S)V                               */
196 utf *utf_int__void;                     /* (I)V                               */
197 utf *utf_long__void;                    /* (J)V                               */
198 utf *utf_float__void;                   /* (F)V                               */
199 utf *utf_double__void;                  /* (D)V                               */
200
201 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
202 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
203 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
204 utf *utf_java_lang_ClassLoader_java_lang_String__J;
205 utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
206 utf *utf_java_lang_Object__java_lang_Object;
207 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
208 utf *utf_java_lang_String__java_lang_Class;
209 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
210 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
211 utf *utf_java_lang_Throwable__java_lang_Throwable;
212
213 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
214 utf *utf_null;
215 utf *array_packagename;
216
217
218 /* utf_init ********************************************************************
219
220    Initializes the utf8 subsystem.
221
222 *******************************************************************************/
223
224 bool utf8_init(void)
225 {
226         /* create utf8 hashtable */
227
228         hashtable_utf = NEW(hashtable);
229
230         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
231
232 #if defined(ENABLE_STATISTICS)
233         if (opt_stat)
234                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
235 #endif
236
237         /* create utf-symbols for pointer comparison of frequently used strings */
238
239         utf_java_lang_Object           = utf_new_char("java/lang/Object");
240
241         utf_java_lang_Class            = utf_new_char("java/lang/Class");
242         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
243         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
244         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
245         utf_java_lang_String           = utf_new_char("java/lang/String");
246         utf_java_lang_System           = utf_new_char("java/lang/System");
247         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
248
249         utf_java_lang_ref_SoftReference =
250                 utf_new_char("java/lang/ref/SoftReference");
251
252         utf_java_lang_ref_WeakReference =
253                 utf_new_char("java/lang/ref/WeakReference");
254
255         utf_java_lang_ref_PhantomReference =
256                 utf_new_char("java/lang/ref/PhantomReference");
257
258         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
259
260         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
261         utf_java_lang_Error            = utf_new_char("java/lang/Error");
262
263         utf_java_lang_ClassCircularityError =
264                 utf_new_char("java/lang/ClassCircularityError");
265
266         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
267
268         utf_java_lang_ExceptionInInitializerError =
269                 utf_new_char("java/lang/ExceptionInInitializerError");
270
271         utf_java_lang_IncompatibleClassChangeError =
272                 utf_new_char("java/lang/IncompatibleClassChangeError");
273
274         utf_java_lang_InstantiationError =
275                 utf_new_char("java/lang/InstantiationError");
276
277         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
278         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
279
280         utf_java_lang_NoClassDefFoundError =
281                 utf_new_char("java/lang/NoClassDefFoundError");
282
283         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
284
285         utf_java_lang_UnsatisfiedLinkError =
286                 utf_new_char("java/lang/UnsatisfiedLinkError");
287
288         utf_java_lang_UnsupportedClassVersionError =
289                 utf_new_char("java/lang/UnsupportedClassVersionError");
290
291         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
292
293         utf_java_lang_VirtualMachineError =
294                 utf_new_char("java/lang/VirtualMachineError");
295
296 #if defined(ENABLE_JAVASE)
297         utf_java_lang_AbstractMethodError =
298                 utf_new_char("java/lang/AbstractMethodError");
299
300         utf_java_lang_NoSuchFieldError =
301                 utf_new_char("java/lang/NoSuchFieldError");
302
303         utf_java_lang_NoSuchMethodError =
304                 utf_new_char("java/lang/NoSuchMethodError");
305 #endif
306
307 #if defined(WITH_CLASSPATH_GNU)
308         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
309 #endif
310
311         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
312
313         utf_java_lang_ArithmeticException =
314                 utf_new_char("java/lang/ArithmeticException");
315
316         utf_java_lang_ArrayIndexOutOfBoundsException =
317                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
318
319         utf_java_lang_ArrayStoreException =
320                 utf_new_char("java/lang/ArrayStoreException");
321
322         utf_java_lang_ClassCastException =
323                 utf_new_char("java/lang/ClassCastException");
324
325         utf_java_lang_ClassNotFoundException =
326                 utf_new_char("java/lang/ClassNotFoundException");
327
328         utf_java_lang_CloneNotSupportedException =
329                 utf_new_char("java/lang/CloneNotSupportedException");
330
331         utf_java_lang_IllegalAccessException =
332                 utf_new_char("java/lang/IllegalAccessException");
333
334         utf_java_lang_IllegalArgumentException =
335                 utf_new_char("java/lang/IllegalArgumentException");
336
337         utf_java_lang_IllegalMonitorStateException =
338                 utf_new_char("java/lang/IllegalMonitorStateException");
339
340         utf_java_lang_InstantiationException =
341                 utf_new_char("java/lang/InstantiationException");
342
343         utf_java_lang_InterruptedException =
344                 utf_new_char("java/lang/InterruptedException");
345
346         utf_java_lang_NegativeArraySizeException =
347                 utf_new_char("java/lang/NegativeArraySizeException");
348
349         utf_java_lang_NullPointerException =
350                 utf_new_char("java/lang/NullPointerException");
351
352         utf_java_lang_StringIndexOutOfBoundsException =
353                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
354
355         utf_java_lang_reflect_InvocationTargetException =
356                 utf_new_char("java/lang/reflect/InvocationTargetException");
357
358         utf_java_security_PrivilegedActionException =
359                 utf_new_char("java/security/PrivilegedActionException");
360  
361 #if defined(ENABLE_JAVASE)
362         utf_java_lang_Void             = utf_new_char("java/lang/Void");
363 #endif
364
365         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
366         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
367         utf_java_lang_Character        = utf_new_char("java/lang/Character");
368         utf_java_lang_Short            = utf_new_char("java/lang/Short");
369         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
370         utf_java_lang_Long             = utf_new_char("java/lang/Long");
371         utf_java_lang_Float            = utf_new_char("java/lang/Float");
372         utf_java_lang_Double           = utf_new_char("java/lang/Double");
373
374 #if defined(ENABLE_JAVASE)
375         utf_java_lang_StackTraceElement =
376                 utf_new_char("java/lang/StackTraceElement");
377
378         utf_java_lang_reflect_Constructor =
379                 utf_new_char("java/lang/reflect/Constructor");
380
381         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
382         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
383         utf_java_util_Vector           = utf_new_char("java/util/Vector");
384 #endif
385
386         utf_InnerClasses               = utf_new_char("InnerClasses");
387         utf_ConstantValue              = utf_new_char("ConstantValue");
388         utf_Code                       = utf_new_char("Code");
389         utf_Exceptions                 = utf_new_char("Exceptions");
390         utf_LineNumberTable            = utf_new_char("LineNumberTable");
391         utf_SourceFile                 = utf_new_char("SourceFile");
392
393 #if defined(ENABLE_JAVASE)
394         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
395         utf_Signature                  = utf_new_char("Signature");
396         utf_RuntimeVisibleAnnotations  = utf_new_char("RuntimeVisibleAnnotations");
397         utf_StackMapTable              = utf_new_char("StackMapTable");
398 #endif
399
400         utf_init                           = utf_new_char("<init>");
401         utf_clinit                         = utf_new_char("<clinit>");
402         utf_clone                      = utf_new_char("clone");
403         utf_finalize                   = utf_new_char("finalize");
404         utf_run                        = utf_new_char("run");
405
406         utf_add                        = utf_new_char("add");
407         utf_remove                     = utf_new_char("remove");
408         utf_addThread                  = utf_new_char("addThread");
409         utf_removeThread               = utf_new_char("removeThread");
410         utf_put                        = utf_new_char("put");
411         utf_get                        = utf_new_char("get");
412         utf_value                      = utf_new_char("value");
413
414         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
415         utf_findNative                 = utf_new_char("findNative");
416         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
417         utf_initCause                  = utf_new_char("initCause");
418         utf_loadClass                  = utf_new_char("loadClass");
419         utf_printStackTrace            = utf_new_char("printStackTrace");
420
421         utf_division_by_zero           = utf_new_char("/ by zero");
422
423         utf_Z                          = utf_new_char("Z");
424         utf_B                          = utf_new_char("B");
425         utf_C                          = utf_new_char("C");
426         utf_S                          = utf_new_char("S");
427         utf_I                          = utf_new_char("I");
428         utf_J                          = utf_new_char("J");
429         utf_F                          = utf_new_char("F");
430         utf_D                          = utf_new_char("D");
431
432         utf_void__void                 = utf_new_char("()V");
433         utf_boolean__void              = utf_new_char("(Z)V");
434         utf_byte__void                 = utf_new_char("(B)V");
435         utf_char__void                 = utf_new_char("(C)V");
436         utf_short__void                = utf_new_char("(S)V");
437         utf_int__void                  = utf_new_char("(I)V");
438         utf_long__void                 = utf_new_char("(J)V");
439         utf_float__void                = utf_new_char("(F)V");
440         utf_double__void               = utf_new_char("(D)V");
441         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
442         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
443
444         utf_void__java_lang_ClassLoader =
445                 utf_new_char("()Ljava/lang/ClassLoader;");
446
447         utf_java_lang_ClassLoader_java_lang_String__J =
448                 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
449
450         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
451
452         utf_java_lang_Object__java_lang_Object =
453                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
454
455         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
456
457         utf_java_lang_String__java_lang_Class =
458                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
459
460         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
461         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
462
463         utf_java_lang_Throwable__java_lang_Throwable =
464                 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
465
466         utf_null                       = utf_new_char("null");
467         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
468         array_packagename              = utf_new_char("\t<the array package>");
469
470         /* everything's ok */
471
472         return true;
473 }
474
475
476 /* utf_hashkey *****************************************************************
477
478    The hashkey is computed from the utf-text by using up to 8
479    characters.  For utf-symbols longer than 15 characters 3 characters
480    are taken from the beginning and the end, 2 characters are taken
481    from the middle.
482
483 *******************************************************************************/
484
485 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
486 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
487
488 u4 utf_hashkey(const char *text, u4 length)
489 {
490         const char *start_pos = text;       /* pointer to utf text                */
491         u4 a;
492
493         switch (length) {
494         case 0: /* empty string */
495                 return 0;
496
497         case 1: return fbs(0);
498         case 2: return fbs(0) ^ nbs(3);
499         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
500         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
501         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
502         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
503         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
504         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
505
506         case 9:
507                 a = fbs(0);
508                 a ^= nbs(1);
509                 a ^= nbs(2);
510                 text++;
511                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
512
513         case 10:
514                 a = fbs(0);
515                 text++;
516                 a ^= nbs(2);
517                 a ^= nbs(3);
518                 a ^= nbs(4);
519                 text++;
520                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
521
522         case 11:
523                 a = fbs(0);
524                 text++;
525                 a ^= nbs(2);
526                 a ^= nbs(3);
527                 a ^= nbs(4);
528                 text++;
529                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
530
531         case 12:
532                 a = fbs(0);
533                 text += 2;
534                 a ^= nbs(2);
535                 a ^= nbs(3);
536                 text++;
537                 a ^= nbs(5);
538                 a ^= nbs(6);
539                 a ^= nbs(7);
540                 text++;
541                 return a ^ nbs(9) ^ nbs(10);
542
543         case 13:
544                 a = fbs(0);
545                 a ^= nbs(1);
546                 text++;
547                 a ^= nbs(3);
548                 a ^= nbs(4);
549                 text += 2;      
550                 a ^= nbs(7);
551                 a ^= nbs(8);
552                 text += 2;
553                 return a ^ nbs(9) ^ nbs(10);
554
555         case 14:
556                 a = fbs(0);
557                 text += 2;      
558                 a ^= nbs(3);
559                 a ^= nbs(4);
560                 text += 2;      
561                 a ^= nbs(7);
562                 a ^= nbs(8);
563                 text += 2;
564                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
565
566         case 15:
567                 a = fbs(0);
568                 text += 2;      
569                 a ^= nbs(3);
570                 a ^= nbs(4);
571                 text += 2;      
572                 a ^= nbs(7);
573                 a ^= nbs(8);
574                 text += 2;
575                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
576
577         default:  /* 3 characters from beginning */
578                 a = fbs(0);
579                 text += 2;
580                 a ^= nbs(3);
581                 a ^= nbs(4);
582
583                 /* 2 characters from middle */
584                 text = start_pos + (length / 2);
585                 a ^= fbs(5);
586                 text += 2;
587                 a ^= nbs(6);    
588
589                 /* 3 characters from end */
590                 text = start_pos + length - 4;
591
592                 a ^= fbs(7);
593                 text++;
594
595                 return a ^ nbs(10) ^ nbs(11);
596     }
597 }
598
599 /* utf_full_hashkey ************************************************************
600
601    This function computes a hash value using all bytes in the string.
602
603    The algorithm is the "One-at-a-time" algorithm as published
604    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
605
606 *******************************************************************************/
607
608 u4 utf_full_hashkey(const char *text, u4 length)
609 {
610         register const unsigned char *p = (const unsigned char *) text;
611         register u4 hash;
612         register u4 i;
613
614         hash = 0;
615         for (i=length; i--;)
616         {
617             hash += *p++;
618             hash += (hash << 10);
619             hash ^= (hash >> 6);
620         }
621         hash += (hash << 3);
622         hash ^= (hash >> 11);
623         hash += (hash << 15);
624
625         return hash;
626 }
627
628 /* unicode_hashkey *************************************************************
629
630    Compute the hashkey of a unicode string.
631
632 *******************************************************************************/
633
634 u4 unicode_hashkey(u2 *text, u2 len)
635 {
636         return utf_hashkey((char *) text, len);
637 }
638
639
640 /* utf_new *********************************************************************
641
642    Creates a new utf-symbol, the text of the symbol is passed as a
643    u1-array. The function searches the utf-hashtable for a utf-symbol
644    with this text. On success the element returned, otherwise a new
645    hashtable element is created.
646
647    If the number of entries in the hashtable exceeds twice the size of
648    the hashtable slots a reorganization of the hashtable is done and
649    the utf symbols are copied to a new hashtable with doubled size.
650
651 *******************************************************************************/
652
653 utf *utf_new(const char *text, u2 length)
654 {
655         u4 key;                             /* hashkey computed from utf-text     */
656         u4 slot;                            /* slot in hashtable                  */
657         utf *u;                             /* hashtable element                  */
658         u2 i;
659
660         LOCK_MONITOR_ENTER(hashtable_utf->header);
661
662 #if defined(ENABLE_STATISTICS)
663         if (opt_stat)
664                 count_utf_new++;
665 #endif
666
667         key  = utf_hashkey(text, length);
668         slot = key & (hashtable_utf->size - 1);
669         u    = hashtable_utf->ptr[slot];
670
671         /* search external hash chain for utf-symbol */
672
673         while (u) {
674                 if (u->blength == length) {
675                         /* compare text of hashtable elements */
676
677                         for (i = 0; i < length; i++)
678                                 if (text[i] != u->text[i])
679                                         goto nomatch;
680                         
681 #if defined(ENABLE_STATISTICS)
682                         if (opt_stat)
683                                 count_utf_new_found++;
684 #endif
685
686                         /* symbol found in hashtable */
687
688                         LOCK_MONITOR_EXIT(hashtable_utf->header);
689
690                         return u;
691                 }
692
693         nomatch:
694                 u = u->hashlink; /* next element in external chain */
695         }
696
697         /* location in hashtable found, create new utf element */
698
699         u = NEW(utf);
700
701         u->blength  = length;               /* length in bytes of utfstring       */
702         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
703         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
704
705         memcpy(u->text, text, length);      /* copy utf-text                      */
706         u->text[length] = '\0';
707
708 #if defined(ENABLE_STATISTICS)
709         if (opt_stat)
710                 count_utf_len += sizeof(utf) + length + 1;
711 #endif
712
713         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
714         hashtable_utf->entries++;           /* update number of entries           */
715
716         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
717
718         /* reorganization of hashtable, average length of the external
719            chains is approx. 2 */
720
721                 hashtable *newhash;                              /* the new hashtable */
722                 u4         i;
723                 utf       *u;
724                 utf       *nextu;
725                 u4         slot;
726
727                 /* create new hashtable, double the size */
728
729                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
730
731 #if defined(ENABLE_STATISTICS)
732                 if (opt_stat)
733                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
734 #endif
735
736                 /* transfer elements to new hashtable */
737
738                 for (i = 0; i < hashtable_utf->size; i++) {
739                         u = hashtable_utf->ptr[i];
740
741                         while (u) {
742                                 nextu = u->hashlink;
743                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
744                                                 
745                                 u->hashlink = (utf *) newhash->ptr[slot];
746                                 newhash->ptr[slot] = u;
747
748                                 /* follow link in external hash chain */
749
750                                 u = nextu;
751                         }
752                 }
753         
754                 /* dispose old table */
755
756                 hashtable_free(hashtable_utf);
757
758                 hashtable_utf = newhash;
759         }
760
761         LOCK_MONITOR_EXIT(hashtable_utf->header);
762
763         return u;
764 }
765
766
767 /* utf_new_u2 ******************************************************************
768
769    Make utf symbol from u2 array, if isclassname is true '.' is
770    replaced by '/'.
771
772 *******************************************************************************/
773
774 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
775 {
776         char *buffer;                   /* memory buffer for  unicode characters  */
777         char *pos;                      /* pointer to current position in buffer  */
778         u4 left;                        /* unicode characters left                */
779         u4 buflength;                   /* utf length in bytes of the u2 array    */
780         utf *result;                    /* resulting utf-string                   */
781         int i;          
782
783         /* determine utf length in bytes and allocate memory */
784
785         buflength = u2_utflength(unicode_pos, unicode_length); 
786         buffer    = MNEW(char, buflength);
787  
788         left = buflength;
789         pos  = buffer;
790
791         for (i = 0; i++ < unicode_length; unicode_pos++) {
792                 /* next unicode character */
793                 u2 c = *unicode_pos;
794                 
795                 if ((c != 0) && (c < 0x80)) {
796                         /* 1 character */       
797                         left--;
798                 if ((int) left < 0) break;
799                         /* convert classname */
800                         if (isclassname && c == '.')
801                                 *pos++ = '/';
802                         else
803                                 *pos++ = (char) c;
804
805                 } else if (c < 0x800) {             
806                         /* 2 characters */                              
807                 unsigned char high = c >> 6;
808                 unsigned char low  = c & 0x3F;
809                         left = left - 2;
810                 if ((int) left < 0) break;
811                 *pos++ = high | 0xC0; 
812                 *pos++ = low  | 0x80;     
813
814                 } else {         
815                 /* 3 characters */                              
816                 char low  = c & 0x3f;
817                 char mid  = (c >> 6) & 0x3F;
818                 char high = c >> 12;
819                         left = left - 3;
820                 if ((int) left < 0) break;
821                 *pos++ = high | 0xE0; 
822                 *pos++ = mid  | 0x80;  
823                 *pos++ = low  | 0x80;   
824                 }
825         }
826         
827         /* insert utf-string into symbol-table */
828         result = utf_new(buffer,buflength);
829
830         MFREE(buffer, char, buflength);
831
832         return result;
833 }
834
835
836 /* utf_new_char ****************************************************************
837
838    Creates a new utf symbol, the text for this symbol is passed as a
839    c-string ( = char* ).
840
841 *******************************************************************************/
842
843 utf *utf_new_char(const char *text)
844 {
845         return utf_new(text, strlen(text));
846 }
847
848
849 /* utf_new_char_classname ******************************************************
850
851    Creates a new utf symbol, the text for this symbol is passed as a
852    c-string ( = char* ) "." characters are going to be replaced by
853    "/". Since the above function is used often, this is a separte
854    function, instead of an if.
855
856 *******************************************************************************/
857
858 utf *utf_new_char_classname(const char *text)
859 {
860         if (strchr(text, '.')) {
861                 char *txt = strdup(text);
862                 char *end = txt + strlen(txt);
863                 char *c;
864                 utf *tmpRes;
865
866                 for (c = txt; c < end; c++)
867                         if (*c == '.') *c = '/';
868
869                 tmpRes = utf_new(txt, strlen(txt));
870                 FREE(txt, 0);
871
872                 return tmpRes;
873
874         } else
875                 return utf_new(text, strlen(text));
876 }
877
878
879 /* utf_nextu2 ******************************************************************
880
881    Read the next unicode character from the utf string and increment
882    the utf-string pointer accordingly.
883
884    CAUTION: This function is unsafe for input that was not checked 
885             by is_valid_utf!
886
887 *******************************************************************************/
888
889 u2 utf_nextu2(char **utf_ptr)
890 {
891     /* uncompressed unicode character */
892     u2 unicode_char = 0;
893     /* current position in utf text */  
894     unsigned char *utf = (unsigned char *) (*utf_ptr);
895     /* bytes representing the unicode character */
896     unsigned char ch1, ch2, ch3;
897     /* number of bytes used to represent the unicode character */
898     int len = 0;
899         
900     switch ((ch1 = utf[0]) >> 4) {
901         default: /* 1 byte */
902                 (*utf_ptr)++;
903                 return (u2) ch1;
904         case 0xC: 
905         case 0xD: /* 2 bytes */
906                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
907                         unsigned char high = ch1 & 0x1F;
908                         unsigned char low  = ch2 & 0x3F;
909                         unicode_char = (high << 6) + low;
910                         len = 2;
911                 }
912                 break;
913
914         case 0xE: /* 2 or 3 bytes */
915                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
916                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
917                                 unsigned char low  = ch3 & 0x3f;
918                                 unsigned char mid  = ch2 & 0x3f;
919                                 unsigned char high = ch1 & 0x0f;
920                                 unicode_char = (((high << 6) + mid) << 6) + low;
921                                 len = 3;
922                         } else
923                                 len = 2;                                           
924                 }
925                 break;
926     }
927
928     /* update position in utf-text */
929     *utf_ptr = (char *) (utf + len);
930
931     return unicode_char;
932 }
933
934
935 /* utf_bytes *******************************************************************
936
937    Determine number of bytes (aka. octets) in the utf string.
938
939    IN:
940       u............utf string
941
942    OUT:
943       The number of octets of this utf string.
944           There is _no_ terminating zero included in this count.
945
946 *******************************************************************************/
947
948 u4 utf_bytes(utf *u)
949 {
950         return u->blength;
951 }
952
953
954 /* utf_get_number_of_u2s_for_buffer ********************************************
955
956    Determine number of UTF-16 u2s in the given UTF-8 buffer
957
958    CAUTION: This function is unsafe for input that was not checked 
959             by is_valid_utf!
960
961    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
962    to an array of u2s (UTF-16) and want to know how many of them you will get.
963    All other uses of this function are probably wrong.
964
965    IN:
966       buffer........points to first char in buffer
967           blength.......number of _bytes_ in the buffer
968
969    OUT:
970       the number of u2s needed to hold this string in UTF-16 encoding.
971           There is _no_ terminating zero included in this count.
972
973    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
974    exception.
975
976 *******************************************************************************/
977
978 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
979 {
980         const char *endpos;                 /* points behind utf string           */
981         const char *utf_ptr;                /* current position in utf text       */
982         u4 len = 0;                         /* number of unicode characters       */
983
984         utf_ptr = buffer;
985         endpos = utf_ptr + blength;
986
987         while (utf_ptr < endpos) {
988                 len++;
989                 /* next unicode character */
990                 utf_nextu2((char **)&utf_ptr);
991         }
992
993         assert(utf_ptr == endpos);
994
995         return len;
996 }
997
998
999 /* utf_get_number_of_u2s *******************************************************
1000
1001    Determine number of UTF-16 u2s in the utf string.
1002
1003    CAUTION: This function is unsafe for input that was not checked 
1004             by is_valid_utf!
1005
1006    CAUTION: Use this function *only* when you want to convert a utf string
1007    to an array of u2s and want to know how many of them you will get.
1008    All other uses of this function are probably wrong.
1009
1010    IN:
1011       u............utf string
1012
1013    OUT:
1014       the number of u2s needed to hold this string in UTF-16 encoding.
1015           There is _no_ terminating zero included in this count.
1016           XXX 0 if a NullPointerException has been thrown (see below)
1017
1018 *******************************************************************************/
1019
1020 u4 utf_get_number_of_u2s(utf *u)
1021 {
1022         char *endpos;                       /* points behind utf string           */
1023         char *utf_ptr;                      /* current position in utf text       */
1024         u4 len = 0;                         /* number of unicode characters       */
1025
1026         /* XXX this is probably not checked by most callers! Review this after */
1027         /* the invalid uses of this function have been eliminated */
1028         if (u == NULL) {
1029                 exceptions_throw_nullpointerexception();
1030                 return 0;
1031         }
1032
1033         endpos = UTF_END(u);
1034         utf_ptr = u->text;
1035
1036         while (utf_ptr < endpos) {
1037                 len++;
1038                 /* next unicode character */
1039                 utf_nextu2(&utf_ptr);
1040         }
1041
1042         if (utf_ptr != endpos) {
1043                 /* string ended abruptly */
1044                 exceptions_throw_internalerror("Illegal utf8 string");
1045                 return 0;
1046         }
1047
1048         return len;
1049 }
1050
1051
1052 /* utf8_safe_number_of_u2s *****************************************************
1053
1054    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1055    (For invalid UTF-8 the U+fffd replacement character will be counted.)
1056
1057    This function is safe even for invalid UTF-8 strings.
1058
1059    IN:
1060       text..........zero-terminated(!) UTF-8 string (may be invalid)
1061                         must NOT be NULL
1062           nbytes........strlen(text). (This is needed to completely emulate
1063                         the RI).
1064
1065    OUT:
1066       the number of u2s needed to hold this string in UTF-16 encoding.
1067           There is _no_ terminating zero included in this count.
1068
1069 *******************************************************************************/
1070
1071 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1072         register const unsigned char *t;
1073         register s4 byte;
1074         register s4 len;
1075         register const unsigned char *tlimit;
1076         s4 byte1;
1077         s4 byte2;
1078         s4 byte3;
1079         s4 value;
1080         s4 skip;
1081
1082         assert(text);
1083         assert(nbytes >= 0);
1084
1085         len = 0;
1086         t = (const unsigned char *) text;
1087         tlimit = t + nbytes;
1088
1089         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1090
1091         while (1) {
1092                 byte = *t++;
1093
1094                 if (byte & 0x80) {
1095                         /* highest bit set, non-ASCII character */
1096
1097                         if ((byte & 0xe0) == 0xc0) {
1098                                 /* 2-byte: should be 110..... 10...... ? */
1099
1100                                 if ((*t++ & 0xc0) == 0x80)
1101                                         ; /* valid 2-byte */
1102                                 else
1103                                         t--; /* invalid */
1104                         }
1105                         else if ((byte & 0xf0) == 0xe0) {
1106                                 /* 3-byte: should be 1110.... 10...... 10...... */
1107                                 /*                            ^t                */
1108
1109                                 if (t + 2 > tlimit)
1110                                         return len + 1; /* invalid, stop here */
1111
1112                                 if ((*t++ & 0xc0) == 0x80) {
1113                                         if ((*t++ & 0xc0) == 0x80)
1114                                                 ; /* valid 3-byte */
1115                                         else
1116                                                 t--; /* invalid */
1117                                 }
1118                                 else
1119                                         t--; /* invalid */
1120                         }
1121                         else if ((byte & 0xf8) == 0xf0) {
1122                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1123                                 /*                            ^t                         */
1124
1125                                 if (t + 3 > tlimit)
1126                                         return len + 1; /* invalid, stop here */
1127
1128                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1129                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1130                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1131                                                         /* valid 4-byte UTF-8? */
1132                                                         value = ((byte  & 0x07) << 18)
1133                                                                   | ((byte1 & 0x3f) << 12)
1134                                                                   | ((byte2 & 0x3f) <<  6)
1135                                                                   | ((byte3 & 0x3f)      );
1136
1137                                                         if (value > 0x10FFFF)
1138                                                                 ; /* invalid */
1139                                                         else if (value > 0xFFFF)
1140                                                                 len += 1; /* we need surrogates */
1141                                                         else
1142                                                                 ; /* 16bit suffice */
1143                                                 }
1144                                                 else
1145                                                         t--; /* invalid */
1146                                         }
1147                                         else
1148                                                 t--; /* invalid */
1149                                 }
1150                                 else
1151                                         t--; /* invalid */
1152                         }
1153                         else if ((byte & 0xfc) == 0xf8) {
1154                                 /* invalid 5-byte */
1155                                 if (t + 4 > tlimit)
1156                                         return len + 1; /* invalid, stop here */
1157
1158                                 skip = 4;
1159                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1160                                         t++;
1161                         }
1162                         else if ((byte & 0xfe) == 0xfc) {
1163                                 /* invalid 6-byte */
1164                                 if (t + 5 > tlimit)
1165                                         return len + 1; /* invalid, stop here */
1166
1167                                 skip = 5;
1168                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1169                                         t++;
1170                         }
1171                         else
1172                                 ; /* invalid */
1173                 }
1174                 else {
1175                         /* NUL */
1176
1177                         if (byte == 0)
1178                                 break;
1179
1180                         /* ASCII character, common case */
1181                 }
1182
1183                 len++;
1184         }
1185
1186         return len;
1187 }
1188
1189
1190 /* utf8_safe_convert_to_u2s ****************************************************
1191
1192    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1193    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1194    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1195
1196    This function is safe even for invalid UTF-8 strings.
1197
1198    IN:
1199       text..........zero-terminated(!) UTF-8 string (may be invalid)
1200                         must NOT be NULL
1201           nbytes........strlen(text). (This is needed to completely emulate
1202                                         the RI).
1203           buffer........a preallocated array of u2s to receive the decoded
1204                         string. Use utf8_safe_number_of_u2s to get the
1205                                         required number of u2s for allocating this.
1206
1207 *******************************************************************************/
1208
1209 #define UNICODE_REPLACEMENT  0xfffd
1210
1211 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1212         register const unsigned char *t;
1213         register s4 byte;
1214         register const unsigned char *tlimit;
1215         s4 byte1;
1216         s4 byte2;
1217         s4 byte3;
1218         s4 value;
1219         s4 skip;
1220
1221         assert(text);
1222         assert(nbytes >= 0);
1223
1224         t = (const unsigned char *) text;
1225         tlimit = t + nbytes;
1226
1227         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1228
1229         while (1) {
1230                 byte = *t++;
1231
1232                 if (byte & 0x80) {
1233                         /* highest bit set, non-ASCII character */
1234
1235                         if ((byte & 0xe0) == 0xc0) {
1236                                 /* 2-byte: should be 110..... 10...... */
1237
1238                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1239                                         /* valid 2-byte UTF-8 */
1240                                         *buffer++ = ((byte  & 0x1f) << 6)
1241                                                           | ((byte1 & 0x3f)     );
1242                                 }
1243                                 else {
1244                                         *buffer++ = UNICODE_REPLACEMENT;
1245                                         t--;
1246                                 }
1247                         }
1248                         else if ((byte & 0xf0) == 0xe0) {
1249                                 /* 3-byte: should be 1110.... 10...... 10...... */
1250
1251                                 if (t + 2 > tlimit) {
1252                                         *buffer++ = UNICODE_REPLACEMENT;
1253                                         return;
1254                                 }
1255
1256                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1257                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1258                                                 /* valid 3-byte UTF-8 */
1259                                                 *buffer++ = ((byte  & 0x0f) << 12)
1260                                                                   | ((byte1 & 0x3f) <<  6)
1261                                                                   | ((byte2 & 0x3f)      );
1262                                         }
1263                                         else {
1264                                                 *buffer++ = UNICODE_REPLACEMENT;
1265                                                 t--;
1266                                         }
1267                                 }
1268                                 else {
1269                                         *buffer++ = UNICODE_REPLACEMENT;
1270                                         t--;
1271                                 }
1272                         }
1273                         else if ((byte & 0xf8) == 0xf0) {
1274                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1275
1276                                 if (t + 3 > tlimit) {
1277                                         *buffer++ = UNICODE_REPLACEMENT;
1278                                         return;
1279                                 }
1280
1281                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1282                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1283                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1284                                                         /* valid 4-byte UTF-8? */
1285                                                         value = ((byte  & 0x07) << 18)
1286                                                                   | ((byte1 & 0x3f) << 12)
1287                                                                   | ((byte2 & 0x3f) <<  6)
1288                                                                   | ((byte3 & 0x3f)      );
1289
1290                                                         if (value > 0x10FFFF) {
1291                                                                 *buffer++ = UNICODE_REPLACEMENT;
1292                                                         }
1293                                                         else if (value > 0xFFFF) {
1294                                                                 /* we need surrogates */
1295                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1296                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1297                                                         }
1298                                                         else
1299                                                                 *buffer++ = value; /* 16bit suffice */
1300                                                 }
1301                                                 else {
1302                                                         *buffer++ = UNICODE_REPLACEMENT;
1303                                                         t--;
1304                                                 }
1305                                         }
1306                                         else {
1307                                                 *buffer++ = UNICODE_REPLACEMENT;
1308                                                 t--;
1309                                         }
1310                                 }
1311                                 else {
1312                                         *buffer++ = UNICODE_REPLACEMENT;
1313                                         t--;
1314                                 }
1315                         }
1316                         else if ((byte & 0xfc) == 0xf8) {
1317                                 if (t + 4 > tlimit) {
1318                                         *buffer++ = UNICODE_REPLACEMENT;
1319                                         return;
1320                                 }
1321
1322                                 skip = 4;
1323                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1324                                         t++;
1325                                 *buffer++ = UNICODE_REPLACEMENT;
1326                         }
1327                         else if ((byte & 0xfe) == 0xfc) {
1328                                 if (t + 5 > tlimit) {
1329                                         *buffer++ = UNICODE_REPLACEMENT;
1330                                         return;
1331                                 }
1332
1333                                 skip = 5;
1334                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1335                                         t++;
1336                                 *buffer++ = UNICODE_REPLACEMENT;
1337                         }
1338                         else
1339                                 *buffer++ = UNICODE_REPLACEMENT;
1340                 }
1341                 else {
1342                         /* NUL */
1343
1344                         if (byte == 0)
1345                                 break;
1346
1347                         /* ASCII character, common case */
1348
1349                         *buffer++ = byte;
1350                 }
1351         }
1352 }
1353
1354
1355 /* u2_utflength ****************************************************************
1356
1357    Returns the utf length in bytes of a u2 array.
1358
1359 *******************************************************************************/
1360
1361 u4 u2_utflength(u2 *text, u4 u2_length)
1362 {
1363         u4 result_len = 0;                  /* utf length in bytes                */
1364         u2 ch;                              /* current unicode character          */
1365         u4 len;
1366         
1367         for (len = 0; len < u2_length; len++) {
1368                 /* next unicode character */
1369                 ch = *text++;
1370           
1371                 /* determine bytes required to store unicode character as utf */
1372                 if (ch && (ch < 0x80)) 
1373                         result_len++;
1374                 else if (ch < 0x800)
1375                         result_len += 2;        
1376                 else 
1377                         result_len += 3;        
1378         }
1379
1380     return result_len;
1381 }
1382
1383
1384 /* utf_copy ********************************************************************
1385
1386    Copy the given utf string byte-for-byte to a buffer.
1387
1388    IN:
1389       buffer.......the buffer
1390           u............the utf string
1391
1392 *******************************************************************************/
1393
1394 void utf_copy(char *buffer, utf *u)
1395 {
1396         /* our utf strings are zero-terminated (done by utf_new) */
1397         MCOPY(buffer, u->text, char, u->blength + 1);
1398 }
1399
1400
1401 /* utf_cat *********************************************************************
1402
1403    Append the given utf string byte-for-byte to a buffer.
1404
1405    IN:
1406       buffer.......the buffer
1407           u............the utf string
1408
1409 *******************************************************************************/
1410
1411 void utf_cat(char *buffer, utf *u)
1412 {
1413         /* our utf strings are zero-terminated (done by utf_new) */
1414         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1415 }
1416
1417
1418 /* utf_copy_classname **********************************************************
1419
1420    Copy the given utf classname byte-for-byte to a buffer.
1421    '/' is replaced by '.'
1422
1423    IN:
1424       buffer.......the buffer
1425           u............the utf string
1426
1427 *******************************************************************************/
1428
1429 void utf_copy_classname(char *buffer, utf *u)
1430 {
1431         char *bufptr;
1432         char *srcptr;
1433         char *endptr;
1434         char ch;
1435
1436         bufptr = buffer;
1437         srcptr = u->text;
1438         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1439
1440         while (srcptr != endptr) {
1441                 ch = *srcptr++;
1442                 if (ch == '/')
1443                         ch = '.';
1444                 *bufptr++ = ch;
1445         }
1446 }
1447
1448
1449 /* utf_cat *********************************************************************
1450
1451    Append the given utf classname byte-for-byte to a buffer.
1452    '/' is replaced by '.'
1453
1454    IN:
1455       buffer.......the buffer
1456           u............the utf string
1457
1458 *******************************************************************************/
1459
1460 void utf_cat_classname(char *buffer, utf *u)
1461 {
1462         utf_copy_classname(buffer + strlen(buffer), u);
1463 }
1464
1465 /* utf_display_printable_ascii *************************************************
1466
1467    Write utf symbol to stdout (for debugging purposes).
1468    Non-printable and non-ASCII characters are printed as '?'.
1469
1470 *******************************************************************************/
1471
1472 void utf_display_printable_ascii(utf *u)
1473 {
1474         char *endpos;                       /* points behind utf string           */
1475         char *utf_ptr;                      /* current position in utf text       */
1476
1477         if (u == NULL) {
1478                 printf("NULL");
1479                 fflush(stdout);
1480                 return;
1481         }
1482
1483         endpos = UTF_END(u);
1484         utf_ptr = u->text;
1485
1486         while (utf_ptr < endpos) {
1487                 /* read next unicode character */
1488
1489                 u2 c = utf_nextu2(&utf_ptr);
1490
1491                 if ((c >= 32) && (c <= 127))
1492                         printf("%c", c);
1493                 else
1494                         printf("?");
1495         }
1496
1497         fflush(stdout);
1498 }
1499
1500
1501 /* utf_display_printable_ascii_classname ***************************************
1502
1503    Write utf symbol to stdout with `/' converted to `.' (for debugging
1504    purposes).
1505    Non-printable and non-ASCII characters are printed as '?'.
1506
1507 *******************************************************************************/
1508
1509 void utf_display_printable_ascii_classname(utf *u)
1510 {
1511         char *endpos;                       /* points behind utf string           */
1512         char *utf_ptr;                      /* current position in utf text       */
1513
1514         if (u == NULL) {
1515                 printf("NULL");
1516                 fflush(stdout);
1517                 return;
1518         }
1519
1520         endpos = UTF_END(u);
1521         utf_ptr = u->text;
1522
1523         while (utf_ptr < endpos) {
1524                 /* read next unicode character */
1525
1526                 u2 c = utf_nextu2(&utf_ptr);
1527
1528                 if (c == '/')
1529                         c = '.';
1530
1531                 if ((c >= 32) && (c <= 127))
1532                         printf("%c", c);
1533                 else
1534                         printf("?");
1535         }
1536
1537         fflush(stdout);
1538 }
1539
1540
1541 /* utf_sprint_convert_to_latin1 ************************************************
1542         
1543    Write utf symbol into c-string (for debugging purposes).
1544    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1545    invalid results.
1546
1547 *******************************************************************************/
1548
1549 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1550 {
1551         char *endpos;                       /* points behind utf string           */
1552         char *utf_ptr;                      /* current position in utf text       */
1553         u2 pos = 0;                         /* position in c-string               */
1554
1555         if (!u) {
1556                 strcpy(buffer, "NULL");
1557                 return;
1558         }
1559
1560         endpos = UTF_END(u);
1561         utf_ptr = u->text;
1562
1563         while (utf_ptr < endpos) 
1564                 /* copy next unicode character */       
1565                 buffer[pos++] = utf_nextu2(&utf_ptr);
1566
1567         /* terminate string */
1568         buffer[pos] = '\0';
1569 }
1570
1571
1572 /* utf_sprint_convert_to_latin1_classname **************************************
1573         
1574    Write utf symbol into c-string with `/' converted to `.' (for debugging
1575    purposes).
1576    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1577    invalid results.
1578
1579 *******************************************************************************/
1580
1581 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1582 {
1583         char *endpos;                       /* points behind utf string           */
1584         char *utf_ptr;                      /* current position in utf text       */
1585         u2 pos = 0;                         /* position in c-string               */
1586
1587         if (!u) {
1588                 strcpy(buffer, "NULL");
1589                 return;
1590         }
1591
1592         endpos = UTF_END(u);
1593         utf_ptr = u->text;
1594
1595         while (utf_ptr < endpos) {
1596                 /* copy next unicode character */       
1597                 u2 c = utf_nextu2(&utf_ptr);
1598                 if (c == '/') c = '.';
1599                 buffer[pos++] = c;
1600         }
1601
1602         /* terminate string */
1603         buffer[pos] = '\0';
1604 }
1605
1606
1607 /* utf_strcat_convert_to_latin1 ************************************************
1608         
1609    Like libc strcat, but uses an utf8 string.
1610    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1611    invalid results.
1612
1613 *******************************************************************************/
1614
1615 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1616 {
1617         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1618 }
1619
1620
1621 /* utf_strcat_convert_to_latin1_classname **************************************
1622         
1623    Like libc strcat, but uses an utf8 string.
1624    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1625    invalid results.
1626
1627 *******************************************************************************/
1628
1629 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1630 {
1631         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1632 }
1633
1634
1635 /* utf_fprint_printable_ascii **************************************************
1636         
1637    Write utf symbol into file.
1638    Non-printable and non-ASCII characters are printed as '?'.
1639
1640 *******************************************************************************/
1641
1642 void utf_fprint_printable_ascii(FILE *file, utf *u)
1643 {
1644         char *endpos;                       /* points behind utf string           */
1645         char *utf_ptr;                      /* current position in utf text       */
1646
1647         if (!u)
1648                 return;
1649
1650         endpos = UTF_END(u);
1651         utf_ptr = u->text;
1652
1653         while (utf_ptr < endpos) { 
1654                 /* read next unicode character */                
1655                 u2 c = utf_nextu2(&utf_ptr);                            
1656
1657                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1658                 else fprintf(file, "?");
1659         }
1660 }
1661
1662
1663 /* utf_fprint_printable_ascii_classname ****************************************
1664         
1665    Write utf symbol into file with `/' converted to `.'.
1666    Non-printable and non-ASCII characters are printed as '?'.
1667
1668 *******************************************************************************/
1669
1670 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1671 {
1672         char *endpos;                       /* points behind utf string           */
1673         char *utf_ptr;                      /* current position in utf text       */
1674
1675     if (!u)
1676                 return;
1677
1678         endpos = UTF_END(u);
1679         utf_ptr = u->text;
1680
1681         while (utf_ptr < endpos) { 
1682                 /* read next unicode character */                
1683                 u2 c = utf_nextu2(&utf_ptr);                            
1684                 if (c == '/') c = '.';
1685
1686                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1687                 else fprintf(file, "?");
1688         }
1689 }
1690
1691
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string.

   utf_ptr...points to first character
   end_pos...points after last character

   The following rules are checked:
   - end_pos must not lie before utf_ptr (an empty string is valid)
   - the byte 0x00 is not allowed
   - only one-, two- and three-byte sequences are accepted (Java
     limitation)
   - every continuation byte must have the form 10xx xxxx
   - the value zero must be encoded with exactly two bytes (Java
     special)

   Overlong encodings, surrogates (0xd800-0xdfff) and the codepoints
   0xfffe and 0xffff are deliberately not rejected, since Sun Java seems
   to allow them in classfiles.

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int           remaining;            /* bytes left, incl. the current one  */
	int           trailing;             /* continuation bytes of a sequence   */
	int           k;
	unsigned char byte;
	unsigned long value;                /* decoded value of a sequence        */

	if (end_pos < utf_ptr)
		return false;

	for (remaining = end_pos - utf_ptr; remaining != 0; remaining--) {
		byte = (unsigned char) *utf_ptr++;

		if (byte == 0x00)                     /* 0x00 is not allowed */
			return false;

		if (byte < 0x80)                      /* ASCII */
			continue;

		/* Determine the sequence length from the lead byte. Longer  */
		/* sequences (1111 xxxx) and stray continuation bytes        */
		/* (10xx xxxx) are rejected.                                 */

		switch (byte & 0xf0) {
		case 0xc0:                            /* 110x xxxx */
		case 0xd0:
			trailing = 1;
			break;

		case 0xe0:                            /* 1110 xxxx */
			trailing = 2;
			break;

		default:
			return false;
		}

		/* the lead byte itself is still counted in remaining */

		if (remaining - 1 < trailing)         /* missing bytes */
			return false;

		remaining -= trailing;

		value = byte & (0x3f >> trailing);

		for (k = 0; k < trailing; k++) {
			byte = (unsigned char) *utf_ptr++;

			if ((byte & 0xc0) != 0x80)        /* 10xx xxxx */
				return false;

			value = (value << 6) | (byte & 0x3f);
		}

		/* zero is only accepted in its two-byte form (Java special) */

		if ((value == 0) && (trailing != 1))
			return false;
	}

	return true;
}
1757
1758
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters, i.e. bytes below 0x20, and the two-byte encoding
   0xc0 0x80 of the zero character.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Only bytes in front of end_pos are inspected, so the result does not
   depend on whatever follows the name in memory.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters */

		/* Disallow zero. The second byte is only looked at if it    */
		/* still belongs to the name; otherwise a trailing 0xc0      */
		/* would cause a read behind end_pos.                        */

		if ((c == 0xc0) && (utf_ptr < end_pos) &&
			((unsigned char) *utf_ptr == 0x80))
			return false;
	}

	return true;
}
1786
1787 bool is_valid_name_utf(utf *u)
1788 {
1789         return is_valid_name(u->text, UTF_END(u));
1790 }
1791
1792
1793 /* utf_show ********************************************************************
1794
1795    Writes the utf symbols in the utfhash to stdout and displays the
1796    number of external hash chains grouped according to the chainlength
1797    (for debugging purposes).
1798
1799 *******************************************************************************/
1800
1801 #if !defined(NDEBUG)
1802 void utf_show(void)
1803 {
1804
1805 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1806
1807         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1808         u4 max_chainlength = 0;      /* maximum length of the chains */
1809         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1810         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1811         u4 i;
1812
1813         printf("UTF-HASH:\n");
1814
1815         /* show element of utf-hashtable */
1816
1817         for (i = 0; i < hashtable_utf->size; i++) {
1818                 utf *u = hashtable_utf->ptr[i];
1819
1820                 if (u) {
1821                         printf("SLOT %d: ", (int) i);
1822
1823                         while (u) {
1824                                 printf("'");
1825                                 utf_display_printable_ascii(u);
1826                                 printf("' ");
1827                                 u = u->hashlink;
1828                         }       
1829                         printf("\n");
1830                 }
1831         }
1832
1833         printf("UTF-HASH: %d slots for %d entries\n", 
1834                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1835
1836         if (hashtable_utf->entries == 0)
1837                 return;
1838
1839         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1840
1841         for (i=0;i<CHAIN_LIMIT;i++)
1842                 chain_count[i]=0;
1843
1844         /* count numbers of hashchains according to their length */
1845         for (i=0; i<hashtable_utf->size; i++) {
1846                   
1847                 utf *u = (utf*) hashtable_utf->ptr[i];
1848                 u4 chain_length = 0;
1849
1850                 /* determine chainlength */
1851                 while (u) {
1852                         u = u->hashlink;
1853                         chain_length++;
1854                 }
1855
1856                 /* update sum of all chainlengths */
1857                 sum_chainlength+=chain_length;
1858
1859                 /* determine the maximum length of the chains */
1860                 if (chain_length>max_chainlength)
1861                         max_chainlength = chain_length;
1862
1863                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1864                 if (chain_length>=CHAIN_LIMIT) {
1865                         beyond_limit+=chain_length;
1866                         chain_length=CHAIN_LIMIT-1;
1867                 }
1868
1869                 /* update number of hashchains of current length */
1870                 chain_count[chain_length]++;
1871         }
1872
1873         /* display results */  
1874         for (i=1;i<CHAIN_LIMIT-1;i++) 
1875                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1876           
1877         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1878
1879
1880         printf("max. chainlength:%5d\n",max_chainlength);
1881
1882         /* avg. chainlength = sum of chainlengths / number of chains */
1883         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1884 }
1885 #endif /* !defined(NDEBUG) */
1886
1887
1888 /*
1889  * These are local overrides for various environment variables in Emacs.
1890  * Please do not remove this and leave it at the end of the file, where
1891  * Emacs will automagically detect them.
1892  * ---------------------------------------------------------------------
1893  * Local variables:
1894  * mode: c
1895  * indent-tabs-mode: t
1896  * c-basic-offset: 4
1897  * tab-width: 4
1898  * End:
1899  * vim:noexpandtab:sw=4:ts=4:
1900  */