Merged revisions 8056-8122 via svnmerge from
[cacao.git] / src / vmcore / utf8.c
1 /* src/vmcore/utf8.c - utf8 string functions
2
3    Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6    J. Wenninger, Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23    02110-1301, USA.
24
25    $Id: utf8.c 8123 2007-06-20 23:50:55Z michi $
26
27 */
28
29
30 #include "config.h"
31
32 #include <string.h>
33 #include <assert.h>
34
35 #include "vm/types.h"
36
37 #include "mm/memory.h"
38
39 #include "threads/lock-common.h"
40
41 #include "toolbox/hashtable.h"
42
43 #include "vm/exceptions.h"
44
45 #include "vmcore/options.h"
46
47 #if defined(ENABLE_STATISTICS)
48 # include "vmcore/statistics.h"
49 #endif
50
51 #include "vmcore/utf8.h"
52
53
54 /* global variables ***********************************************************/
55
56 /* hashsize must be power of 2 */
57
58 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
59
60 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
61
62
63 /* utf-symbols for pointer comparison of frequently used strings **************/
64
65 utf *utf_java_lang_Object;
66
67 utf *utf_java_lang_Class;
68 utf *utf_java_lang_ClassLoader;
69 utf *utf_java_lang_Cloneable;
70 utf *utf_java_lang_SecurityManager;
71 utf *utf_java_lang_String;
72 utf *utf_java_lang_System;
73 utf *utf_java_lang_ThreadGroup;
74 utf *utf_java_lang_ref_SoftReference;
75 utf *utf_java_lang_ref_WeakReference;
76 utf *utf_java_lang_ref_PhantomReference;
77 utf *utf_java_io_Serializable;
78
79 utf *utf_java_lang_Throwable;
80 utf *utf_java_lang_Error;
81
82 utf *utf_java_lang_AbstractMethodError;
83 utf *utf_java_lang_ClassCircularityError;
84 utf *utf_java_lang_ClassFormatError;
85 utf *utf_java_lang_ExceptionInInitializerError;
86 utf *utf_java_lang_IncompatibleClassChangeError;
87 utf *utf_java_lang_InstantiationError;
88 utf *utf_java_lang_InternalError;
89 utf *utf_java_lang_LinkageError;
90 utf *utf_java_lang_NoClassDefFoundError;
91 utf *utf_java_lang_NoSuchFieldError;
92 utf *utf_java_lang_NoSuchMethodError;
93 utf *utf_java_lang_OutOfMemoryError;
94 utf *utf_java_lang_UnsatisfiedLinkError;
95 utf *utf_java_lang_UnsupportedClassVersionError;
96 utf *utf_java_lang_VerifyError;
97 utf *utf_java_lang_VirtualMachineError;
98
99 #if defined(WITH_CLASSPATH_GNU)
100 utf *utf_java_lang_VMThrowable;
101 #endif
102
103 utf *utf_java_lang_Exception;
104
105 utf *utf_java_lang_ArithmeticException;
106 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
107 utf *utf_java_lang_ArrayStoreException;
108 utf *utf_java_lang_ClassCastException;
109 utf *utf_java_lang_ClassNotFoundException;
110 utf *utf_java_lang_CloneNotSupportedException;
111 utf *utf_java_lang_IllegalAccessException;
112 utf *utf_java_lang_IllegalArgumentException;
113 utf *utf_java_lang_IllegalMonitorStateException;
114 utf *utf_java_lang_InstantiationException;
115 utf *utf_java_lang_InterruptedException;
116 utf *utf_java_lang_NegativeArraySizeException;
117 utf *utf_java_lang_NullPointerException;
118 utf *utf_java_lang_StringIndexOutOfBoundsException;
119
120 utf *utf_java_lang_reflect_InvocationTargetException;
121
122 utf *utf_java_security_PrivilegedActionException;
123
124 #if defined(ENABLE_JAVASE)
125 utf* utf_java_lang_Void;
126 #endif
127
128 utf* utf_java_lang_Boolean;
129 utf* utf_java_lang_Byte;
130 utf* utf_java_lang_Character;
131 utf* utf_java_lang_Short;
132 utf* utf_java_lang_Integer;
133 utf* utf_java_lang_Long;
134 utf* utf_java_lang_Float;
135 utf* utf_java_lang_Double;
136
137 #if defined(ENABLE_JAVASE)
138 utf *utf_java_lang_StackTraceElement;
139 utf *utf_java_lang_reflect_Constructor;
140 utf *utf_java_lang_reflect_Field;
141 utf *utf_java_lang_reflect_Method;
142 utf *utf_java_util_Vector;
143 #endif
144
145 utf *utf_InnerClasses;                  /* InnerClasses                       */
146 utf *utf_ConstantValue;                 /* ConstantValue                      */
147 utf *utf_Code;                          /* Code                               */
148 utf *utf_Exceptions;                    /* Exceptions                         */
149 utf *utf_LineNumberTable;               /* LineNumberTable                    */
150 utf *utf_SourceFile;                    /* SourceFile                         */
151
152 #if defined(ENABLE_JAVASE)
153 utf *utf_EnclosingMethod;
154 utf *utf_Signature;
155 utf *utf_RuntimeVisibleAnnotations;
156 utf *utf_StackMapTable;
157 #endif
158
159 utf *utf_init;                          /* <init>                             */
160 utf *utf_clinit;                        /* <clinit>                           */
161 utf *utf_clone;                         /* clone                              */
162 utf *utf_finalize;                      /* finalize                           */
163 utf *utf_run;                           /* run                                */
164
165 utf *utf_add;
166 utf *utf_remove;
167 utf *utf_addThread;
168 utf *utf_removeThread;
169 utf *utf_put;
170 utf *utf_get;
171 utf *utf_value;
172
173 utf *utf_fillInStackTrace;
174 utf *utf_getSystemClassLoader;
175 utf *utf_initCause;
176 utf *utf_loadClass;
177 utf *utf_printStackTrace;
178
179 utf *utf_division_by_zero;
180
181 utf *utf_Z;                             /* Z                                  */
182 utf *utf_B;                             /* B                                  */
183 utf *utf_C;                             /* C                                  */
184 utf *utf_S;                             /* S                                  */
185 utf *utf_I;                             /* I                                  */
186 utf *utf_J;                             /* J                                  */
187 utf *utf_F;                             /* F                                  */
188 utf *utf_D;                             /* D                                  */
189
190 utf *utf_void__void;                    /* ()V                                */
191 utf *utf_boolean__void;                 /* (Z)V                               */
192 utf *utf_byte__void;                    /* (B)V                               */
193 utf *utf_char__void;                    /* (C)V                               */
194 utf *utf_short__void;                   /* (S)V                               */
195 utf *utf_int__void;                     /* (I)V                               */
196 utf *utf_long__void;                    /* (J)V                               */
197 utf *utf_float__void;                   /* (F)V                               */
198 utf *utf_double__void;                  /* (D)V                               */
199
200 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
201 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
202 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
203 utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
204 utf *utf_java_lang_Object__java_lang_Object;
205 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
206 utf *utf_java_lang_String__java_lang_Class;
207 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
208 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
209 utf *utf_java_lang_Throwable__java_lang_Throwable;
210
211 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
212 utf *utf_null;
213 utf *array_packagename;
214
215
216 /* utf_init ********************************************************************
217
218    Initializes the utf8 subsystem.
219
220 *******************************************************************************/
221
222 bool utf8_init(void)
223 {
224         /* create utf8 hashtable */
225
226         hashtable_utf = NEW(hashtable);
227
228         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
229
230 #if defined(ENABLE_STATISTICS)
231         if (opt_stat)
232                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
233 #endif
234
235         /* create utf-symbols for pointer comparison of frequently used strings */
236
237         utf_java_lang_Object           = utf_new_char("java/lang/Object");
238
239         utf_java_lang_Class            = utf_new_char("java/lang/Class");
240         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
241         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
242         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
243         utf_java_lang_String           = utf_new_char("java/lang/String");
244         utf_java_lang_System           = utf_new_char("java/lang/System");
245         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
246
247         utf_java_lang_ref_SoftReference =
248                 utf_new_char("java/lang/ref/SoftReference");
249
250         utf_java_lang_ref_WeakReference =
251                 utf_new_char("java/lang/ref/WeakReference");
252
253         utf_java_lang_ref_PhantomReference =
254                 utf_new_char("java/lang/ref/PhantomReference");
255
256         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
257
258         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
259         utf_java_lang_Error            = utf_new_char("java/lang/Error");
260
261         utf_java_lang_ClassCircularityError =
262                 utf_new_char("java/lang/ClassCircularityError");
263
264         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
265
266         utf_java_lang_ExceptionInInitializerError =
267                 utf_new_char("java/lang/ExceptionInInitializerError");
268
269         utf_java_lang_IncompatibleClassChangeError =
270                 utf_new_char("java/lang/IncompatibleClassChangeError");
271
272         utf_java_lang_InstantiationError =
273                 utf_new_char("java/lang/InstantiationError");
274
275         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
276         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
277
278         utf_java_lang_NoClassDefFoundError =
279                 utf_new_char("java/lang/NoClassDefFoundError");
280
281         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
282
283         utf_java_lang_UnsatisfiedLinkError =
284                 utf_new_char("java/lang/UnsatisfiedLinkError");
285
286         utf_java_lang_UnsupportedClassVersionError =
287                 utf_new_char("java/lang/UnsupportedClassVersionError");
288
289         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
290
291         utf_java_lang_VirtualMachineError =
292                 utf_new_char("java/lang/VirtualMachineError");
293
294 #if defined(ENABLE_JAVASE)
295         utf_java_lang_AbstractMethodError =
296                 utf_new_char("java/lang/AbstractMethodError");
297
298         utf_java_lang_NoSuchFieldError =
299                 utf_new_char("java/lang/NoSuchFieldError");
300
301         utf_java_lang_NoSuchMethodError =
302                 utf_new_char("java/lang/NoSuchMethodError");
303 #endif
304
305 #if defined(WITH_CLASSPATH_GNU)
306         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
307 #endif
308
309         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
310
311         utf_java_lang_ArithmeticException =
312                 utf_new_char("java/lang/ArithmeticException");
313
314         utf_java_lang_ArrayIndexOutOfBoundsException =
315                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
316
317         utf_java_lang_ArrayStoreException =
318                 utf_new_char("java/lang/ArrayStoreException");
319
320         utf_java_lang_ClassCastException =
321                 utf_new_char("java/lang/ClassCastException");
322
323         utf_java_lang_ClassNotFoundException =
324                 utf_new_char("java/lang/ClassNotFoundException");
325
326         utf_java_lang_CloneNotSupportedException =
327                 utf_new_char("java/lang/CloneNotSupportedException");
328
329         utf_java_lang_IllegalAccessException =
330                 utf_new_char("java/lang/IllegalAccessException");
331
332         utf_java_lang_IllegalArgumentException =
333                 utf_new_char("java/lang/IllegalArgumentException");
334
335         utf_java_lang_IllegalMonitorStateException =
336                 utf_new_char("java/lang/IllegalMonitorStateException");
337
338         utf_java_lang_InstantiationException =
339                 utf_new_char("java/lang/InstantiationException");
340
341         utf_java_lang_InterruptedException =
342                 utf_new_char("java/lang/InterruptedException");
343
344         utf_java_lang_NegativeArraySizeException =
345                 utf_new_char("java/lang/NegativeArraySizeException");
346
347         utf_java_lang_NullPointerException =
348                 utf_new_char("java/lang/NullPointerException");
349
350         utf_java_lang_StringIndexOutOfBoundsException =
351                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
352
353         utf_java_lang_reflect_InvocationTargetException =
354                 utf_new_char("java/lang/reflect/InvocationTargetException");
355
356         utf_java_security_PrivilegedActionException =
357                 utf_new_char("java/security/PrivilegedActionException");
358  
359 #if defined(ENABLE_JAVASE)
360         utf_java_lang_Void             = utf_new_char("java/lang/Void");
361 #endif
362
363         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
364         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
365         utf_java_lang_Character        = utf_new_char("java/lang/Character");
366         utf_java_lang_Short            = utf_new_char("java/lang/Short");
367         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
368         utf_java_lang_Long             = utf_new_char("java/lang/Long");
369         utf_java_lang_Float            = utf_new_char("java/lang/Float");
370         utf_java_lang_Double           = utf_new_char("java/lang/Double");
371
372 #if defined(ENABLE_JAVASE)
373         utf_java_lang_StackTraceElement =
374                 utf_new_char("java/lang/StackTraceElement");
375
376         utf_java_lang_reflect_Constructor =
377                 utf_new_char("java/lang/reflect/Constructor");
378
379         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
380         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
381         utf_java_util_Vector           = utf_new_char("java/util/Vector");
382 #endif
383
384         utf_InnerClasses               = utf_new_char("InnerClasses");
385         utf_ConstantValue              = utf_new_char("ConstantValue");
386         utf_Code                       = utf_new_char("Code");
387         utf_Exceptions                 = utf_new_char("Exceptions");
388         utf_LineNumberTable            = utf_new_char("LineNumberTable");
389         utf_SourceFile                 = utf_new_char("SourceFile");
390
391 #if defined(ENABLE_JAVASE)
392         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
393         utf_Signature                  = utf_new_char("Signature");
394         utf_RuntimeVisibleAnnotations  = utf_new_char("RuntimeVisibleAnnotations");
395         utf_StackMapTable              = utf_new_char("StackMapTable");
396 #endif
397
398         utf_init                           = utf_new_char("<init>");
399         utf_clinit                         = utf_new_char("<clinit>");
400         utf_clone                      = utf_new_char("clone");
401         utf_finalize                   = utf_new_char("finalize");
402         utf_run                        = utf_new_char("run");
403
404         utf_add                        = utf_new_char("add");
405         utf_remove                     = utf_new_char("remove");
406         utf_addThread                  = utf_new_char("addThread");
407         utf_removeThread               = utf_new_char("removeThread");
408         utf_put                        = utf_new_char("put");
409         utf_get                        = utf_new_char("get");
410         utf_value                      = utf_new_char("value");
411
412         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
413         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
414         utf_initCause                  = utf_new_char("initCause");
415         utf_loadClass                  = utf_new_char("loadClass");
416         utf_printStackTrace            = utf_new_char("printStackTrace");
417
418         utf_division_by_zero           = utf_new_char("/ by zero");
419
420         utf_Z                          = utf_new_char("Z");
421         utf_B                          = utf_new_char("B");
422         utf_C                          = utf_new_char("C");
423         utf_S                          = utf_new_char("S");
424         utf_I                          = utf_new_char("I");
425         utf_J                          = utf_new_char("J");
426         utf_F                          = utf_new_char("F");
427         utf_D                          = utf_new_char("D");
428
429         utf_void__void                 = utf_new_char("()V");
430         utf_boolean__void              = utf_new_char("(Z)V");
431         utf_byte__void                 = utf_new_char("(B)V");
432         utf_char__void                 = utf_new_char("(C)V");
433         utf_short__void                = utf_new_char("(S)V");
434         utf_int__void                  = utf_new_char("(I)V");
435         utf_long__void                 = utf_new_char("(J)V");
436         utf_float__void                = utf_new_char("(F)V");
437         utf_double__void               = utf_new_char("(D)V");
438         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
439         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
440
441         utf_void__java_lang_ClassLoader =
442                 utf_new_char("()Ljava/lang/ClassLoader;");
443
444         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
445
446         utf_java_lang_Object__java_lang_Object =
447                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
448
449         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
450
451         utf_java_lang_String__java_lang_Class =
452                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
453
454         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
455         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
456
457         utf_java_lang_Throwable__java_lang_Throwable =
458                 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
459
460         utf_null                       = utf_new_char("null");
461         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
462         array_packagename              = utf_new_char("\t<the array package>");
463
464         /* everything's ok */
465
466         return true;
467 }
468
469
470 /* utf_hashkey *****************************************************************
471
472    The hashkey is computed from the utf-text by using up to 8
473    characters.  For utf-symbols longer than 15 characters 3 characters
474    are taken from the beginning and the end, 2 characters are taken
475    from the middle.
476
477 *******************************************************************************/
478
479 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
480 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
481
482 u4 utf_hashkey(const char *text, u4 length)
483 {
484         const char *start_pos = text;       /* pointer to utf text                */
485         u4 a;
486
487         switch (length) {
488         case 0: /* empty string */
489                 return 0;
490
491         case 1: return fbs(0);
492         case 2: return fbs(0) ^ nbs(3);
493         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
494         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
495         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
496         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
497         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
498         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
499
500         case 9:
501                 a = fbs(0);
502                 a ^= nbs(1);
503                 a ^= nbs(2);
504                 text++;
505                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
506
507         case 10:
508                 a = fbs(0);
509                 text++;
510                 a ^= nbs(2);
511                 a ^= nbs(3);
512                 a ^= nbs(4);
513                 text++;
514                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
515
516         case 11:
517                 a = fbs(0);
518                 text++;
519                 a ^= nbs(2);
520                 a ^= nbs(3);
521                 a ^= nbs(4);
522                 text++;
523                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
524
525         case 12:
526                 a = fbs(0);
527                 text += 2;
528                 a ^= nbs(2);
529                 a ^= nbs(3);
530                 text++;
531                 a ^= nbs(5);
532                 a ^= nbs(6);
533                 a ^= nbs(7);
534                 text++;
535                 return a ^ nbs(9) ^ nbs(10);
536
537         case 13:
538                 a = fbs(0);
539                 a ^= nbs(1);
540                 text++;
541                 a ^= nbs(3);
542                 a ^= nbs(4);
543                 text += 2;      
544                 a ^= nbs(7);
545                 a ^= nbs(8);
546                 text += 2;
547                 return a ^ nbs(9) ^ nbs(10);
548
549         case 14:
550                 a = fbs(0);
551                 text += 2;      
552                 a ^= nbs(3);
553                 a ^= nbs(4);
554                 text += 2;      
555                 a ^= nbs(7);
556                 a ^= nbs(8);
557                 text += 2;
558                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
559
560         case 15:
561                 a = fbs(0);
562                 text += 2;      
563                 a ^= nbs(3);
564                 a ^= nbs(4);
565                 text += 2;      
566                 a ^= nbs(7);
567                 a ^= nbs(8);
568                 text += 2;
569                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
570
571         default:  /* 3 characters from beginning */
572                 a = fbs(0);
573                 text += 2;
574                 a ^= nbs(3);
575                 a ^= nbs(4);
576
577                 /* 2 characters from middle */
578                 text = start_pos + (length / 2);
579                 a ^= fbs(5);
580                 text += 2;
581                 a ^= nbs(6);    
582
583                 /* 3 characters from end */
584                 text = start_pos + length - 4;
585
586                 a ^= fbs(7);
587                 text++;
588
589                 return a ^ nbs(10) ^ nbs(11);
590     }
591 }
592
593 /* utf_full_hashkey ************************************************************
594
595    This function computes a hash value using all bytes in the string.
596
597    The algorithm is the "One-at-a-time" algorithm as published
598    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
599
600 *******************************************************************************/
601
602 u4 utf_full_hashkey(const char *text, u4 length)
603 {
604         register const unsigned char *p = (const unsigned char *) text;
605         register u4 hash;
606         register u4 i;
607
608         hash = 0;
609         for (i=length; i--;)
610         {
611             hash += *p++;
612             hash += (hash << 10);
613             hash ^= (hash >> 6);
614         }
615         hash += (hash << 3);
616         hash ^= (hash >> 11);
617         hash += (hash << 15);
618
619         return hash;
620 }
621
622 /* unicode_hashkey *************************************************************
623
624    Compute the hashkey of a unicode string.
625
626 *******************************************************************************/
627
628 u4 unicode_hashkey(u2 *text, u2 len)
629 {
630         return utf_hashkey((char *) text, len);
631 }
632
633
634 /* utf_new *********************************************************************
635
636    Creates a new utf-symbol, the text of the symbol is passed as a
637    u1-array. The function searches the utf-hashtable for a utf-symbol
638    with this text. On success the element returned, otherwise a new
639    hashtable element is created.
640
641    If the number of entries in the hashtable exceeds twice the size of
642    the hashtable slots a reorganization of the hashtable is done and
643    the utf symbols are copied to a new hashtable with doubled size.
644
645 *******************************************************************************/
646
647 utf *utf_new(const char *text, u2 length)
648 {
649         u4 key;                             /* hashkey computed from utf-text     */
650         u4 slot;                            /* slot in hashtable                  */
651         utf *u;                             /* hashtable element                  */
652         u2 i;
653
654         LOCK_MONITOR_ENTER(hashtable_utf->header);
655
656 #if defined(ENABLE_STATISTICS)
657         if (opt_stat)
658                 count_utf_new++;
659 #endif
660
661         key  = utf_hashkey(text, length);
662         slot = key & (hashtable_utf->size - 1);
663         u    = hashtable_utf->ptr[slot];
664
665         /* search external hash chain for utf-symbol */
666
667         while (u) {
668                 if (u->blength == length) {
669                         /* compare text of hashtable elements */
670
671                         for (i = 0; i < length; i++)
672                                 if (text[i] != u->text[i])
673                                         goto nomatch;
674                         
675 #if defined(ENABLE_STATISTICS)
676                         if (opt_stat)
677                                 count_utf_new_found++;
678 #endif
679
680                         /* symbol found in hashtable */
681
682                         LOCK_MONITOR_EXIT(hashtable_utf->header);
683
684                         return u;
685                 }
686
687         nomatch:
688                 u = u->hashlink; /* next element in external chain */
689         }
690
691         /* location in hashtable found, create new utf element */
692
693         u = NEW(utf);
694
695         u->blength  = length;               /* length in bytes of utfstring       */
696         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
697         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
698
699         memcpy(u->text, text, length);      /* copy utf-text                      */
700         u->text[length] = '\0';
701
702 #if defined(ENABLE_STATISTICS)
703         if (opt_stat)
704                 count_utf_len += sizeof(utf) + length + 1;
705 #endif
706
707         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
708         hashtable_utf->entries++;           /* update number of entries           */
709
710         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
711
712         /* reorganization of hashtable, average length of the external
713            chains is approx. 2 */
714
715                 hashtable *newhash;                              /* the new hashtable */
716                 u4         i;
717                 utf       *u;
718                 utf       *nextu;
719                 u4         slot;
720
721                 /* create new hashtable, double the size */
722
723                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
724
725 #if defined(ENABLE_STATISTICS)
726                 if (opt_stat)
727                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
728 #endif
729
730                 /* transfer elements to new hashtable */
731
732                 for (i = 0; i < hashtable_utf->size; i++) {
733                         u = hashtable_utf->ptr[i];
734
735                         while (u) {
736                                 nextu = u->hashlink;
737                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
738                                                 
739                                 u->hashlink = (utf *) newhash->ptr[slot];
740                                 newhash->ptr[slot] = u;
741
742                                 /* follow link in external hash chain */
743
744                                 u = nextu;
745                         }
746                 }
747         
748                 /* dispose old table */
749
750                 hashtable_free(hashtable_utf);
751
752                 hashtable_utf = newhash;
753         }
754
755         LOCK_MONITOR_EXIT(hashtable_utf->header);
756
757         return u;
758 }
759
760
761 /* utf_new_u2 ******************************************************************
762
763    Make utf symbol from u2 array, if isclassname is true '.' is
764    replaced by '/'.
765
766 *******************************************************************************/
767
768 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
769 {
770         char *buffer;                   /* memory buffer for  unicode characters  */
771         char *pos;                      /* pointer to current position in buffer  */
772         u4 left;                        /* unicode characters left                */
773         u4 buflength;                   /* utf length in bytes of the u2 array    */
774         utf *result;                    /* resulting utf-string                   */
775         int i;          
776
777         /* determine utf length in bytes and allocate memory */
778
779         buflength = u2_utflength(unicode_pos, unicode_length); 
780         buffer    = MNEW(char, buflength);
781  
782         left = buflength;
783         pos  = buffer;
784
785         for (i = 0; i++ < unicode_length; unicode_pos++) {
786                 /* next unicode character */
787                 u2 c = *unicode_pos;
788                 
789                 if ((c != 0) && (c < 0x80)) {
790                         /* 1 character */       
791                         left--;
792                 if ((int) left < 0) break;
793                         /* convert classname */
794                         if (isclassname && c == '.')
795                                 *pos++ = '/';
796                         else
797                                 *pos++ = (char) c;
798
799                 } else if (c < 0x800) {             
800                         /* 2 characters */                              
801                 unsigned char high = c >> 6;
802                 unsigned char low  = c & 0x3F;
803                         left = left - 2;
804                 if ((int) left < 0) break;
805                 *pos++ = high | 0xC0; 
806                 *pos++ = low  | 0x80;     
807
808                 } else {         
809                 /* 3 characters */                              
810                 char low  = c & 0x3f;
811                 char mid  = (c >> 6) & 0x3F;
812                 char high = c >> 12;
813                         left = left - 3;
814                 if ((int) left < 0) break;
815                 *pos++ = high | 0xE0; 
816                 *pos++ = mid  | 0x80;  
817                 *pos++ = low  | 0x80;   
818                 }
819         }
820         
821         /* insert utf-string into symbol-table */
822         result = utf_new(buffer,buflength);
823
824         MFREE(buffer, char, buflength);
825
826         return result;
827 }
828
829
830 /* utf_new_char ****************************************************************
831
832    Creates a new utf symbol, the text for this symbol is passed as a
833    c-string ( = char* ).
834
835 *******************************************************************************/
836
837 utf *utf_new_char(const char *text)
838 {
839         return utf_new(text, strlen(text));
840 }
841
842
843 /* utf_new_char_classname ******************************************************
844
845    Creates a new utf symbol, the text for this symbol is passed as a
846    c-string ( = char* ) "." characters are going to be replaced by
847    "/". Since the above function is used often, this is a separte
848    function, instead of an if.
849
850 *******************************************************************************/
851
852 utf *utf_new_char_classname(const char *text)
853 {
854         if (strchr(text, '.')) {
855                 char *txt = strdup(text);
856                 char *end = txt + strlen(txt);
857                 char *c;
858                 utf *tmpRes;
859
860                 for (c = txt; c < end; c++)
861                         if (*c == '.') *c = '/';
862
863                 tmpRes = utf_new(txt, strlen(txt));
864                 FREE(txt, 0);
865
866                 return tmpRes;
867
868         } else
869                 return utf_new(text, strlen(text));
870 }
871
872
873 /* utf_nextu2 ******************************************************************
874
875    Read the next unicode character from the utf string and increment
876    the utf-string pointer accordingly.
877
878    CAUTION: This function is unsafe for input that was not checked 
879             by is_valid_utf!
880
881 *******************************************************************************/
882
883 u2 utf_nextu2(char **utf_ptr)
884 {
885     /* uncompressed unicode character */
886     u2 unicode_char = 0;
887     /* current position in utf text */  
888     unsigned char *utf = (unsigned char *) (*utf_ptr);
889     /* bytes representing the unicode character */
890     unsigned char ch1, ch2, ch3;
891     /* number of bytes used to represent the unicode character */
892     int len = 0;
893         
894     switch ((ch1 = utf[0]) >> 4) {
895         default: /* 1 byte */
896                 (*utf_ptr)++;
897                 return (u2) ch1;
898         case 0xC: 
899         case 0xD: /* 2 bytes */
900                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
901                         unsigned char high = ch1 & 0x1F;
902                         unsigned char low  = ch2 & 0x3F;
903                         unicode_char = (high << 6) + low;
904                         len = 2;
905                 }
906                 break;
907
908         case 0xE: /* 2 or 3 bytes */
909                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
910                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
911                                 unsigned char low  = ch3 & 0x3f;
912                                 unsigned char mid  = ch2 & 0x3f;
913                                 unsigned char high = ch1 & 0x0f;
914                                 unicode_char = (((high << 6) + mid) << 6) + low;
915                                 len = 3;
916                         } else
917                                 len = 2;                                           
918                 }
919                 break;
920     }
921
922     /* update position in utf-text */
923     *utf_ptr = (char *) (utf + len);
924
925     return unicode_char;
926 }
927
928
929 /* utf_bytes *******************************************************************
930
931    Determine number of bytes (aka. octets) in the utf string.
932
933    IN:
934       u............utf string
935
936    OUT:
937       The number of octets of this utf string.
938           There is _no_ terminating zero included in this count.
939
940 *******************************************************************************/
941
942 u4 utf_bytes(utf *u)
943 {
944         return u->blength;
945 }
946
947
948 /* utf_get_number_of_u2s_for_buffer ********************************************
949
950    Determine number of UTF-16 u2s in the given UTF-8 buffer
951
952    CAUTION: This function is unsafe for input that was not checked 
953             by is_valid_utf!
954
955    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
956    to an array of u2s (UTF-16) and want to know how many of them you will get.
957    All other uses of this function are probably wrong.
958
959    IN:
960       buffer........points to first char in buffer
961           blength.......number of _bytes_ in the buffer
962
963    OUT:
964       the number of u2s needed to hold this string in UTF-16 encoding.
965           There is _no_ terminating zero included in this count.
966
967    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
968    exception.
969
970 *******************************************************************************/
971
972 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
973 {
974         const char *endpos;                 /* points behind utf string           */
975         const char *utf_ptr;                /* current position in utf text       */
976         u4 len = 0;                         /* number of unicode characters       */
977
978         utf_ptr = buffer;
979         endpos = utf_ptr + blength;
980
981         while (utf_ptr < endpos) {
982                 len++;
983                 /* next unicode character */
984                 utf_nextu2((char **)&utf_ptr);
985         }
986
987         assert(utf_ptr == endpos);
988
989         return len;
990 }
991
992
993 /* utf_get_number_of_u2s *******************************************************
994
995    Determine number of UTF-16 u2s in the utf string.
996
997    CAUTION: This function is unsafe for input that was not checked 
998             by is_valid_utf!
999
1000    CAUTION: Use this function *only* when you want to convert a utf string
1001    to an array of u2s and want to know how many of them you will get.
1002    All other uses of this function are probably wrong.
1003
1004    IN:
1005       u............utf string
1006
1007    OUT:
1008       the number of u2s needed to hold this string in UTF-16 encoding.
1009           There is _no_ terminating zero included in this count.
1010           XXX 0 if a NullPointerException has been thrown (see below)
1011
1012 *******************************************************************************/
1013
1014 u4 utf_get_number_of_u2s(utf *u)
1015 {
1016         char *endpos;                       /* points behind utf string           */
1017         char *utf_ptr;                      /* current position in utf text       */
1018         u4 len = 0;                         /* number of unicode characters       */
1019
1020         /* XXX this is probably not checked by most callers! Review this after */
1021         /* the invalid uses of this function have been eliminated */
1022         if (u == NULL) {
1023                 exceptions_throw_nullpointerexception();
1024                 return 0;
1025         }
1026
1027         endpos = UTF_END(u);
1028         utf_ptr = u->text;
1029
1030         while (utf_ptr < endpos) {
1031                 len++;
1032                 /* next unicode character */
1033                 utf_nextu2(&utf_ptr);
1034         }
1035
1036         if (utf_ptr != endpos) {
1037                 /* string ended abruptly */
1038                 exceptions_throw_internalerror("Illegal utf8 string");
1039                 return 0;
1040         }
1041
1042         return len;
1043 }
1044
1045
1046 /* utf8_safe_number_of_u2s *****************************************************
1047
1048    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1049    (For invalid UTF-8 the U+fffd replacement character will be counted.)
1050
1051    This function is safe even for invalid UTF-8 strings.
1052
1053    IN:
1054       text..........zero-terminated(!) UTF-8 string (may be invalid)
1055                         must NOT be NULL
1056           nbytes........strlen(text). (This is needed to completely emulate
1057                         the RI).
1058
1059    OUT:
1060       the number of u2s needed to hold this string in UTF-16 encoding.
1061           There is _no_ terminating zero included in this count.
1062
1063 *******************************************************************************/
1064
1065 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1066         register const unsigned char *t;
1067         register s4 byte;
1068         register s4 len;
1069         register const unsigned char *tlimit;
1070         s4 byte1;
1071         s4 byte2;
1072         s4 byte3;
1073         s4 value;
1074         s4 skip;
1075
1076         assert(text);
1077         assert(nbytes >= 0);
1078
1079         len = 0;
1080         t = (const unsigned char *) text;
1081         tlimit = t + nbytes;
1082
1083         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1084
1085         while (1) {
1086                 byte = *t++;
1087
1088                 if (byte & 0x80) {
1089                         /* highest bit set, non-ASCII character */
1090
1091                         if ((byte & 0xe0) == 0xc0) {
1092                                 /* 2-byte: should be 110..... 10...... ? */
1093
1094                                 if ((*t++ & 0xc0) == 0x80)
1095                                         ; /* valid 2-byte */
1096                                 else
1097                                         t--; /* invalid */
1098                         }
1099                         else if ((byte & 0xf0) == 0xe0) {
1100                                 /* 3-byte: should be 1110.... 10...... 10...... */
1101                                 /*                            ^t                */
1102
1103                                 if (t + 2 > tlimit)
1104                                         return len + 1; /* invalid, stop here */
1105
1106                                 if ((*t++ & 0xc0) == 0x80) {
1107                                         if ((*t++ & 0xc0) == 0x80)
1108                                                 ; /* valid 3-byte */
1109                                         else
1110                                                 t--; /* invalid */
1111                                 }
1112                                 else
1113                                         t--; /* invalid */
1114                         }
1115                         else if ((byte & 0xf8) == 0xf0) {
1116                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1117                                 /*                            ^t                         */
1118
1119                                 if (t + 3 > tlimit)
1120                                         return len + 1; /* invalid, stop here */
1121
1122                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1123                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1124                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1125                                                         /* valid 4-byte UTF-8? */
1126                                                         value = ((byte  & 0x07) << 18)
1127                                                                   | ((byte1 & 0x3f) << 12)
1128                                                                   | ((byte2 & 0x3f) <<  6)
1129                                                                   | ((byte3 & 0x3f)      );
1130
1131                                                         if (value > 0x10FFFF)
1132                                                                 ; /* invalid */
1133                                                         else if (value > 0xFFFF)
1134                                                                 len += 1; /* we need surrogates */
1135                                                         else
1136                                                                 ; /* 16bit suffice */
1137                                                 }
1138                                                 else
1139                                                         t--; /* invalid */
1140                                         }
1141                                         else
1142                                                 t--; /* invalid */
1143                                 }
1144                                 else
1145                                         t--; /* invalid */
1146                         }
1147                         else if ((byte & 0xfc) == 0xf8) {
1148                                 /* invalid 5-byte */
1149                                 if (t + 4 > tlimit)
1150                                         return len + 1; /* invalid, stop here */
1151
1152                                 skip = 4;
1153                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1154                                         t++;
1155                         }
1156                         else if ((byte & 0xfe) == 0xfc) {
1157                                 /* invalid 6-byte */
1158                                 if (t + 5 > tlimit)
1159                                         return len + 1; /* invalid, stop here */
1160
1161                                 skip = 5;
1162                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1163                                         t++;
1164                         }
1165                         else
1166                                 ; /* invalid */
1167                 }
1168                 else {
1169                         /* NUL */
1170
1171                         if (byte == 0)
1172                                 break;
1173
1174                         /* ASCII character, common case */
1175                 }
1176
1177                 len++;
1178         }
1179
1180         return len;
1181 }
1182
1183
1184 /* utf8_safe_convert_to_u2s ****************************************************
1185
1186    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1187    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1188    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1189
1190    This function is safe even for invalid UTF-8 strings.
1191
1192    IN:
1193       text..........zero-terminated(!) UTF-8 string (may be invalid)
1194                         must NOT be NULL
1195           nbytes........strlen(text). (This is needed to completely emulate
1196                                         the RI).
1197           buffer........a preallocated array of u2s to receive the decoded
1198                         string. Use utf8_safe_number_of_u2s to get the
1199                                         required number of u2s for allocating this.
1200
1201 *******************************************************************************/
1202
1203 #define UNICODE_REPLACEMENT  0xfffd
1204
1205 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1206         register const unsigned char *t;
1207         register s4 byte;
1208         register const unsigned char *tlimit;
1209         s4 byte1;
1210         s4 byte2;
1211         s4 byte3;
1212         s4 value;
1213         s4 skip;
1214
1215         assert(text);
1216         assert(nbytes >= 0);
1217
1218         t = (const unsigned char *) text;
1219         tlimit = t + nbytes;
1220
1221         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1222
1223         while (1) {
1224                 byte = *t++;
1225
1226                 if (byte & 0x80) {
1227                         /* highest bit set, non-ASCII character */
1228
1229                         if ((byte & 0xe0) == 0xc0) {
1230                                 /* 2-byte: should be 110..... 10...... */
1231
1232                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1233                                         /* valid 2-byte UTF-8 */
1234                                         *buffer++ = ((byte  & 0x1f) << 6)
1235                                                           | ((byte1 & 0x3f)     );
1236                                 }
1237                                 else {
1238                                         *buffer++ = UNICODE_REPLACEMENT;
1239                                         t--;
1240                                 }
1241                         }
1242                         else if ((byte & 0xf0) == 0xe0) {
1243                                 /* 3-byte: should be 1110.... 10...... 10...... */
1244
1245                                 if (t + 2 > tlimit) {
1246                                         *buffer++ = UNICODE_REPLACEMENT;
1247                                         return;
1248                                 }
1249
1250                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1251                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1252                                                 /* valid 3-byte UTF-8 */
1253                                                 *buffer++ = ((byte  & 0x0f) << 12)
1254                                                                   | ((byte1 & 0x3f) <<  6)
1255                                                                   | ((byte2 & 0x3f)      );
1256                                         }
1257                                         else {
1258                                                 *buffer++ = UNICODE_REPLACEMENT;
1259                                                 t--;
1260                                         }
1261                                 }
1262                                 else {
1263                                         *buffer++ = UNICODE_REPLACEMENT;
1264                                         t--;
1265                                 }
1266                         }
1267                         else if ((byte & 0xf8) == 0xf0) {
1268                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1269
1270                                 if (t + 3 > tlimit) {
1271                                         *buffer++ = UNICODE_REPLACEMENT;
1272                                         return;
1273                                 }
1274
1275                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1276                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1277                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1278                                                         /* valid 4-byte UTF-8? */
1279                                                         value = ((byte  & 0x07) << 18)
1280                                                                   | ((byte1 & 0x3f) << 12)
1281                                                                   | ((byte2 & 0x3f) <<  6)
1282                                                                   | ((byte3 & 0x3f)      );
1283
1284                                                         if (value > 0x10FFFF) {
1285                                                                 *buffer++ = UNICODE_REPLACEMENT;
1286                                                         }
1287                                                         else if (value > 0xFFFF) {
1288                                                                 /* we need surrogates */
1289                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1290                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1291                                                         }
1292                                                         else
1293                                                                 *buffer++ = value; /* 16bit suffice */
1294                                                 }
1295                                                 else {
1296                                                         *buffer++ = UNICODE_REPLACEMENT;
1297                                                         t--;
1298                                                 }
1299                                         }
1300                                         else {
1301                                                 *buffer++ = UNICODE_REPLACEMENT;
1302                                                 t--;
1303                                         }
1304                                 }
1305                                 else {
1306                                         *buffer++ = UNICODE_REPLACEMENT;
1307                                         t--;
1308                                 }
1309                         }
1310                         else if ((byte & 0xfc) == 0xf8) {
1311                                 if (t + 4 > tlimit) {
1312                                         *buffer++ = UNICODE_REPLACEMENT;
1313                                         return;
1314                                 }
1315
1316                                 skip = 4;
1317                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1318                                         t++;
1319                                 *buffer++ = UNICODE_REPLACEMENT;
1320                         }
1321                         else if ((byte & 0xfe) == 0xfc) {
1322                                 if (t + 5 > tlimit) {
1323                                         *buffer++ = UNICODE_REPLACEMENT;
1324                                         return;
1325                                 }
1326
1327                                 skip = 5;
1328                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1329                                         t++;
1330                                 *buffer++ = UNICODE_REPLACEMENT;
1331                         }
1332                         else
1333                                 *buffer++ = UNICODE_REPLACEMENT;
1334                 }
1335                 else {
1336                         /* NUL */
1337
1338                         if (byte == 0)
1339                                 break;
1340
1341                         /* ASCII character, common case */
1342
1343                         *buffer++ = byte;
1344                 }
1345         }
1346 }
1347
1348
1349 /* u2_utflength ****************************************************************
1350
1351    Returns the utf length in bytes of a u2 array.
1352
1353 *******************************************************************************/
1354
1355 u4 u2_utflength(u2 *text, u4 u2_length)
1356 {
1357         u4 result_len = 0;                  /* utf length in bytes                */
1358         u2 ch;                              /* current unicode character          */
1359         u4 len;
1360         
1361         for (len = 0; len < u2_length; len++) {
1362                 /* next unicode character */
1363                 ch = *text++;
1364           
1365                 /* determine bytes required to store unicode character as utf */
1366                 if (ch && (ch < 0x80)) 
1367                         result_len++;
1368                 else if (ch < 0x800)
1369                         result_len += 2;        
1370                 else 
1371                         result_len += 3;        
1372         }
1373
1374     return result_len;
1375 }
1376
1377
1378 /* utf_copy ********************************************************************
1379
1380    Copy the given utf string byte-for-byte to a buffer.
1381
1382    IN:
1383       buffer.......the buffer
1384           u............the utf string
1385
1386 *******************************************************************************/
1387
1388 void utf_copy(char *buffer, utf *u)
1389 {
1390         /* our utf strings are zero-terminated (done by utf_new) */
1391         MCOPY(buffer, u->text, char, u->blength + 1);
1392 }
1393
1394
1395 /* utf_cat *********************************************************************
1396
1397    Append the given utf string byte-for-byte to a buffer.
1398
1399    IN:
1400       buffer.......the buffer
1401           u............the utf string
1402
1403 *******************************************************************************/
1404
1405 void utf_cat(char *buffer, utf *u)
1406 {
1407         /* our utf strings are zero-terminated (done by utf_new) */
1408         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1409 }
1410
1411
1412 /* utf_copy_classname **********************************************************
1413
1414    Copy the given utf classname byte-for-byte to a buffer.
1415    '/' is replaced by '.'
1416
1417    IN:
1418       buffer.......the buffer
1419           u............the utf string
1420
1421 *******************************************************************************/
1422
1423 void utf_copy_classname(char *buffer, utf *u)
1424 {
1425         char *bufptr;
1426         char *srcptr;
1427         char *endptr;
1428         char ch;
1429
1430         bufptr = buffer;
1431         srcptr = u->text;
1432         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1433
1434         while (srcptr != endptr) {
1435                 ch = *srcptr++;
1436                 if (ch == '/')
1437                         ch = '.';
1438                 *bufptr++ = ch;
1439         }
1440 }
1441
1442
1443 /* utf_cat *********************************************************************
1444
1445    Append the given utf classname byte-for-byte to a buffer.
1446    '/' is replaced by '.'
1447
1448    IN:
1449       buffer.......the buffer
1450           u............the utf string
1451
1452 *******************************************************************************/
1453
1454 void utf_cat_classname(char *buffer, utf *u)
1455 {
1456         utf_copy_classname(buffer + strlen(buffer), u);
1457 }
1458
1459 /* utf_display_printable_ascii *************************************************
1460
1461    Write utf symbol to stdout (for debugging purposes).
1462    Non-printable and non-ASCII characters are printed as '?'.
1463
1464 *******************************************************************************/
1465
1466 void utf_display_printable_ascii(utf *u)
1467 {
1468         char *endpos;                       /* points behind utf string           */
1469         char *utf_ptr;                      /* current position in utf text       */
1470
1471         if (u == NULL) {
1472                 printf("NULL");
1473                 fflush(stdout);
1474                 return;
1475         }
1476
1477         endpos = UTF_END(u);
1478         utf_ptr = u->text;
1479
1480         while (utf_ptr < endpos) {
1481                 /* read next unicode character */
1482
1483                 u2 c = utf_nextu2(&utf_ptr);
1484
1485                 if ((c >= 32) && (c <= 127))
1486                         printf("%c", c);
1487                 else
1488                         printf("?");
1489         }
1490
1491         fflush(stdout);
1492 }
1493
1494
1495 /* utf_display_printable_ascii_classname ***************************************
1496
1497    Write utf symbol to stdout with `/' converted to `.' (for debugging
1498    purposes).
1499    Non-printable and non-ASCII characters are printed as '?'.
1500
1501 *******************************************************************************/
1502
1503 void utf_display_printable_ascii_classname(utf *u)
1504 {
1505         char *endpos;                       /* points behind utf string           */
1506         char *utf_ptr;                      /* current position in utf text       */
1507
1508         if (u == NULL) {
1509                 printf("NULL");
1510                 fflush(stdout);
1511                 return;
1512         }
1513
1514         endpos = UTF_END(u);
1515         utf_ptr = u->text;
1516
1517         while (utf_ptr < endpos) {
1518                 /* read next unicode character */
1519
1520                 u2 c = utf_nextu2(&utf_ptr);
1521
1522                 if (c == '/')
1523                         c = '.';
1524
1525                 if ((c >= 32) && (c <= 127))
1526                         printf("%c", c);
1527                 else
1528                         printf("?");
1529         }
1530
1531         fflush(stdout);
1532 }
1533
1534
1535 /* utf_sprint_convert_to_latin1 ************************************************
1536         
1537    Write utf symbol into c-string (for debugging purposes).
1538    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1539    invalid results.
1540
1541 *******************************************************************************/
1542
1543 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1544 {
1545         char *endpos;                       /* points behind utf string           */
1546         char *utf_ptr;                      /* current position in utf text       */
1547         u2 pos = 0;                         /* position in c-string               */
1548
1549         if (!u) {
1550                 strcpy(buffer, "NULL");
1551                 return;
1552         }
1553
1554         endpos = UTF_END(u);
1555         utf_ptr = u->text;
1556
1557         while (utf_ptr < endpos) 
1558                 /* copy next unicode character */       
1559                 buffer[pos++] = utf_nextu2(&utf_ptr);
1560
1561         /* terminate string */
1562         buffer[pos] = '\0';
1563 }
1564
1565
1566 /* utf_sprint_convert_to_latin1_classname **************************************
1567         
1568    Write utf symbol into c-string with `/' converted to `.' (for debugging
1569    purposes).
1570    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1571    invalid results.
1572
1573 *******************************************************************************/
1574
1575 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1576 {
1577         char *endpos;                       /* points behind utf string           */
1578         char *utf_ptr;                      /* current position in utf text       */
1579         u2 pos = 0;                         /* position in c-string               */
1580
1581         if (!u) {
1582                 strcpy(buffer, "NULL");
1583                 return;
1584         }
1585
1586         endpos = UTF_END(u);
1587         utf_ptr = u->text;
1588
1589         while (utf_ptr < endpos) {
1590                 /* copy next unicode character */       
1591                 u2 c = utf_nextu2(&utf_ptr);
1592                 if (c == '/') c = '.';
1593                 buffer[pos++] = c;
1594         }
1595
1596         /* terminate string */
1597         buffer[pos] = '\0';
1598 }
1599
1600
1601 /* utf_strcat_convert_to_latin1 ************************************************
1602         
1603    Like libc strcat, but uses an utf8 string.
1604    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1605    invalid results.
1606
1607 *******************************************************************************/
1608
1609 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1610 {
1611         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1612 }
1613
1614
1615 /* utf_strcat_convert_to_latin1_classname **************************************
1616         
1617    Like libc strcat, but uses an utf8 string.
1618    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1619    invalid results.
1620
1621 *******************************************************************************/
1622
1623 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1624 {
1625         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1626 }
1627
1628
1629 /* utf_fprint_printable_ascii **************************************************
1630         
1631    Write utf symbol into file.
1632    Non-printable and non-ASCII characters are printed as '?'.
1633
1634 *******************************************************************************/
1635
1636 void utf_fprint_printable_ascii(FILE *file, utf *u)
1637 {
1638         char *endpos;                       /* points behind utf string           */
1639         char *utf_ptr;                      /* current position in utf text       */
1640
1641         if (!u)
1642                 return;
1643
1644         endpos = UTF_END(u);
1645         utf_ptr = u->text;
1646
1647         while (utf_ptr < endpos) { 
1648                 /* read next unicode character */                
1649                 u2 c = utf_nextu2(&utf_ptr);                            
1650
1651                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1652                 else fprintf(file, "?");
1653         }
1654 }
1655
1656
1657 /* utf_fprint_printable_ascii_classname ****************************************
1658         
1659    Write utf symbol into file with `/' converted to `.'.
1660    Non-printable and non-ASCII characters are printed as '?'.
1661
1662 *******************************************************************************/
1663
1664 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1665 {
1666         char *endpos;                       /* points behind utf string           */
1667         char *utf_ptr;                      /* current position in utf text       */
1668
1669     if (!u)
1670                 return;
1671
1672         endpos = UTF_END(u);
1673         utf_ptr = u->text;
1674
1675         while (utf_ptr < endpos) { 
1676                 /* read next unicode character */                
1677                 u2 c = utf_nextu2(&utf_ptr);                            
1678                 if (c == '/') c = '.';
1679
1680                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1681                 else fprintf(file, "?");
1682         }
1683 }
1684
1685
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string in the sense
   accepted for Java classfiles:

   - a plain 0x00 byte is not allowed; zero has to be encoded as the
     two-byte sequence 0xc0 0x80 (Java special)
   - only one-, two- and three-byte sequences are accepted (Java
     limitation); longer sequences and invalid leading bytes are
     rejected
   - every sequence must be complete and each trailing byte must have
     the form 10xx xxxx
   - a three-byte sequence that decodes to zero is rejected

   Deliberately NOT rejected, since Sun Java seems to allow them:
   overlong encodings of non-zero values, surrogates (0xd800..0xdfff)
   and the code points 0xfffe / 0xffff.

   utf_ptr...points to first character
   end_pos...points after last character; if it lies before utf_ptr the
             result is false, an empty range is valid

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	if (end_pos < utf_ptr)
		return false;

	while (utf_ptr < end_pos) {
		unsigned char lead = (unsigned char) *utf_ptr++;
		unsigned long value;            /* decoded code point                 */
		int           trail;            /* number of trailing bytes           */
		int           k;

		if (lead == 0x00)
			return false;               /* 0x00 is not allowed                */

		if (lead < 0x80)
			continue;                   /* ASCII                              */

		if ((lead & 0xe0) == 0xc0)
			trail = 1;                  /* 110x xxxx                          */
		else if ((lead & 0xf0) == 0xe0)
			trail = 2;                  /* 1110 xxxx                          */
		else
			return false;               /* longer sequence (Java limitation)  */
			                            /* or invalid leading byte            */

		if ((end_pos - utf_ptr) < trail)
			return false;               /* missing bytes                      */

		/* payload bits of the leading byte */

		value = (unsigned long) (lead & (0x3f >> trail));

		for (k = 0; k < trail; k++) {
			unsigned char next = (unsigned char) *utf_ptr++;

			if ((next & 0xc0) != 0x80)  /* 10xx xxxx                          */
				return false;

			value = (value << 6) | (unsigned long) (next & 0x3f);
		}

		/* zero may only appear in its two-byte form (Java special) */

		if ((value == 0) && (trail != 1))
			return false;
	}

	return true;
}
1751
1752
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters, i.e. bytes below 0x20, and the encoded zero 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   The byte following a 0xc0 is only inspected if it still lies inside
   the given range, so a string ending in a lone 0xc0 (which would not
   have passed is_valid_utf) never causes a read at or behind end_pos.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names               */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters        */

		/* disallow zero (0xc0 0x80); peek at the next byte only while it
		   is part of the string */

		if ((c == 0xc0) && (utf_ptr < end_pos) &&
			((unsigned char) *utf_ptr == 0x80))
			return false;
	}

	return true;
}
1780
1781 bool is_valid_name_utf(utf *u)
1782 {
1783         return is_valid_name(u->text, UTF_END(u));
1784 }
1785
1786
1787 /* utf_show ********************************************************************
1788
1789    Writes the utf symbols in the utfhash to stdout and displays the
1790    number of external hash chains grouped according to the chainlength
1791    (for debugging purposes).
1792
1793 *******************************************************************************/
1794
1795 #if !defined(NDEBUG)
1796 void utf_show(void)
1797 {
1798
1799 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1800
1801         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1802         u4 max_chainlength = 0;      /* maximum length of the chains */
1803         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1804         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1805         u4 i;
1806
1807         printf("UTF-HASH:\n");
1808
1809         /* show element of utf-hashtable */
1810
1811         for (i = 0; i < hashtable_utf->size; i++) {
1812                 utf *u = hashtable_utf->ptr[i];
1813
1814                 if (u) {
1815                         printf("SLOT %d: ", (int) i);
1816
1817                         while (u) {
1818                                 printf("'");
1819                                 utf_display_printable_ascii(u);
1820                                 printf("' ");
1821                                 u = u->hashlink;
1822                         }       
1823                         printf("\n");
1824                 }
1825         }
1826
1827         printf("UTF-HASH: %d slots for %d entries\n", 
1828                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1829
1830         if (hashtable_utf->entries == 0)
1831                 return;
1832
1833         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1834
1835         for (i=0;i<CHAIN_LIMIT;i++)
1836                 chain_count[i]=0;
1837
1838         /* count numbers of hashchains according to their length */
1839         for (i=0; i<hashtable_utf->size; i++) {
1840                   
1841                 utf *u = (utf*) hashtable_utf->ptr[i];
1842                 u4 chain_length = 0;
1843
1844                 /* determine chainlength */
1845                 while (u) {
1846                         u = u->hashlink;
1847                         chain_length++;
1848                 }
1849
1850                 /* update sum of all chainlengths */
1851                 sum_chainlength+=chain_length;
1852
1853                 /* determine the maximum length of the chains */
1854                 if (chain_length>max_chainlength)
1855                         max_chainlength = chain_length;
1856
1857                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1858                 if (chain_length>=CHAIN_LIMIT) {
1859                         beyond_limit+=chain_length;
1860                         chain_length=CHAIN_LIMIT-1;
1861                 }
1862
1863                 /* update number of hashchains of current length */
1864                 chain_count[chain_length]++;
1865         }
1866
1867         /* display results */  
1868         for (i=1;i<CHAIN_LIMIT-1;i++) 
1869                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1870           
1871         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1872
1873
1874         printf("max. chainlength:%5d\n",max_chainlength);
1875
1876         /* avg. chainlength = sum of chainlengths / number of chains */
1877         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1878 }
1879 #endif /* !defined(NDEBUG) */
1880
1881
1882 /*
1883  * These are local overrides for various environment variables in Emacs.
1884  * Please do not remove this and leave it at the end of the file, where
1885  * Emacs will automagically detect them.
1886  * ---------------------------------------------------------------------
1887  * Local variables:
1888  * mode: c
1889  * indent-tabs-mode: t
1890  * c-basic-offset: 4
1891  * tab-width: 4
1892  * End:
1893  * vim:noexpandtab:sw=4:ts=4:
1894  */