3251e5924cf80de20bb7acf014205ec124f1b3e3
[cacao.git] / src / vmcore / utf8.c
1 /* src/vmcore/utf8.c - utf8 string functions
2
3    Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6    J. Wenninger, Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23    02110-1301, USA.
24
25    $Id: utf8.c 7716 2007-04-16 14:29:53Z twisti $
26
27 */
28
29
30 #include "config.h"
31
32 #include <string.h>
33 #include <assert.h>
34
35 #include "vm/types.h"
36
37 #include "mm/memory.h"
38
39 #if defined(ENABLE_THREADS)
40 # include "threads/native/lock.h"
41 #else
42 # include "threads/none/lock.h"
43 #endif
44
45 #include "toolbox/hashtable.h"
46
47 #include "vm/exceptions.h"
48
49 #include "vmcore/options.h"
50
51 #if defined(ENABLE_STATISTICS)
52 # include "vmcore/statistics.h"
53 #endif
54
55 #include "vmcore/utf8.h"
56
57
58 /* global variables ***********************************************************/
59
60 /* hashsize must be power of 2 */
61
62 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
63
64 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
65
66
67 /* utf-symbols for pointer comparison of frequently used strings **************/
68
69 utf *utf_java_lang_Object;
70
71 utf *utf_java_lang_Class;
72 utf *utf_java_lang_ClassLoader;
73 utf *utf_java_lang_Cloneable;
74 utf *utf_java_lang_SecurityManager;
75 utf *utf_java_lang_String;
76 utf *utf_java_lang_System;
77 utf *utf_java_lang_ThreadGroup;
78 utf *utf_java_lang_ref_SoftReference;
79 utf *utf_java_lang_ref_WeakReference;
80 utf *utf_java_lang_ref_PhantomReference;
81 utf *utf_java_io_Serializable;
82
83 utf *utf_java_lang_Throwable;
84 utf *utf_java_lang_Error;
85
86 utf *utf_java_lang_AbstractMethodError;
87 utf *utf_java_lang_ClassCircularityError;
88 utf *utf_java_lang_ClassFormatError;
89 utf *utf_java_lang_ExceptionInInitializerError;
90 utf *utf_java_lang_IncompatibleClassChangeError;
91 utf *utf_java_lang_InstantiationError;
92 utf *utf_java_lang_InternalError;
93 utf *utf_java_lang_LinkageError;
94 utf *utf_java_lang_NoClassDefFoundError;
95 utf *utf_java_lang_NoSuchFieldError;
96 utf *utf_java_lang_NoSuchMethodError;
97 utf *utf_java_lang_OutOfMemoryError;
98 utf *utf_java_lang_UnsatisfiedLinkError;
99 utf *utf_java_lang_UnsupportedClassVersionError;
100 utf *utf_java_lang_VerifyError;
101 utf *utf_java_lang_VirtualMachineError;
102
103 #if defined(WITH_CLASSPATH_GNU)
104 utf *utf_java_lang_VMThrowable;
105 #endif
106
107 utf *utf_java_lang_Exception;
108
109 utf *utf_java_lang_ArithmeticException;
110 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
111 utf *utf_java_lang_ArrayStoreException;
112 utf *utf_java_lang_ClassCastException;
113 utf *utf_java_lang_ClassNotFoundException;
114 utf *utf_java_lang_CloneNotSupportedException;
115 utf *utf_java_lang_IllegalAccessException;
116 utf *utf_java_lang_IllegalArgumentException;
117 utf *utf_java_lang_IllegalMonitorStateException;
118 utf *utf_java_lang_InstantiationException;
119 utf *utf_java_lang_InterruptedException;
120 utf *utf_java_lang_NegativeArraySizeException;
121 utf *utf_java_lang_NullPointerException;
122 utf *utf_java_lang_StringIndexOutOfBoundsException;
123
124 utf *utf_java_lang_reflect_InvocationTargetException;
125
126 #if defined(ENABLE_JAVASE)
127 utf* utf_java_lang_Void;
128 #endif
129
130 utf* utf_java_lang_Boolean;
131 utf* utf_java_lang_Byte;
132 utf* utf_java_lang_Character;
133 utf* utf_java_lang_Short;
134 utf* utf_java_lang_Integer;
135 utf* utf_java_lang_Long;
136 utf* utf_java_lang_Float;
137 utf* utf_java_lang_Double;
138
139 #if defined(ENABLE_JAVASE)
140 utf *utf_java_lang_StackTraceElement;
141 utf *utf_java_lang_reflect_Constructor;
142 utf *utf_java_lang_reflect_Field;
143 utf *utf_java_lang_reflect_Method;
144 utf *utf_java_util_Vector;
145 #endif
146
147 utf *utf_InnerClasses;                  /* InnerClasses                       */
148 utf *utf_ConstantValue;                 /* ConstantValue                      */
149 utf *utf_Code;                          /* Code                               */
150 utf *utf_Exceptions;                    /* Exceptions                         */
151 utf *utf_LineNumberTable;               /* LineNumberTable                    */
152 utf *utf_SourceFile;                    /* SourceFile                         */
153
154 #if defined(ENABLE_JAVASE)
155 utf *utf_EnclosingMethod;
156 utf *utf_Signature;
157 utf *utf_RuntimeVisibleAnnotations;
158 utf *utf_StackMapTable;
159 #endif
160
161 utf *utf_init;                          /* <init>                             */
162 utf *utf_clinit;                        /* <clinit>                           */
163 utf *utf_clone;                         /* clone                              */
164 utf *utf_finalize;                      /* finalize                           */
165 utf *utf_run;                           /* run                                */
166
167 utf *utf_add;
168 utf *utf_remove;
169 utf *utf_addThread;
170 utf *utf_removeThread;
171 utf *utf_put;
172 utf *utf_get;
173 utf *utf_value;
174
175 utf *utf_fillInStackTrace;
176 utf *utf_getSystemClassLoader;
177 utf *utf_loadClass;
178 utf *utf_printStackTrace;
179
180 utf *utf_Z;                             /* Z                                  */
181 utf *utf_B;                             /* B                                  */
182 utf *utf_C;                             /* C                                  */
183 utf *utf_S;                             /* S                                  */
184 utf *utf_I;                             /* I                                  */
185 utf *utf_J;                             /* J                                  */
186 utf *utf_F;                             /* F                                  */
187 utf *utf_D;                             /* D                                  */
188
189 utf *utf_void__void;                    /* ()V                                */
190 utf *utf_boolean__void;                 /* (Z)V                               */
191 utf *utf_byte__void;                    /* (B)V                               */
192 utf *utf_char__void;                    /* (C)V                               */
193 utf *utf_short__void;                   /* (S)V                               */
194 utf *utf_int__void;                     /* (I)V                               */
195 utf *utf_long__void;                    /* (J)V                               */
196 utf *utf_float__void;                   /* (F)V                               */
197 utf *utf_double__void;                  /* (D)V                               */
198
199 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
200 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
201 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
202 utf *utf_java_lang_Object__java_lang_Object;
203 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
204 utf *utf_java_lang_String__java_lang_Class;
205 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
206 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
207
208 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
209 utf *utf_null;
210 utf *array_packagename;
211
212
213 /* utf_init ********************************************************************
214
215    Initializes the utf8 subsystem.
216
217 *******************************************************************************/
218
219 bool utf8_init(void)
220 {
221         /* create utf8 hashtable */
222
223         hashtable_utf = NEW(hashtable);
224
225         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
226
227 #if defined(ENABLE_STATISTICS)
228         if (opt_stat)
229                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
230 #endif
231
232         /* create utf-symbols for pointer comparison of frequently used strings */
233
234         utf_java_lang_Object           = utf_new_char("java/lang/Object");
235
236         utf_java_lang_Class            = utf_new_char("java/lang/Class");
237         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
238         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
239         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
240         utf_java_lang_String           = utf_new_char("java/lang/String");
241         utf_java_lang_System           = utf_new_char("java/lang/System");
242         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
243
244         utf_java_lang_ref_SoftReference =
245                 utf_new_char("java/lang/ref/SoftReference");
246
247         utf_java_lang_ref_WeakReference =
248                 utf_new_char("java/lang/ref/WeakReference");
249
250         utf_java_lang_ref_PhantomReference =
251                 utf_new_char("java/lang/ref/PhantomReference");
252
253         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
254
255         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
256         utf_java_lang_Error            = utf_new_char("java/lang/Error");
257
258         utf_java_lang_ClassCircularityError =
259                 utf_new_char("java/lang/ClassCircularityError");
260
261         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
262
263         utf_java_lang_ExceptionInInitializerError =
264                 utf_new_char("java/lang/ExceptionInInitializerError");
265
266         utf_java_lang_IncompatibleClassChangeError =
267                 utf_new_char("java/lang/IncompatibleClassChangeError");
268
269         utf_java_lang_InstantiationError =
270                 utf_new_char("java/lang/InstantiationError");
271
272         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
273         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
274
275         utf_java_lang_NoClassDefFoundError =
276                 utf_new_char("java/lang/NoClassDefFoundError");
277
278         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
279
280         utf_java_lang_UnsatisfiedLinkError =
281                 utf_new_char("java/lang/UnsatisfiedLinkError");
282
283         utf_java_lang_UnsupportedClassVersionError =
284                 utf_new_char("java/lang/UnsupportedClassVersionError");
285
286         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
287
288         utf_java_lang_VirtualMachineError =
289                 utf_new_char("java/lang/VirtualMachineError");
290
291 #if defined(ENABLE_JAVASE)
292         utf_java_lang_AbstractMethodError =
293                 utf_new_char("java/lang/AbstractMethodError");
294
295         utf_java_lang_NoSuchFieldError =
296                 utf_new_char("java/lang/NoSuchFieldError");
297
298         utf_java_lang_NoSuchMethodError =
299                 utf_new_char("java/lang/NoSuchMethodError");
300 #endif
301
302 #if defined(WITH_CLASSPATH_GNU)
303         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
304 #endif
305
306         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
307
308         utf_java_lang_ArithmeticException =
309                 utf_new_char("java/lang/ArithmeticException");
310
311         utf_java_lang_ArrayIndexOutOfBoundsException =
312                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
313
314         utf_java_lang_ArrayStoreException =
315                 utf_new_char("java/lang/ArrayStoreException");
316
317         utf_java_lang_ClassCastException =
318                 utf_new_char("java/lang/ClassCastException");
319
320         utf_java_lang_ClassNotFoundException =
321                 utf_new_char("java/lang/ClassNotFoundException");
322
323         utf_java_lang_CloneNotSupportedException =
324                 utf_new_char("java/lang/CloneNotSupportedException");
325
326         utf_java_lang_IllegalAccessException =
327                 utf_new_char("java/lang/IllegalAccessException");
328
329         utf_java_lang_IllegalArgumentException =
330                 utf_new_char("java/lang/IllegalArgumentException");
331
332         utf_java_lang_IllegalMonitorStateException =
333                 utf_new_char("java/lang/IllegalMonitorStateException");
334
335         utf_java_lang_InstantiationException =
336                 utf_new_char("java/lang/InstantiationException");
337
338         utf_java_lang_InterruptedException =
339                 utf_new_char("java/lang/InterruptedException");
340
341         utf_java_lang_NegativeArraySizeException =
342                 utf_new_char("java/lang/NegativeArraySizeException");
343
344         utf_java_lang_NullPointerException =
345                 utf_new_char("java/lang/NullPointerException");
346
347         utf_java_lang_StringIndexOutOfBoundsException =
348                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
349
350         utf_java_lang_reflect_InvocationTargetException =
351                 utf_new_char("java/lang/reflect/InvocationTargetException");
352  
353 #if defined(ENABLE_JAVASE)
354         utf_java_lang_Void             = utf_new_char("java/lang/Void");
355 #endif
356
357         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
358         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
359         utf_java_lang_Character        = utf_new_char("java/lang/Character");
360         utf_java_lang_Short            = utf_new_char("java/lang/Short");
361         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
362         utf_java_lang_Long             = utf_new_char("java/lang/Long");
363         utf_java_lang_Float            = utf_new_char("java/lang/Float");
364         utf_java_lang_Double           = utf_new_char("java/lang/Double");
365
366 #if defined(ENABLE_JAVASE)
367         utf_java_lang_StackTraceElement =
368                 utf_new_char("java/lang/StackTraceElement");
369
370         utf_java_lang_reflect_Constructor =
371                 utf_new_char("java/lang/reflect/Constructor");
372
373         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
374         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
375         utf_java_util_Vector           = utf_new_char("java/util/Vector");
376 #endif
377
378         utf_InnerClasses               = utf_new_char("InnerClasses");
379         utf_ConstantValue              = utf_new_char("ConstantValue");
380         utf_Code                       = utf_new_char("Code");
381         utf_Exceptions                 = utf_new_char("Exceptions");
382         utf_LineNumberTable            = utf_new_char("LineNumberTable");
383         utf_SourceFile                 = utf_new_char("SourceFile");
384
385 #if defined(ENABLE_JAVASE)
386         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
387         utf_Signature                  = utf_new_char("Signature");
388         utf_RuntimeVisibleAnnotations  = utf_new_char("RuntimeVisibleAnnotations");
389         utf_StackMapTable              = utf_new_char("StackMapTable");
390 #endif
391
392         utf_init                           = utf_new_char("<init>");
393         utf_clinit                         = utf_new_char("<clinit>");
394         utf_clone                      = utf_new_char("clone");
395         utf_finalize                   = utf_new_char("finalize");
396         utf_run                        = utf_new_char("run");
397
398         utf_add                        = utf_new_char("add");
399         utf_remove                     = utf_new_char("remove");
400         utf_addThread                  = utf_new_char("addThread");
401         utf_removeThread               = utf_new_char("removeThread");
402         utf_put                        = utf_new_char("put");
403         utf_get                        = utf_new_char("get");
404         utf_value                      = utf_new_char("value");
405
406         utf_printStackTrace            = utf_new_char("printStackTrace");
407         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
408         utf_loadClass                  = utf_new_char("loadClass");
409         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
410
411         utf_Z                          = utf_new_char("Z");
412         utf_B                          = utf_new_char("B");
413         utf_C                          = utf_new_char("C");
414         utf_S                          = utf_new_char("S");
415         utf_I                          = utf_new_char("I");
416         utf_J                          = utf_new_char("J");
417         utf_F                          = utf_new_char("F");
418         utf_D                          = utf_new_char("D");
419
420         utf_void__void                 = utf_new_char("()V");
421         utf_boolean__void              = utf_new_char("(Z)V");
422         utf_byte__void                 = utf_new_char("(B)V");
423         utf_char__void                 = utf_new_char("(C)V");
424         utf_short__void                = utf_new_char("(S)V");
425         utf_int__void                  = utf_new_char("(I)V");
426         utf_long__void                 = utf_new_char("(J)V");
427         utf_float__void                = utf_new_char("(F)V");
428         utf_double__void               = utf_new_char("(D)V");
429         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
430         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
431
432         utf_void__java_lang_ClassLoader =
433                 utf_new_char("()Ljava/lang/ClassLoader;");
434
435         utf_java_lang_Object__java_lang_Object =
436                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
437
438         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
439
440         utf_java_lang_String__java_lang_Class =
441                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
442
443         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
444         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
445
446         utf_null                       = utf_new_char("null");
447         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
448         array_packagename              = utf_new_char("\t<the array package>");
449
450         /* everything's ok */
451
452         return true;
453 }
454
455
456 /* utf_hashkey *****************************************************************
457
458    The hashkey is computed from the utf-text by using up to 8
459    characters.  For utf-symbols longer than 15 characters 3 characters
460    are taken from the beginning and the end, 2 characters are taken
461    from the middle.
462
463 *******************************************************************************/
464
465 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
466 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
467
468 u4 utf_hashkey(const char *text, u4 length)
469 {
470         const char *start_pos = text;       /* pointer to utf text                */
471         u4 a;
472
473         switch (length) {
474         case 0: /* empty string */
475                 return 0;
476
477         case 1: return fbs(0);
478         case 2: return fbs(0) ^ nbs(3);
479         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
480         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
481         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
482         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
483         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
484         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
485
486         case 9:
487                 a = fbs(0);
488                 a ^= nbs(1);
489                 a ^= nbs(2);
490                 text++;
491                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
492
493         case 10:
494                 a = fbs(0);
495                 text++;
496                 a ^= nbs(2);
497                 a ^= nbs(3);
498                 a ^= nbs(4);
499                 text++;
500                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
501
502         case 11:
503                 a = fbs(0);
504                 text++;
505                 a ^= nbs(2);
506                 a ^= nbs(3);
507                 a ^= nbs(4);
508                 text++;
509                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
510
511         case 12:
512                 a = fbs(0);
513                 text += 2;
514                 a ^= nbs(2);
515                 a ^= nbs(3);
516                 text++;
517                 a ^= nbs(5);
518                 a ^= nbs(6);
519                 a ^= nbs(7);
520                 text++;
521                 return a ^ nbs(9) ^ nbs(10);
522
523         case 13:
524                 a = fbs(0);
525                 a ^= nbs(1);
526                 text++;
527                 a ^= nbs(3);
528                 a ^= nbs(4);
529                 text += 2;      
530                 a ^= nbs(7);
531                 a ^= nbs(8);
532                 text += 2;
533                 return a ^ nbs(9) ^ nbs(10);
534
535         case 14:
536                 a = fbs(0);
537                 text += 2;      
538                 a ^= nbs(3);
539                 a ^= nbs(4);
540                 text += 2;      
541                 a ^= nbs(7);
542                 a ^= nbs(8);
543                 text += 2;
544                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
545
546         case 15:
547                 a = fbs(0);
548                 text += 2;      
549                 a ^= nbs(3);
550                 a ^= nbs(4);
551                 text += 2;      
552                 a ^= nbs(7);
553                 a ^= nbs(8);
554                 text += 2;
555                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
556
557         default:  /* 3 characters from beginning */
558                 a = fbs(0);
559                 text += 2;
560                 a ^= nbs(3);
561                 a ^= nbs(4);
562
563                 /* 2 characters from middle */
564                 text = start_pos + (length / 2);
565                 a ^= fbs(5);
566                 text += 2;
567                 a ^= nbs(6);    
568
569                 /* 3 characters from end */
570                 text = start_pos + length - 4;
571
572                 a ^= fbs(7);
573                 text++;
574
575                 return a ^ nbs(10) ^ nbs(11);
576     }
577 }
578
579 /* utf_full_hashkey ************************************************************
580
581    This function computes a hash value using all bytes in the string.
582
583    The algorithm is the "One-at-a-time" algorithm as published
584    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
585
586 *******************************************************************************/
587
588 u4 utf_full_hashkey(const char *text, u4 length)
589 {
590         register const unsigned char *p = (const unsigned char *) text;
591         register u4 hash;
592         register u4 i;
593
594         hash = 0;
595         for (i=length; i--;)
596         {
597             hash += *p++;
598             hash += (hash << 10);
599             hash ^= (hash >> 6);
600         }
601         hash += (hash << 3);
602         hash ^= (hash >> 11);
603         hash += (hash << 15);
604
605         return hash;
606 }
607
608 /* unicode_hashkey *************************************************************
609
610    Compute the hashkey of a unicode string.
611
612 *******************************************************************************/
613
614 u4 unicode_hashkey(u2 *text, u2 len)
615 {
616         return utf_hashkey((char *) text, len);
617 }
618
619
620 /* utf_new *********************************************************************
621
622    Creates a new utf-symbol, the text of the symbol is passed as a
623    u1-array. The function searches the utf-hashtable for a utf-symbol
624    with this text. On success the element returned, otherwise a new
625    hashtable element is created.
626
627    If the number of entries in the hashtable exceeds twice the size of
628    the hashtable slots a reorganization of the hashtable is done and
629    the utf symbols are copied to a new hashtable with doubled size.
630
631 *******************************************************************************/
632
633 utf *utf_new(const char *text, u2 length)
634 {
635         u4 key;                             /* hashkey computed from utf-text     */
636         u4 slot;                            /* slot in hashtable                  */
637         utf *u;                             /* hashtable element                  */
638         u2 i;
639
640         LOCK_MONITOR_ENTER(hashtable_utf->header);
641
642 #if defined(ENABLE_STATISTICS)
643         if (opt_stat)
644                 count_utf_new++;
645 #endif
646
647         key  = utf_hashkey(text, length);
648         slot = key & (hashtable_utf->size - 1);
649         u    = hashtable_utf->ptr[slot];
650
651         /* search external hash chain for utf-symbol */
652
653         while (u) {
654                 if (u->blength == length) {
655                         /* compare text of hashtable elements */
656
657                         for (i = 0; i < length; i++)
658                                 if (text[i] != u->text[i])
659                                         goto nomatch;
660                         
661 #if defined(ENABLE_STATISTICS)
662                         if (opt_stat)
663                                 count_utf_new_found++;
664 #endif
665
666                         /* symbol found in hashtable */
667
668                         LOCK_MONITOR_EXIT(hashtable_utf->header);
669
670                         return u;
671                 }
672
673         nomatch:
674                 u = u->hashlink; /* next element in external chain */
675         }
676
677         /* location in hashtable found, create new utf element */
678
679         u = NEW(utf);
680
681         u->blength  = length;               /* length in bytes of utfstring       */
682         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
683         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
684
685         memcpy(u->text, text, length);      /* copy utf-text                      */
686         u->text[length] = '\0';
687
688 #if defined(ENABLE_STATISTICS)
689         if (opt_stat)
690                 count_utf_len += sizeof(utf) + length + 1;
691 #endif
692
693         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
694         hashtable_utf->entries++;           /* update number of entries           */
695
696         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
697
698         /* reorganization of hashtable, average length of the external
699            chains is approx. 2 */
700
701                 hashtable *newhash;                              /* the new hashtable */
702                 u4         i;
703                 utf       *u;
704                 utf       *nextu;
705                 u4         slot;
706
707                 /* create new hashtable, double the size */
708
709                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
710
711 #if defined(ENABLE_STATISTICS)
712                 if (opt_stat)
713                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
714 #endif
715
716                 /* transfer elements to new hashtable */
717
718                 for (i = 0; i < hashtable_utf->size; i++) {
719                         u = hashtable_utf->ptr[i];
720
721                         while (u) {
722                                 nextu = u->hashlink;
723                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
724                                                 
725                                 u->hashlink = (utf *) newhash->ptr[slot];
726                                 newhash->ptr[slot] = u;
727
728                                 /* follow link in external hash chain */
729
730                                 u = nextu;
731                         }
732                 }
733         
734                 /* dispose old table */
735
736                 hashtable_free(hashtable_utf);
737
738                 hashtable_utf = newhash;
739         }
740
741         LOCK_MONITOR_EXIT(hashtable_utf->header);
742
743         return u;
744 }
745
746
747 /* utf_new_u2 ******************************************************************
748
749    Make utf symbol from u2 array, if isclassname is true '.' is
750    replaced by '/'.
751
752 *******************************************************************************/
753
754 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
755 {
756         char *buffer;                   /* memory buffer for  unicode characters  */
757         char *pos;                      /* pointer to current position in buffer  */
758         u4 left;                        /* unicode characters left                */
759         u4 buflength;                   /* utf length in bytes of the u2 array    */
760         utf *result;                    /* resulting utf-string                   */
761         int i;          
762
763         /* determine utf length in bytes and allocate memory */
764
765         buflength = u2_utflength(unicode_pos, unicode_length); 
766         buffer    = MNEW(char, buflength);
767  
768         left = buflength;
769         pos  = buffer;
770
771         for (i = 0; i++ < unicode_length; unicode_pos++) {
772                 /* next unicode character */
773                 u2 c = *unicode_pos;
774                 
775                 if ((c != 0) && (c < 0x80)) {
776                         /* 1 character */       
777                         left--;
778                 if ((int) left < 0) break;
779                         /* convert classname */
780                         if (isclassname && c == '.')
781                                 *pos++ = '/';
782                         else
783                                 *pos++ = (char) c;
784
785                 } else if (c < 0x800) {             
786                         /* 2 characters */                              
787                 unsigned char high = c >> 6;
788                 unsigned char low  = c & 0x3F;
789                         left = left - 2;
790                 if ((int) left < 0) break;
791                 *pos++ = high | 0xC0; 
792                 *pos++ = low  | 0x80;     
793
794                 } else {         
795                 /* 3 characters */                              
796                 char low  = c & 0x3f;
797                 char mid  = (c >> 6) & 0x3F;
798                 char high = c >> 12;
799                         left = left - 3;
800                 if ((int) left < 0) break;
801                 *pos++ = high | 0xE0; 
802                 *pos++ = mid  | 0x80;  
803                 *pos++ = low  | 0x80;   
804                 }
805         }
806         
807         /* insert utf-string into symbol-table */
808         result = utf_new(buffer,buflength);
809
810         MFREE(buffer, char, buflength);
811
812         return result;
813 }
814
815
816 /* utf_new_char ****************************************************************
817
818    Creates a new utf symbol, the text for this symbol is passed as a
819    c-string ( = char* ).
820
821 *******************************************************************************/
822
823 utf *utf_new_char(const char *text)
824 {
825         return utf_new(text, strlen(text));
826 }
827
828
829 /* utf_new_char_classname ******************************************************
830
831    Creates a new utf symbol, the text for this symbol is passed as a
832    c-string ( = char* ) "." characters are going to be replaced by
833    "/". Since the above function is used often, this is a separte
834    function, instead of an if.
835
836 *******************************************************************************/
837
838 utf *utf_new_char_classname(const char *text)
839 {
840         if (strchr(text, '.')) {
841                 char *txt = strdup(text);
842                 char *end = txt + strlen(txt);
843                 char *c;
844                 utf *tmpRes;
845
846                 for (c = txt; c < end; c++)
847                         if (*c == '.') *c = '/';
848
849                 tmpRes = utf_new(txt, strlen(txt));
850                 FREE(txt, 0);
851
852                 return tmpRes;
853
854         } else
855                 return utf_new(text, strlen(text));
856 }
857
858
859 /* utf_nextu2 ******************************************************************
860
861    Read the next unicode character from the utf string and increment
862    the utf-string pointer accordingly.
863
864    CAUTION: This function is unsafe for input that was not checked 
865             by is_valid_utf!
866
867 *******************************************************************************/
868
869 u2 utf_nextu2(char **utf_ptr)
870 {
871     /* uncompressed unicode character */
872     u2 unicode_char = 0;
873     /* current position in utf text */  
874     unsigned char *utf = (unsigned char *) (*utf_ptr);
875     /* bytes representing the unicode character */
876     unsigned char ch1, ch2, ch3;
877     /* number of bytes used to represent the unicode character */
878     int len = 0;
879         
880     switch ((ch1 = utf[0]) >> 4) {
881         default: /* 1 byte */
882                 (*utf_ptr)++;
883                 return (u2) ch1;
884         case 0xC: 
885         case 0xD: /* 2 bytes */
886                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
887                         unsigned char high = ch1 & 0x1F;
888                         unsigned char low  = ch2 & 0x3F;
889                         unicode_char = (high << 6) + low;
890                         len = 2;
891                 }
892                 break;
893
894         case 0xE: /* 2 or 3 bytes */
895                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
896                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
897                                 unsigned char low  = ch3 & 0x3f;
898                                 unsigned char mid  = ch2 & 0x3f;
899                                 unsigned char high = ch1 & 0x0f;
900                                 unicode_char = (((high << 6) + mid) << 6) + low;
901                                 len = 3;
902                         } else
903                                 len = 2;                                           
904                 }
905                 break;
906     }
907
908     /* update position in utf-text */
909     *utf_ptr = (char *) (utf + len);
910
911     return unicode_char;
912 }
913
914
915 /* utf_bytes *******************************************************************
916
917    Determine number of bytes (aka. octets) in the utf string.
918
919    IN:
920       u............utf string
921
922    OUT:
923       The number of octets of this utf string.
924           There is _no_ terminating zero included in this count.
925
926 *******************************************************************************/
927
928 u4 utf_bytes(utf *u)
929 {
930         return u->blength;
931 }
932
933
934 /* utf_get_number_of_u2s_for_buffer ********************************************
935
936    Determine number of UTF-16 u2s in the given UTF-8 buffer
937
938    CAUTION: This function is unsafe for input that was not checked 
939             by is_valid_utf!
940
941    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
942    to an array of u2s (UTF-16) and want to know how many of them you will get.
943    All other uses of this function are probably wrong.
944
945    IN:
946       buffer........points to first char in buffer
947           blength.......number of _bytes_ in the buffer
948
949    OUT:
950       the number of u2s needed to hold this string in UTF-16 encoding.
951           There is _no_ terminating zero included in this count.
952
953    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
954    exception.
955
956 *******************************************************************************/
957
958 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
959 {
960         const char *endpos;                 /* points behind utf string           */
961         const char *utf_ptr;                /* current position in utf text       */
962         u4 len = 0;                         /* number of unicode characters       */
963
964         utf_ptr = buffer;
965         endpos = utf_ptr + blength;
966
967         while (utf_ptr < endpos) {
968                 len++;
969                 /* next unicode character */
970                 utf_nextu2((char **)&utf_ptr);
971         }
972
973         assert(utf_ptr == endpos);
974
975         return len;
976 }
977
978
979 /* utf_get_number_of_u2s *******************************************************
980
981    Determine number of UTF-16 u2s in the utf string.
982
983    CAUTION: This function is unsafe for input that was not checked 
984             by is_valid_utf!
985
986    CAUTION: Use this function *only* when you want to convert a utf string
987    to an array of u2s and want to know how many of them you will get.
988    All other uses of this function are probably wrong.
989
990    IN:
991       u............utf string
992
993    OUT:
994       the number of u2s needed to hold this string in UTF-16 encoding.
995           There is _no_ terminating zero included in this count.
996           XXX 0 if a NullPointerException has been thrown (see below)
997
998 *******************************************************************************/
999
1000 u4 utf_get_number_of_u2s(utf *u)
1001 {
1002         char *endpos;                       /* points behind utf string           */
1003         char *utf_ptr;                      /* current position in utf text       */
1004         u4 len = 0;                         /* number of unicode characters       */
1005
1006         /* XXX this is probably not checked by most callers! Review this after */
1007         /* the invalid uses of this function have been eliminated */
1008         if (u == NULL) {
1009                 exceptions_throw_nullpointerexception();
1010                 return 0;
1011         }
1012
1013         endpos = UTF_END(u);
1014         utf_ptr = u->text;
1015
1016         while (utf_ptr < endpos) {
1017                 len++;
1018                 /* next unicode character */
1019                 utf_nextu2(&utf_ptr);
1020         }
1021
1022         if (utf_ptr != endpos) {
1023                 /* string ended abruptly */
1024                 exceptions_throw_internalerror("Illegal utf8 string");
1025                 return 0;
1026         }
1027
1028         return len;
1029 }
1030
1031
1032 /* utf8_safe_number_of_u2s *****************************************************
1033
1034    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1035    (For invalid UTF-8 the U+fffd replacement character will be counted.)
1036
1037    This function is safe even for invalid UTF-8 strings.
1038
1039    IN:
1040       text..........zero-terminated(!) UTF-8 string (may be invalid)
1041                         must NOT be NULL
1042           nbytes........strlen(text). (This is needed to completely emulate
1043                         the RI).
1044
1045    OUT:
1046       the number of u2s needed to hold this string in UTF-16 encoding.
1047           There is _no_ terminating zero included in this count.
1048
1049 *******************************************************************************/
1050
1051 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1052         register const unsigned char *t;
1053         register s4 byte;
1054         register s4 len;
1055         register const unsigned char *tlimit;
1056         s4 byte1;
1057         s4 byte2;
1058         s4 byte3;
1059         s4 value;
1060         s4 skip;
1061
1062         assert(text);
1063         assert(nbytes >= 0);
1064
1065         len = 0;
1066         t = (const unsigned char *) text;
1067         tlimit = t + nbytes;
1068
1069         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1070
1071         while (1) {
1072                 byte = *t++;
1073
1074                 if (byte & 0x80) {
1075                         /* highest bit set, non-ASCII character */
1076
1077                         if ((byte & 0xe0) == 0xc0) {
1078                                 /* 2-byte: should be 110..... 10...... ? */
1079
1080                                 if ((*t++ & 0xc0) == 0x80)
1081                                         ; /* valid 2-byte */
1082                                 else
1083                                         t--; /* invalid */
1084                         }
1085                         else if ((byte & 0xf0) == 0xe0) {
1086                                 /* 3-byte: should be 1110.... 10...... 10...... */
1087                                 /*                            ^t                */
1088
1089                                 if (t + 2 > tlimit)
1090                                         return len + 1; /* invalid, stop here */
1091
1092                                 if ((*t++ & 0xc0) == 0x80) {
1093                                         if ((*t++ & 0xc0) == 0x80)
1094                                                 ; /* valid 3-byte */
1095                                         else
1096                                                 t--; /* invalid */
1097                                 }
1098                                 else
1099                                         t--; /* invalid */
1100                         }
1101                         else if ((byte & 0xf8) == 0xf0) {
1102                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1103                                 /*                            ^t                         */
1104
1105                                 if (t + 3 > tlimit)
1106                                         return len + 1; /* invalid, stop here */
1107
1108                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1109                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1110                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1111                                                         /* valid 4-byte UTF-8? */
1112                                                         value = ((byte  & 0x07) << 18)
1113                                                                   | ((byte1 & 0x3f) << 12)
1114                                                                   | ((byte2 & 0x3f) <<  6)
1115                                                                   | ((byte3 & 0x3f)      );
1116
1117                                                         if (value > 0x10FFFF)
1118                                                                 ; /* invalid */
1119                                                         else if (value > 0xFFFF)
1120                                                                 len += 1; /* we need surrogates */
1121                                                         else
1122                                                                 ; /* 16bit suffice */
1123                                                 }
1124                                                 else
1125                                                         t--; /* invalid */
1126                                         }
1127                                         else
1128                                                 t--; /* invalid */
1129                                 }
1130                                 else
1131                                         t--; /* invalid */
1132                         }
1133                         else if ((byte & 0xfc) == 0xf8) {
1134                                 /* invalid 5-byte */
1135                                 if (t + 4 > tlimit)
1136                                         return len + 1; /* invalid, stop here */
1137
1138                                 skip = 4;
1139                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1140                                         t++;
1141                         }
1142                         else if ((byte & 0xfe) == 0xfc) {
1143                                 /* invalid 6-byte */
1144                                 if (t + 5 > tlimit)
1145                                         return len + 1; /* invalid, stop here */
1146
1147                                 skip = 5;
1148                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1149                                         t++;
1150                         }
1151                         else
1152                                 ; /* invalid */
1153                 }
1154                 else {
1155                         /* NUL */
1156
1157                         if (byte == 0)
1158                                 break;
1159
1160                         /* ASCII character, common case */
1161                 }
1162
1163                 len++;
1164         }
1165
1166         return len;
1167 }
1168
1169
1170 /* utf8_safe_convert_to_u2s ****************************************************
1171
1172    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1173    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1174    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1175
1176    This function is safe even for invalid UTF-8 strings.
1177
1178    IN:
1179       text..........zero-terminated(!) UTF-8 string (may be invalid)
1180                         must NOT be NULL
1181           nbytes........strlen(text). (This is needed to completely emulate
1182                                         the RI).
1183           buffer........a preallocated array of u2s to receive the decoded
1184                         string. Use utf8_safe_number_of_u2s to get the
1185                                         required number of u2s for allocating this.
1186
1187 *******************************************************************************/
1188
1189 #define UNICODE_REPLACEMENT  0xfffd
1190
1191 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1192         register const unsigned char *t;
1193         register s4 byte;
1194         register const unsigned char *tlimit;
1195         s4 byte1;
1196         s4 byte2;
1197         s4 byte3;
1198         s4 value;
1199         s4 skip;
1200
1201         assert(text);
1202         assert(nbytes >= 0);
1203
1204         t = (const unsigned char *) text;
1205         tlimit = t + nbytes;
1206
1207         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1208
1209         while (1) {
1210                 byte = *t++;
1211
1212                 if (byte & 0x80) {
1213                         /* highest bit set, non-ASCII character */
1214
1215                         if ((byte & 0xe0) == 0xc0) {
1216                                 /* 2-byte: should be 110..... 10...... */
1217
1218                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1219                                         /* valid 2-byte UTF-8 */
1220                                         *buffer++ = ((byte  & 0x1f) << 6)
1221                                                           | ((byte1 & 0x3f)     );
1222                                 }
1223                                 else {
1224                                         *buffer++ = UNICODE_REPLACEMENT;
1225                                         t--;
1226                                 }
1227                         }
1228                         else if ((byte & 0xf0) == 0xe0) {
1229                                 /* 3-byte: should be 1110.... 10...... 10...... */
1230
1231                                 if (t + 2 > tlimit) {
1232                                         *buffer++ = UNICODE_REPLACEMENT;
1233                                         return;
1234                                 }
1235
1236                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1237                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1238                                                 /* valid 3-byte UTF-8 */
1239                                                 *buffer++ = ((byte  & 0x0f) << 12)
1240                                                                   | ((byte1 & 0x3f) <<  6)
1241                                                                   | ((byte2 & 0x3f)      );
1242                                         }
1243                                         else {
1244                                                 *buffer++ = UNICODE_REPLACEMENT;
1245                                                 t--;
1246                                         }
1247                                 }
1248                                 else {
1249                                         *buffer++ = UNICODE_REPLACEMENT;
1250                                         t--;
1251                                 }
1252                         }
1253                         else if ((byte & 0xf8) == 0xf0) {
1254                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1255
1256                                 if (t + 3 > tlimit) {
1257                                         *buffer++ = UNICODE_REPLACEMENT;
1258                                         return;
1259                                 }
1260
1261                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1262                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1263                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1264                                                         /* valid 4-byte UTF-8? */
1265                                                         value = ((byte  & 0x07) << 18)
1266                                                                   | ((byte1 & 0x3f) << 12)
1267                                                                   | ((byte2 & 0x3f) <<  6)
1268                                                                   | ((byte3 & 0x3f)      );
1269
1270                                                         if (value > 0x10FFFF) {
1271                                                                 *buffer++ = UNICODE_REPLACEMENT;
1272                                                         }
1273                                                         else if (value > 0xFFFF) {
1274                                                                 /* we need surrogates */
1275                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1276                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1277                                                         }
1278                                                         else
1279                                                                 *buffer++ = value; /* 16bit suffice */
1280                                                 }
1281                                                 else {
1282                                                         *buffer++ = UNICODE_REPLACEMENT;
1283                                                         t--;
1284                                                 }
1285                                         }
1286                                         else {
1287                                                 *buffer++ = UNICODE_REPLACEMENT;
1288                                                 t--;
1289                                         }
1290                                 }
1291                                 else {
1292                                         *buffer++ = UNICODE_REPLACEMENT;
1293                                         t--;
1294                                 }
1295                         }
1296                         else if ((byte & 0xfc) == 0xf8) {
1297                                 if (t + 4 > tlimit) {
1298                                         *buffer++ = UNICODE_REPLACEMENT;
1299                                         return;
1300                                 }
1301
1302                                 skip = 4;
1303                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1304                                         t++;
1305                                 *buffer++ = UNICODE_REPLACEMENT;
1306                         }
1307                         else if ((byte & 0xfe) == 0xfc) {
1308                                 if (t + 5 > tlimit) {
1309                                         *buffer++ = UNICODE_REPLACEMENT;
1310                                         return;
1311                                 }
1312
1313                                 skip = 5;
1314                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1315                                         t++;
1316                                 *buffer++ = UNICODE_REPLACEMENT;
1317                         }
1318                         else
1319                                 *buffer++ = UNICODE_REPLACEMENT;
1320                 }
1321                 else {
1322                         /* NUL */
1323
1324                         if (byte == 0)
1325                                 break;
1326
1327                         /* ASCII character, common case */
1328
1329                         *buffer++ = byte;
1330                 }
1331         }
1332 }
1333
1334
1335 /* u2_utflength ****************************************************************
1336
1337    Returns the utf length in bytes of a u2 array.
1338
1339 *******************************************************************************/
1340
1341 u4 u2_utflength(u2 *text, u4 u2_length)
1342 {
1343         u4 result_len = 0;                  /* utf length in bytes                */
1344         u2 ch;                              /* current unicode character          */
1345         u4 len;
1346         
1347         for (len = 0; len < u2_length; len++) {
1348                 /* next unicode character */
1349                 ch = *text++;
1350           
1351                 /* determine bytes required to store unicode character as utf */
1352                 if (ch && (ch < 0x80)) 
1353                         result_len++;
1354                 else if (ch < 0x800)
1355                         result_len += 2;        
1356                 else 
1357                         result_len += 3;        
1358         }
1359
1360     return result_len;
1361 }
1362
1363
1364 /* utf_copy ********************************************************************
1365
1366    Copy the given utf string byte-for-byte to a buffer.
1367
1368    IN:
1369       buffer.......the buffer
1370           u............the utf string
1371
1372 *******************************************************************************/
1373
1374 void utf_copy(char *buffer, utf *u)
1375 {
1376         /* our utf strings are zero-terminated (done by utf_new) */
1377         MCOPY(buffer, u->text, char, u->blength + 1);
1378 }
1379
1380
1381 /* utf_cat *********************************************************************
1382
1383    Append the given utf string byte-for-byte to a buffer.
1384
1385    IN:
1386       buffer.......the buffer
1387           u............the utf string
1388
1389 *******************************************************************************/
1390
1391 void utf_cat(char *buffer, utf *u)
1392 {
1393         /* our utf strings are zero-terminated (done by utf_new) */
1394         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1395 }
1396
1397
1398 /* utf_copy_classname **********************************************************
1399
1400    Copy the given utf classname byte-for-byte to a buffer.
1401    '/' is replaced by '.'
1402
1403    IN:
1404       buffer.......the buffer
1405           u............the utf string
1406
1407 *******************************************************************************/
1408
1409 void utf_copy_classname(char *buffer, utf *u)
1410 {
1411         char *bufptr;
1412         char *srcptr;
1413         char *endptr;
1414         char ch;
1415
1416         bufptr = buffer;
1417         srcptr = u->text;
1418         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1419
1420         while (srcptr != endptr) {
1421                 ch = *srcptr++;
1422                 if (ch == '/')
1423                         ch = '.';
1424                 *bufptr++ = ch;
1425         }
1426 }
1427
1428
1429 /* utf_cat *********************************************************************
1430
1431    Append the given utf classname byte-for-byte to a buffer.
1432    '/' is replaced by '.'
1433
1434    IN:
1435       buffer.......the buffer
1436           u............the utf string
1437
1438 *******************************************************************************/
1439
1440 void utf_cat_classname(char *buffer, utf *u)
1441 {
1442         utf_copy_classname(buffer + strlen(buffer), u);
1443 }
1444
1445 /* utf_display_printable_ascii *************************************************
1446
1447    Write utf symbol to stdout (for debugging purposes).
1448    Non-printable and non-ASCII characters are printed as '?'.
1449
1450 *******************************************************************************/
1451
1452 void utf_display_printable_ascii(utf *u)
1453 {
1454         char *endpos;                       /* points behind utf string           */
1455         char *utf_ptr;                      /* current position in utf text       */
1456
1457         if (u == NULL) {
1458                 printf("NULL");
1459                 fflush(stdout);
1460                 return;
1461         }
1462
1463         endpos = UTF_END(u);
1464         utf_ptr = u->text;
1465
1466         while (utf_ptr < endpos) {
1467                 /* read next unicode character */
1468
1469                 u2 c = utf_nextu2(&utf_ptr);
1470
1471                 if ((c >= 32) && (c <= 127))
1472                         printf("%c", c);
1473                 else
1474                         printf("?");
1475         }
1476
1477         fflush(stdout);
1478 }
1479
1480
1481 /* utf_display_printable_ascii_classname ***************************************
1482
1483    Write utf symbol to stdout with `/' converted to `.' (for debugging
1484    purposes).
1485    Non-printable and non-ASCII characters are printed as '?'.
1486
1487 *******************************************************************************/
1488
1489 void utf_display_printable_ascii_classname(utf *u)
1490 {
1491         char *endpos;                       /* points behind utf string           */
1492         char *utf_ptr;                      /* current position in utf text       */
1493
1494         if (u == NULL) {
1495                 printf("NULL");
1496                 fflush(stdout);
1497                 return;
1498         }
1499
1500         endpos = UTF_END(u);
1501         utf_ptr = u->text;
1502
1503         while (utf_ptr < endpos) {
1504                 /* read next unicode character */
1505
1506                 u2 c = utf_nextu2(&utf_ptr);
1507
1508                 if (c == '/')
1509                         c = '.';
1510
1511                 if ((c >= 32) && (c <= 127))
1512                         printf("%c", c);
1513                 else
1514                         printf("?");
1515         }
1516
1517         fflush(stdout);
1518 }
1519
1520
1521 /* utf_sprint_convert_to_latin1 ************************************************
1522         
1523    Write utf symbol into c-string (for debugging purposes).
1524    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1525    invalid results.
1526
1527 *******************************************************************************/
1528
1529 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1530 {
1531         char *endpos;                       /* points behind utf string           */
1532         char *utf_ptr;                      /* current position in utf text       */
1533         u2 pos = 0;                         /* position in c-string               */
1534
1535         if (!u) {
1536                 strcpy(buffer, "NULL");
1537                 return;
1538         }
1539
1540         endpos = UTF_END(u);
1541         utf_ptr = u->text;
1542
1543         while (utf_ptr < endpos) 
1544                 /* copy next unicode character */       
1545                 buffer[pos++] = utf_nextu2(&utf_ptr);
1546
1547         /* terminate string */
1548         buffer[pos] = '\0';
1549 }
1550
1551
1552 /* utf_sprint_convert_to_latin1_classname **************************************
1553         
1554    Write utf symbol into c-string with `/' converted to `.' (for debugging
1555    purposes).
1556    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1557    invalid results.
1558
1559 *******************************************************************************/
1560
1561 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1562 {
1563         char *endpos;                       /* points behind utf string           */
1564         char *utf_ptr;                      /* current position in utf text       */
1565         u2 pos = 0;                         /* position in c-string               */
1566
1567         if (!u) {
1568                 strcpy(buffer, "NULL");
1569                 return;
1570         }
1571
1572         endpos = UTF_END(u);
1573         utf_ptr = u->text;
1574
1575         while (utf_ptr < endpos) {
1576                 /* copy next unicode character */       
1577                 u2 c = utf_nextu2(&utf_ptr);
1578                 if (c == '/') c = '.';
1579                 buffer[pos++] = c;
1580         }
1581
1582         /* terminate string */
1583         buffer[pos] = '\0';
1584 }
1585
1586
1587 /* utf_strcat_convert_to_latin1 ************************************************
1588         
1589    Like libc strcat, but uses an utf8 string.
1590    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1591    invalid results.
1592
1593 *******************************************************************************/
1594
1595 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1596 {
1597         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1598 }
1599
1600
1601 /* utf_strcat_convert_to_latin1_classname **************************************
1602         
1603    Like libc strcat, but uses an utf8 string.
1604    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1605    invalid results.
1606
1607 *******************************************************************************/
1608
1609 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1610 {
1611         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1612 }
1613
1614
1615 /* utf_fprint_printable_ascii **************************************************
1616         
1617    Write utf symbol into file.
1618    Non-printable and non-ASCII characters are printed as '?'.
1619
1620 *******************************************************************************/
1621
1622 void utf_fprint_printable_ascii(FILE *file, utf *u)
1623 {
1624         char *endpos;                       /* points behind utf string           */
1625         char *utf_ptr;                      /* current position in utf text       */
1626
1627         if (!u)
1628                 return;
1629
1630         endpos = UTF_END(u);
1631         utf_ptr = u->text;
1632
1633         while (utf_ptr < endpos) { 
1634                 /* read next unicode character */                
1635                 u2 c = utf_nextu2(&utf_ptr);                            
1636
1637                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1638                 else fprintf(file, "?");
1639         }
1640 }
1641
1642
1643 /* utf_fprint_printable_ascii_classname ****************************************
1644         
1645    Write utf symbol into file with `/' converted to `.'.
1646    Non-printable and non-ASCII characters are printed as '?'.
1647
1648 *******************************************************************************/
1649
1650 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1651 {
1652         char *endpos;                       /* points behind utf string           */
1653         char *utf_ptr;                      /* current position in utf text       */
1654
1655     if (!u)
1656                 return;
1657
1658         endpos = UTF_END(u);
1659         utf_ptr = u->text;
1660
1661         while (utf_ptr < endpos) { 
1662                 /* read next unicode character */                
1663                 u2 c = utf_nextu2(&utf_ptr);                            
1664                 if (c == '/') c = '.';
1665
1666                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1667                 else fprintf(file, "?");
1668         }
1669 }
1670
1671
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string in the sense
   accepted for Java classfiles.

   utf_ptr...points to first character
   end_pos...points after last character

   Accepted are 1-byte (ASCII, except 0x00), 2-byte and 3-byte sequences.
   Longer sequences, stray continuation bytes, truncated sequences and
   embedded 0x00 bytes are rejected. The character zero is accepted only
   in its 2-byte form.

   Deliberately NOT rejected, since Java classfiles seem to allow them:
   overlong encodings, surrogates (0xd800-0xdfff) and the codepoints
   0xfffe/0xffff.

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
        const unsigned char *cursor;        /* next byte to examine               */
        int                  remaining;     /* bytes not yet consumed             */
        int                  trailing;      /* continuation bytes of sequence     */
        int                  k;
        unsigned char        byte;
        unsigned long        codepoint;

        if (end_pos < utf_ptr)
                return false;

        cursor    = (const unsigned char *) utf_ptr;
        remaining = end_pos - utf_ptr;

        while (remaining > 0) {
                byte = *cursor++;
                remaining--;

                /* a raw zero byte is never allowed */

                if (byte == 0x00)
                        return false;

                /* plain ASCII needs no further checks */

                if (byte < 0x80)
                        continue;

                /* classify the leading byte; everything but a 2- or 3-byte
                   sequence is refused (Java limitation) */

                if ((byte & 0xe0) == 0xc0)          /* 110x xxxx */
                        trailing = 1;
                else if ((byte & 0xf0) == 0xe0)     /* 1110 xxxx */
                        trailing = 2;
                else
                        return false;

                codepoint = (unsigned long) byte & (0x3f >> trailing);

                /* the whole sequence has to lie inside the given range */

                if (remaining < trailing)
                        return false;

                remaining -= trailing;

                for (k = 0; k < trailing; k++) {
                        byte = *cursor++;

                        if ((byte & 0xc0) != 0x80)      /* 10xx xxxx */
                                return false;

                        codepoint = (codepoint << 6) | (byte & 0x3f);
                }

                /* Java special: zero must use the 2-byte encoding */

                if (codepoint == 0 && trailing != 1)
                        return false;
        }

        return true;
}
1737
1738
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters below 0x20 and the 2-byte encoding of zero.)

   NOTE: The string is assumed to have passed is_valid_utf! In
   particular a 0xc0 byte is expected to be followed by another byte
   inside the string, as the byte after it is inspected.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
        const unsigned char *cur   = (const unsigned char *) utf_ptr;
        const unsigned char *limit = (const unsigned char *) end_pos;

        /* disallow empty names */

        if (cur >= limit)
                return false;

        for (; cur < limit; cur++) {
                /* disallow control characters */

                if (cur[0] < 0x20)
                        return false;

                /* disallow zero (encoded as 0xc0 0x80) */

                if (cur[0] == 0xc0 && cur[1] == 0x80)
                        return false;
        }

        return true;
}
1766
1767 bool is_valid_name_utf(utf *u)
1768 {
1769         return is_valid_name(u->text, UTF_END(u));
1770 }
1771
1772
1773 /* utf_show ********************************************************************
1774
1775    Writes the utf symbols in the utfhash to stdout and displays the
1776    number of external hash chains grouped according to the chainlength
1777    (for debugging purposes).
1778
1779 *******************************************************************************/
1780
1781 #if !defined(NDEBUG)
1782 void utf_show(void)
1783 {
1784
1785 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1786
1787         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1788         u4 max_chainlength = 0;      /* maximum length of the chains */
1789         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1790         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1791         u4 i;
1792
1793         printf("UTF-HASH:\n");
1794
1795         /* show element of utf-hashtable */
1796
1797         for (i = 0; i < hashtable_utf->size; i++) {
1798                 utf *u = hashtable_utf->ptr[i];
1799
1800                 if (u) {
1801                         printf("SLOT %d: ", (int) i);
1802
1803                         while (u) {
1804                                 printf("'");
1805                                 utf_display_printable_ascii(u);
1806                                 printf("' ");
1807                                 u = u->hashlink;
1808                         }       
1809                         printf("\n");
1810                 }
1811         }
1812
1813         printf("UTF-HASH: %d slots for %d entries\n", 
1814                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1815
1816         if (hashtable_utf->entries == 0)
1817                 return;
1818
1819         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1820
1821         for (i=0;i<CHAIN_LIMIT;i++)
1822                 chain_count[i]=0;
1823
1824         /* count numbers of hashchains according to their length */
1825         for (i=0; i<hashtable_utf->size; i++) {
1826                   
1827                 utf *u = (utf*) hashtable_utf->ptr[i];
1828                 u4 chain_length = 0;
1829
1830                 /* determine chainlength */
1831                 while (u) {
1832                         u = u->hashlink;
1833                         chain_length++;
1834                 }
1835
1836                 /* update sum of all chainlengths */
1837                 sum_chainlength+=chain_length;
1838
1839                 /* determine the maximum length of the chains */
1840                 if (chain_length>max_chainlength)
1841                         max_chainlength = chain_length;
1842
1843                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1844                 if (chain_length>=CHAIN_LIMIT) {
1845                         beyond_limit+=chain_length;
1846                         chain_length=CHAIN_LIMIT-1;
1847                 }
1848
1849                 /* update number of hashchains of current length */
1850                 chain_count[chain_length]++;
1851         }
1852
1853         /* display results */  
1854         for (i=1;i<CHAIN_LIMIT-1;i++) 
1855                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1856           
1857         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1858
1859
1860         printf("max. chainlength:%5d\n",max_chainlength);
1861
1862         /* avg. chainlength = sum of chainlengths / number of chains */
1863         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1864 }
1865 #endif /* !defined(NDEBUG) */
1866
1867
1868 /*
1869  * These are local overrides for various environment variables in Emacs.
1870  * Please do not remove this and leave it at the end of the file, where
1871  * Emacs will automagically detect them.
1872  * ---------------------------------------------------------------------
1873  * Local variables:
1874  * mode: c
1875  * indent-tabs-mode: t
1876  * c-basic-offset: 4
1877  * tab-width: 4
1878  * End:
1879  * vim:noexpandtab:sw=4:ts=4:
1880  */