65a4126568349fa2784c11d278f46815c4aef54c
[cacao.git] / src / vmcore / utf8.c
1 /* src/vmcore/utf8.c - utf8 string functions
2
3    Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6    J. Wenninger, Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23    02110-1301, USA.
24
25    $Id: utf8.c 7813 2007-04-25 19:20:13Z twisti $
26
27 */
28
29
30 #include "config.h"
31
32 #include <string.h>
33 #include <assert.h>
34
35 #include "vm/types.h"
36
37 #include "mm/memory.h"
38
39 #include "threads/lock-common.h"
40
41 #include "toolbox/hashtable.h"
42
43 #include "vm/exceptions.h"
44
45 #include "vmcore/options.h"
46
47 #if defined(ENABLE_STATISTICS)
48 # include "vmcore/statistics.h"
49 #endif
50
51 #include "vmcore/utf8.h"
52
53
54 /* global variables ***********************************************************/
55
56 /* hashsize must be power of 2 */
57
58 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
59
60 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
61
62
63 /* utf-symbols for pointer comparison of frequently used strings **************/
64
65 utf *utf_java_lang_Object;
66
67 utf *utf_java_lang_Class;
68 utf *utf_java_lang_ClassLoader;
69 utf *utf_java_lang_Cloneable;
70 utf *utf_java_lang_SecurityManager;
71 utf *utf_java_lang_String;
72 utf *utf_java_lang_System;
73 utf *utf_java_lang_ThreadGroup;
74 utf *utf_java_lang_ref_SoftReference;
75 utf *utf_java_lang_ref_WeakReference;
76 utf *utf_java_lang_ref_PhantomReference;
77 utf *utf_java_io_Serializable;
78
79 utf *utf_java_lang_Throwable;
80 utf *utf_java_lang_Error;
81
82 utf *utf_java_lang_AbstractMethodError;
83 utf *utf_java_lang_ClassCircularityError;
84 utf *utf_java_lang_ClassFormatError;
85 utf *utf_java_lang_ExceptionInInitializerError;
86 utf *utf_java_lang_IncompatibleClassChangeError;
87 utf *utf_java_lang_InstantiationError;
88 utf *utf_java_lang_InternalError;
89 utf *utf_java_lang_LinkageError;
90 utf *utf_java_lang_NoClassDefFoundError;
91 utf *utf_java_lang_NoSuchFieldError;
92 utf *utf_java_lang_NoSuchMethodError;
93 utf *utf_java_lang_OutOfMemoryError;
94 utf *utf_java_lang_UnsatisfiedLinkError;
95 utf *utf_java_lang_UnsupportedClassVersionError;
96 utf *utf_java_lang_VerifyError;
97 utf *utf_java_lang_VirtualMachineError;
98
99 #if defined(WITH_CLASSPATH_GNU)
100 utf *utf_java_lang_VMThrowable;
101 #endif
102
103 utf *utf_java_lang_Exception;
104
105 utf *utf_java_lang_ArithmeticException;
106 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
107 utf *utf_java_lang_ArrayStoreException;
108 utf *utf_java_lang_ClassCastException;
109 utf *utf_java_lang_ClassNotFoundException;
110 utf *utf_java_lang_CloneNotSupportedException;
111 utf *utf_java_lang_IllegalAccessException;
112 utf *utf_java_lang_IllegalArgumentException;
113 utf *utf_java_lang_IllegalMonitorStateException;
114 utf *utf_java_lang_InstantiationException;
115 utf *utf_java_lang_InterruptedException;
116 utf *utf_java_lang_NegativeArraySizeException;
117 utf *utf_java_lang_NullPointerException;
118 utf *utf_java_lang_StringIndexOutOfBoundsException;
119
120 utf *utf_java_lang_reflect_InvocationTargetException;
121
122 #if defined(ENABLE_JAVASE)
123 utf* utf_java_lang_Void;
124 #endif
125
126 utf* utf_java_lang_Boolean;
127 utf* utf_java_lang_Byte;
128 utf* utf_java_lang_Character;
129 utf* utf_java_lang_Short;
130 utf* utf_java_lang_Integer;
131 utf* utf_java_lang_Long;
132 utf* utf_java_lang_Float;
133 utf* utf_java_lang_Double;
134
135 #if defined(ENABLE_JAVASE)
136 utf *utf_java_lang_StackTraceElement;
137 utf *utf_java_lang_reflect_Constructor;
138 utf *utf_java_lang_reflect_Field;
139 utf *utf_java_lang_reflect_Method;
140 utf *utf_java_util_Vector;
141 #endif
142
143 utf *utf_InnerClasses;                  /* InnerClasses                       */
144 utf *utf_ConstantValue;                 /* ConstantValue                      */
145 utf *utf_Code;                          /* Code                               */
146 utf *utf_Exceptions;                    /* Exceptions                         */
147 utf *utf_LineNumberTable;               /* LineNumberTable                    */
148 utf *utf_SourceFile;                    /* SourceFile                         */
149
150 #if defined(ENABLE_JAVASE)
151 utf *utf_EnclosingMethod;
152 utf *utf_Signature;
153 utf *utf_RuntimeVisibleAnnotations;
154 utf *utf_StackMapTable;
155 #endif
156
157 utf *utf_init;                          /* <init>                             */
158 utf *utf_clinit;                        /* <clinit>                           */
159 utf *utf_clone;                         /* clone                              */
160 utf *utf_finalize;                      /* finalize                           */
161 utf *utf_run;                           /* run                                */
162
163 utf *utf_add;
164 utf *utf_remove;
165 utf *utf_addThread;
166 utf *utf_removeThread;
167 utf *utf_put;
168 utf *utf_get;
169 utf *utf_value;
170
171 utf *utf_fillInStackTrace;
172 utf *utf_getSystemClassLoader;
173 utf *utf_loadClass;
174 utf *utf_printStackTrace;
175
176 utf *utf_Z;                             /* Z                                  */
177 utf *utf_B;                             /* B                                  */
178 utf *utf_C;                             /* C                                  */
179 utf *utf_S;                             /* S                                  */
180 utf *utf_I;                             /* I                                  */
181 utf *utf_J;                             /* J                                  */
182 utf *utf_F;                             /* F                                  */
183 utf *utf_D;                             /* D                                  */
184
185 utf *utf_void__void;                    /* ()V                                */
186 utf *utf_boolean__void;                 /* (Z)V                               */
187 utf *utf_byte__void;                    /* (B)V                               */
188 utf *utf_char__void;                    /* (C)V                               */
189 utf *utf_short__void;                   /* (S)V                               */
190 utf *utf_int__void;                     /* (I)V                               */
191 utf *utf_long__void;                    /* (J)V                               */
192 utf *utf_float__void;                   /* (F)V                               */
193 utf *utf_double__void;                  /* (D)V                               */
194
195 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
196 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
197 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
198 utf *utf_java_lang_Object__java_lang_Object;
199 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
200 utf *utf_java_lang_String__java_lang_Class;
201 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
202 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
203
204 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
205 utf *utf_null;
206 utf *array_packagename;
207
208
209 /* utf_init ********************************************************************
210
211    Initializes the utf8 subsystem.
212
213 *******************************************************************************/
214
215 bool utf8_init(void)
216 {
217         /* create utf8 hashtable */
218
219         hashtable_utf = NEW(hashtable);
220
221         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
222
223 #if defined(ENABLE_STATISTICS)
224         if (opt_stat)
225                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
226 #endif
227
228         /* create utf-symbols for pointer comparison of frequently used strings */
229
230         utf_java_lang_Object           = utf_new_char("java/lang/Object");
231
232         utf_java_lang_Class            = utf_new_char("java/lang/Class");
233         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
234         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
235         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
236         utf_java_lang_String           = utf_new_char("java/lang/String");
237         utf_java_lang_System           = utf_new_char("java/lang/System");
238         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
239
240         utf_java_lang_ref_SoftReference =
241                 utf_new_char("java/lang/ref/SoftReference");
242
243         utf_java_lang_ref_WeakReference =
244                 utf_new_char("java/lang/ref/WeakReference");
245
246         utf_java_lang_ref_PhantomReference =
247                 utf_new_char("java/lang/ref/PhantomReference");
248
249         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
250
251         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
252         utf_java_lang_Error            = utf_new_char("java/lang/Error");
253
254         utf_java_lang_ClassCircularityError =
255                 utf_new_char("java/lang/ClassCircularityError");
256
257         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
258
259         utf_java_lang_ExceptionInInitializerError =
260                 utf_new_char("java/lang/ExceptionInInitializerError");
261
262         utf_java_lang_IncompatibleClassChangeError =
263                 utf_new_char("java/lang/IncompatibleClassChangeError");
264
265         utf_java_lang_InstantiationError =
266                 utf_new_char("java/lang/InstantiationError");
267
268         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
269         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
270
271         utf_java_lang_NoClassDefFoundError =
272                 utf_new_char("java/lang/NoClassDefFoundError");
273
274         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
275
276         utf_java_lang_UnsatisfiedLinkError =
277                 utf_new_char("java/lang/UnsatisfiedLinkError");
278
279         utf_java_lang_UnsupportedClassVersionError =
280                 utf_new_char("java/lang/UnsupportedClassVersionError");
281
282         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
283
284         utf_java_lang_VirtualMachineError =
285                 utf_new_char("java/lang/VirtualMachineError");
286
287 #if defined(ENABLE_JAVASE)
288         utf_java_lang_AbstractMethodError =
289                 utf_new_char("java/lang/AbstractMethodError");
290
291         utf_java_lang_NoSuchFieldError =
292                 utf_new_char("java/lang/NoSuchFieldError");
293
294         utf_java_lang_NoSuchMethodError =
295                 utf_new_char("java/lang/NoSuchMethodError");
296 #endif
297
298 #if defined(WITH_CLASSPATH_GNU)
299         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
300 #endif
301
302         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
303
304         utf_java_lang_ArithmeticException =
305                 utf_new_char("java/lang/ArithmeticException");
306
307         utf_java_lang_ArrayIndexOutOfBoundsException =
308                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
309
310         utf_java_lang_ArrayStoreException =
311                 utf_new_char("java/lang/ArrayStoreException");
312
313         utf_java_lang_ClassCastException =
314                 utf_new_char("java/lang/ClassCastException");
315
316         utf_java_lang_ClassNotFoundException =
317                 utf_new_char("java/lang/ClassNotFoundException");
318
319         utf_java_lang_CloneNotSupportedException =
320                 utf_new_char("java/lang/CloneNotSupportedException");
321
322         utf_java_lang_IllegalAccessException =
323                 utf_new_char("java/lang/IllegalAccessException");
324
325         utf_java_lang_IllegalArgumentException =
326                 utf_new_char("java/lang/IllegalArgumentException");
327
328         utf_java_lang_IllegalMonitorStateException =
329                 utf_new_char("java/lang/IllegalMonitorStateException");
330
331         utf_java_lang_InstantiationException =
332                 utf_new_char("java/lang/InstantiationException");
333
334         utf_java_lang_InterruptedException =
335                 utf_new_char("java/lang/InterruptedException");
336
337         utf_java_lang_NegativeArraySizeException =
338                 utf_new_char("java/lang/NegativeArraySizeException");
339
340         utf_java_lang_NullPointerException =
341                 utf_new_char("java/lang/NullPointerException");
342
343         utf_java_lang_StringIndexOutOfBoundsException =
344                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
345
346         utf_java_lang_reflect_InvocationTargetException =
347                 utf_new_char("java/lang/reflect/InvocationTargetException");
348  
349 #if defined(ENABLE_JAVASE)
350         utf_java_lang_Void             = utf_new_char("java/lang/Void");
351 #endif
352
353         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
354         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
355         utf_java_lang_Character        = utf_new_char("java/lang/Character");
356         utf_java_lang_Short            = utf_new_char("java/lang/Short");
357         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
358         utf_java_lang_Long             = utf_new_char("java/lang/Long");
359         utf_java_lang_Float            = utf_new_char("java/lang/Float");
360         utf_java_lang_Double           = utf_new_char("java/lang/Double");
361
362 #if defined(ENABLE_JAVASE)
363         utf_java_lang_StackTraceElement =
364                 utf_new_char("java/lang/StackTraceElement");
365
366         utf_java_lang_reflect_Constructor =
367                 utf_new_char("java/lang/reflect/Constructor");
368
369         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
370         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
371         utf_java_util_Vector           = utf_new_char("java/util/Vector");
372 #endif
373
374         utf_InnerClasses               = utf_new_char("InnerClasses");
375         utf_ConstantValue              = utf_new_char("ConstantValue");
376         utf_Code                       = utf_new_char("Code");
377         utf_Exceptions                 = utf_new_char("Exceptions");
378         utf_LineNumberTable            = utf_new_char("LineNumberTable");
379         utf_SourceFile                 = utf_new_char("SourceFile");
380
381 #if defined(ENABLE_JAVASE)
382         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
383         utf_Signature                  = utf_new_char("Signature");
384         utf_RuntimeVisibleAnnotations  = utf_new_char("RuntimeVisibleAnnotations");
385         utf_StackMapTable              = utf_new_char("StackMapTable");
386 #endif
387
388         utf_init                           = utf_new_char("<init>");
389         utf_clinit                         = utf_new_char("<clinit>");
390         utf_clone                      = utf_new_char("clone");
391         utf_finalize                   = utf_new_char("finalize");
392         utf_run                        = utf_new_char("run");
393
394         utf_add                        = utf_new_char("add");
395         utf_remove                     = utf_new_char("remove");
396         utf_addThread                  = utf_new_char("addThread");
397         utf_removeThread               = utf_new_char("removeThread");
398         utf_put                        = utf_new_char("put");
399         utf_get                        = utf_new_char("get");
400         utf_value                      = utf_new_char("value");
401
402         utf_printStackTrace            = utf_new_char("printStackTrace");
403         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
404         utf_loadClass                  = utf_new_char("loadClass");
405         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
406
407         utf_Z                          = utf_new_char("Z");
408         utf_B                          = utf_new_char("B");
409         utf_C                          = utf_new_char("C");
410         utf_S                          = utf_new_char("S");
411         utf_I                          = utf_new_char("I");
412         utf_J                          = utf_new_char("J");
413         utf_F                          = utf_new_char("F");
414         utf_D                          = utf_new_char("D");
415
416         utf_void__void                 = utf_new_char("()V");
417         utf_boolean__void              = utf_new_char("(Z)V");
418         utf_byte__void                 = utf_new_char("(B)V");
419         utf_char__void                 = utf_new_char("(C)V");
420         utf_short__void                = utf_new_char("(S)V");
421         utf_int__void                  = utf_new_char("(I)V");
422         utf_long__void                 = utf_new_char("(J)V");
423         utf_float__void                = utf_new_char("(F)V");
424         utf_double__void               = utf_new_char("(D)V");
425         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
426         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
427
428         utf_void__java_lang_ClassLoader =
429                 utf_new_char("()Ljava/lang/ClassLoader;");
430
431         utf_java_lang_Object__java_lang_Object =
432                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
433
434         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
435
436         utf_java_lang_String__java_lang_Class =
437                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
438
439         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
440         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
441
442         utf_null                       = utf_new_char("null");
443         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
444         array_packagename              = utf_new_char("\t<the array package>");
445
446         /* everything's ok */
447
448         return true;
449 }
450
451
/* utf_hashkey *****************************************************************

   The hashkey is computed from the utf-text by using up to 8
   characters.  For utf-symbols longer than 15 characters 3 characters
   are taken from the beginning and the end, 2 characters are taken
   from the middle.

   Every selected byte is converted to u4, shifted left by an amount
   that depends on its position, and XORed into the result.

   IN:
       text.....the bytes to hash; `length' bytes are readable
       length...number of bytes in `text'

   RETURN VALUE:
       the hashkey, 0 for the empty string

*******************************************************************************/

/* NOTE: nbs() advances `text' as a side effect.  Using it more than once
   in a single expression, or together with fbs(), modifies and reads
   `text' without sequencing, which is undefined behaviour in C.
   utf_hashkey() therefore uses ibs() instead; nbs() and fbs() are only
   kept for code that may still refer to them. */

#define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
#define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */

/* get byte at index idx, left shift by val (no side effects) */

#define ibs(idx, val) ((u4) text[(idx)] << (val))

u4 utf_hashkey(const char *text, u4 length)
{
	u4 mid;                             /* index of the first middle byte     */
	u4 a;

	switch (length) {
	case 0: /* empty string */
		return 0;

	case 1:
		return ibs(0, 0);

	case 2:
		return ibs(0, 0) ^ ibs(1, 3);

	case 3:
		return ibs(0, 0) ^ ibs(1, 3) ^ ibs(2, 5);

	case 4:
		return ibs(0, 0) ^ ibs(1, 2) ^ ibs(2, 4) ^ ibs(3, 6);

	case 5:
		return ibs(0, 0) ^ ibs(1, 2) ^ ibs(2, 3) ^ ibs(3, 4) ^ ibs(4, 6);

	case 6:
		return ibs(0, 0) ^ ibs(1, 1) ^ ibs(2, 2) ^ ibs(3, 3)
			 ^ ibs(4, 5) ^ ibs(5, 6);

	case 7:
		return ibs(0, 0) ^ ibs(1, 1) ^ ibs(2, 2) ^ ibs(3, 3)
			 ^ ibs(4, 4) ^ ibs(5, 5) ^ ibs(6, 6);

	case 8:
		return ibs(0, 0) ^ ibs(1, 1) ^ ibs(2, 2) ^ ibs(3, 3)
			 ^ ibs(4, 4) ^ ibs(5, 5) ^ ibs(6, 6) ^ ibs(7, 7);

	case 9: /* byte 3 is skipped */
		return ibs(0, 0) ^ ibs(1, 1) ^ ibs(2, 2)
			 ^ ibs(4, 4) ^ ibs(5, 5) ^ ibs(6, 6) ^ ibs(7, 7) ^ ibs(8, 8);

	case 10: /* bytes 1 and 5 are skipped */
		return ibs(0, 0)
			 ^ ibs(2, 2) ^ ibs(3, 3) ^ ibs(4, 4)
			 ^ ibs(6, 6) ^ ibs(7, 7) ^ ibs(8, 8) ^ ibs(9, 9);

	case 11: /* bytes 1 and 5 are skipped, 9 bytes are used */
		return ibs(0, 0)
			 ^ ibs(2, 2) ^ ibs(3, 3) ^ ibs(4, 4)
			 ^ ibs(6, 6) ^ ibs(7, 7) ^ ibs(8, 8) ^ ibs(9, 9) ^ ibs(10, 10);

	case 12: /* here the shift is one less than the byte index */
		return ibs(0, 0)
			 ^ ibs(3, 2) ^ ibs(4, 3)
			 ^ ibs(6, 5) ^ ibs(7, 6) ^ ibs(8, 7)
			 ^ ibs(10, 9) ^ ibs(11, 10);

	case 13:
		return ibs(0, 0) ^ ibs(1, 1)
			 ^ ibs(3, 3) ^ ibs(4, 4)
			 ^ ibs(7, 7) ^ ibs(8, 8)
			 ^ ibs(11, 9) ^ ibs(12, 10);

	case 14:
	case 15: /* both lengths use the same bytes and shifts */
		return ibs(0, 0)
			 ^ ibs(3, 3) ^ ibs(4, 4)
			 ^ ibs(7, 7) ^ ibs(8, 8)
			 ^ ibs(11, 9) ^ ibs(12, 10) ^ ibs(13, 11);

	default:
		mid = length / 2;

		/* 3 characters from beginning */
		a  = ibs(0, 0) ^ ibs(3, 3) ^ ibs(4, 4);

		/* 2 characters from middle */
		a ^= ibs(mid, 5) ^ ibs(mid + 3, 6);

		/* 3 characters from end */
		a ^= ibs(length - 4, 7) ^ ibs(length - 2, 10) ^ ibs(length - 1, 11);

		return a;
	}
}
574
/* utf_full_hashkey ************************************************************

   Computes a hash value from every byte of the string (utf_hashkey
   only samples up to 8 of them).

   The algorithm is the "One-at-a-time" algorithm as published
   by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.

   IN:
       text.....the bytes to hash, read as unsigned char
       length...number of bytes in `text'

   RETURN VALUE:
       the hash value, 0 for the empty string

*******************************************************************************/

u4 utf_full_hashkey(const char *text, u4 length)
{
	const unsigned char *bytes = (const unsigned char *) text;
	u4                   acc   = 0;
	u4                   n;

	/* mixing step, once per input byte */

	for (n = 0; n < length; n++) {
		acc += bytes[n];
		acc += acc << 10;
		acc ^= acc >> 6;
	}

	/* final avalanche */

	acc += acc << 3;
	acc ^= acc >> 11;
	acc += acc << 15;

	return acc;
}
603
604 /* unicode_hashkey *************************************************************
605
606    Compute the hashkey of a unicode string.
607
608 *******************************************************************************/
609
610 u4 unicode_hashkey(u2 *text, u2 len)
611 {
612         return utf_hashkey((char *) text, len);
613 }
614
615
616 /* utf_new *********************************************************************
617
618    Creates a new utf-symbol, the text of the symbol is passed as a
619    u1-array. The function searches the utf-hashtable for a utf-symbol
620    with this text. On success the element returned, otherwise a new
621    hashtable element is created.
622
623    If the number of entries in the hashtable exceeds twice the size of
624    the hashtable slots a reorganization of the hashtable is done and
625    the utf symbols are copied to a new hashtable with doubled size.
626
627 *******************************************************************************/
628
629 utf *utf_new(const char *text, u2 length)
630 {
631         u4 key;                             /* hashkey computed from utf-text     */
632         u4 slot;                            /* slot in hashtable                  */
633         utf *u;                             /* hashtable element                  */
634         u2 i;
635
636         LOCK_MONITOR_ENTER(hashtable_utf->header);
637
638 #if defined(ENABLE_STATISTICS)
639         if (opt_stat)
640                 count_utf_new++;
641 #endif
642
643         key  = utf_hashkey(text, length);
644         slot = key & (hashtable_utf->size - 1);
645         u    = hashtable_utf->ptr[slot];
646
647         /* search external hash chain for utf-symbol */
648
649         while (u) {
650                 if (u->blength == length) {
651                         /* compare text of hashtable elements */
652
653                         for (i = 0; i < length; i++)
654                                 if (text[i] != u->text[i])
655                                         goto nomatch;
656                         
657 #if defined(ENABLE_STATISTICS)
658                         if (opt_stat)
659                                 count_utf_new_found++;
660 #endif
661
662                         /* symbol found in hashtable */
663
664                         LOCK_MONITOR_EXIT(hashtable_utf->header);
665
666                         return u;
667                 }
668
669         nomatch:
670                 u = u->hashlink; /* next element in external chain */
671         }
672
673         /* location in hashtable found, create new utf element */
674
675         u = NEW(utf);
676
677         u->blength  = length;               /* length in bytes of utfstring       */
678         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
679         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
680
681         memcpy(u->text, text, length);      /* copy utf-text                      */
682         u->text[length] = '\0';
683
684 #if defined(ENABLE_STATISTICS)
685         if (opt_stat)
686                 count_utf_len += sizeof(utf) + length + 1;
687 #endif
688
689         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
690         hashtable_utf->entries++;           /* update number of entries           */
691
692         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
693
694         /* reorganization of hashtable, average length of the external
695            chains is approx. 2 */
696
697                 hashtable *newhash;                              /* the new hashtable */
698                 u4         i;
699                 utf       *u;
700                 utf       *nextu;
701                 u4         slot;
702
703                 /* create new hashtable, double the size */
704
705                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
706
707 #if defined(ENABLE_STATISTICS)
708                 if (opt_stat)
709                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
710 #endif
711
712                 /* transfer elements to new hashtable */
713
714                 for (i = 0; i < hashtable_utf->size; i++) {
715                         u = hashtable_utf->ptr[i];
716
717                         while (u) {
718                                 nextu = u->hashlink;
719                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
720                                                 
721                                 u->hashlink = (utf *) newhash->ptr[slot];
722                                 newhash->ptr[slot] = u;
723
724                                 /* follow link in external hash chain */
725
726                                 u = nextu;
727                         }
728                 }
729         
730                 /* dispose old table */
731
732                 hashtable_free(hashtable_utf);
733
734                 hashtable_utf = newhash;
735         }
736
737         LOCK_MONITOR_EXIT(hashtable_utf->header);
738
739         return u;
740 }
741
742
743 /* utf_new_u2 ******************************************************************
744
745    Make utf symbol from u2 array, if isclassname is true '.' is
746    replaced by '/'.
747
748 *******************************************************************************/
749
750 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
751 {
752         char *buffer;                   /* memory buffer for  unicode characters  */
753         char *pos;                      /* pointer to current position in buffer  */
754         u4 left;                        /* unicode characters left                */
755         u4 buflength;                   /* utf length in bytes of the u2 array    */
756         utf *result;                    /* resulting utf-string                   */
757         int i;          
758
759         /* determine utf length in bytes and allocate memory */
760
761         buflength = u2_utflength(unicode_pos, unicode_length); 
762         buffer    = MNEW(char, buflength);
763  
764         left = buflength;
765         pos  = buffer;
766
767         for (i = 0; i++ < unicode_length; unicode_pos++) {
768                 /* next unicode character */
769                 u2 c = *unicode_pos;
770                 
771                 if ((c != 0) && (c < 0x80)) {
772                         /* 1 character */       
773                         left--;
774                 if ((int) left < 0) break;
775                         /* convert classname */
776                         if (isclassname && c == '.')
777                                 *pos++ = '/';
778                         else
779                                 *pos++ = (char) c;
780
781                 } else if (c < 0x800) {             
782                         /* 2 characters */                              
783                 unsigned char high = c >> 6;
784                 unsigned char low  = c & 0x3F;
785                         left = left - 2;
786                 if ((int) left < 0) break;
787                 *pos++ = high | 0xC0; 
788                 *pos++ = low  | 0x80;     
789
790                 } else {         
791                 /* 3 characters */                              
792                 char low  = c & 0x3f;
793                 char mid  = (c >> 6) & 0x3F;
794                 char high = c >> 12;
795                         left = left - 3;
796                 if ((int) left < 0) break;
797                 *pos++ = high | 0xE0; 
798                 *pos++ = mid  | 0x80;  
799                 *pos++ = low  | 0x80;   
800                 }
801         }
802         
803         /* insert utf-string into symbol-table */
804         result = utf_new(buffer,buflength);
805
806         MFREE(buffer, char, buflength);
807
808         return result;
809 }
810
811
812 /* utf_new_char ****************************************************************
813
814    Creates a new utf symbol, the text for this symbol is passed as a
815    c-string ( = char* ).
816
817 *******************************************************************************/
818
819 utf *utf_new_char(const char *text)
820 {
821         return utf_new(text, strlen(text));
822 }
823
824
825 /* utf_new_char_classname ******************************************************
826
827    Creates a new utf symbol, the text for this symbol is passed as a
828    c-string ( = char* ) "." characters are going to be replaced by
829    "/". Since the above function is used often, this is a separte
830    function, instead of an if.
831
832 *******************************************************************************/
833
834 utf *utf_new_char_classname(const char *text)
835 {
836         if (strchr(text, '.')) {
837                 char *txt = strdup(text);
838                 char *end = txt + strlen(txt);
839                 char *c;
840                 utf *tmpRes;
841
842                 for (c = txt; c < end; c++)
843                         if (*c == '.') *c = '/';
844
845                 tmpRes = utf_new(txt, strlen(txt));
846                 FREE(txt, 0);
847
848                 return tmpRes;
849
850         } else
851                 return utf_new(text, strlen(text));
852 }
853
854
855 /* utf_nextu2 ******************************************************************
856
857    Read the next unicode character from the utf string and increment
858    the utf-string pointer accordingly.
859
860    CAUTION: This function is unsafe for input that was not checked 
861             by is_valid_utf!
862
863 *******************************************************************************/
864
865 u2 utf_nextu2(char **utf_ptr)
866 {
867     /* uncompressed unicode character */
868     u2 unicode_char = 0;
869     /* current position in utf text */  
870     unsigned char *utf = (unsigned char *) (*utf_ptr);
871     /* bytes representing the unicode character */
872     unsigned char ch1, ch2, ch3;
873     /* number of bytes used to represent the unicode character */
874     int len = 0;
875         
876     switch ((ch1 = utf[0]) >> 4) {
877         default: /* 1 byte */
878                 (*utf_ptr)++;
879                 return (u2) ch1;
880         case 0xC: 
881         case 0xD: /* 2 bytes */
882                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
883                         unsigned char high = ch1 & 0x1F;
884                         unsigned char low  = ch2 & 0x3F;
885                         unicode_char = (high << 6) + low;
886                         len = 2;
887                 }
888                 break;
889
890         case 0xE: /* 2 or 3 bytes */
891                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
892                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
893                                 unsigned char low  = ch3 & 0x3f;
894                                 unsigned char mid  = ch2 & 0x3f;
895                                 unsigned char high = ch1 & 0x0f;
896                                 unicode_char = (((high << 6) + mid) << 6) + low;
897                                 len = 3;
898                         } else
899                                 len = 2;                                           
900                 }
901                 break;
902     }
903
904     /* update position in utf-text */
905     *utf_ptr = (char *) (utf + len);
906
907     return unicode_char;
908 }
909
910
911 /* utf_bytes *******************************************************************
912
913    Determine number of bytes (aka. octets) in the utf string.
914
915    IN:
916       u............utf string
917
918    OUT:
919       The number of octets of this utf string.
920           There is _no_ terminating zero included in this count.
921
922 *******************************************************************************/
923
924 u4 utf_bytes(utf *u)
925 {
926         return u->blength;
927 }
928
929
930 /* utf_get_number_of_u2s_for_buffer ********************************************
931
932    Determine number of UTF-16 u2s in the given UTF-8 buffer
933
934    CAUTION: This function is unsafe for input that was not checked 
935             by is_valid_utf!
936
937    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
938    to an array of u2s (UTF-16) and want to know how many of them you will get.
939    All other uses of this function are probably wrong.
940
941    IN:
942       buffer........points to first char in buffer
943           blength.......number of _bytes_ in the buffer
944
945    OUT:
946       the number of u2s needed to hold this string in UTF-16 encoding.
947           There is _no_ terminating zero included in this count.
948
949    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
950    exception.
951
952 *******************************************************************************/
953
954 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
955 {
956         const char *endpos;                 /* points behind utf string           */
957         const char *utf_ptr;                /* current position in utf text       */
958         u4 len = 0;                         /* number of unicode characters       */
959
960         utf_ptr = buffer;
961         endpos = utf_ptr + blength;
962
963         while (utf_ptr < endpos) {
964                 len++;
965                 /* next unicode character */
966                 utf_nextu2((char **)&utf_ptr);
967         }
968
969         assert(utf_ptr == endpos);
970
971         return len;
972 }
973
974
975 /* utf_get_number_of_u2s *******************************************************
976
977    Determine number of UTF-16 u2s in the utf string.
978
979    CAUTION: This function is unsafe for input that was not checked 
980             by is_valid_utf!
981
982    CAUTION: Use this function *only* when you want to convert a utf string
983    to an array of u2s and want to know how many of them you will get.
984    All other uses of this function are probably wrong.
985
986    IN:
987       u............utf string
988
989    OUT:
990       the number of u2s needed to hold this string in UTF-16 encoding.
991           There is _no_ terminating zero included in this count.
992           XXX 0 if a NullPointerException has been thrown (see below)
993
994 *******************************************************************************/
995
996 u4 utf_get_number_of_u2s(utf *u)
997 {
998         char *endpos;                       /* points behind utf string           */
999         char *utf_ptr;                      /* current position in utf text       */
1000         u4 len = 0;                         /* number of unicode characters       */
1001
1002         /* XXX this is probably not checked by most callers! Review this after */
1003         /* the invalid uses of this function have been eliminated */
1004         if (u == NULL) {
1005                 exceptions_throw_nullpointerexception();
1006                 return 0;
1007         }
1008
1009         endpos = UTF_END(u);
1010         utf_ptr = u->text;
1011
1012         while (utf_ptr < endpos) {
1013                 len++;
1014                 /* next unicode character */
1015                 utf_nextu2(&utf_ptr);
1016         }
1017
1018         if (utf_ptr != endpos) {
1019                 /* string ended abruptly */
1020                 exceptions_throw_internalerror("Illegal utf8 string");
1021                 return 0;
1022         }
1023
1024         return len;
1025 }
1026
1027
1028 /* utf8_safe_number_of_u2s *****************************************************
1029
1030    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1031    (For invalid UTF-8 the U+fffd replacement character will be counted.)
1032
1033    This function is safe even for invalid UTF-8 strings.
1034
1035    IN:
1036       text..........zero-terminated(!) UTF-8 string (may be invalid)
1037                         must NOT be NULL
1038           nbytes........strlen(text). (This is needed to completely emulate
1039                         the RI).
1040
1041    OUT:
1042       the number of u2s needed to hold this string in UTF-16 encoding.
1043           There is _no_ terminating zero included in this count.
1044
1045 *******************************************************************************/
1046
1047 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1048         register const unsigned char *t;
1049         register s4 byte;
1050         register s4 len;
1051         register const unsigned char *tlimit;
1052         s4 byte1;
1053         s4 byte2;
1054         s4 byte3;
1055         s4 value;
1056         s4 skip;
1057
1058         assert(text);
1059         assert(nbytes >= 0);
1060
1061         len = 0;
1062         t = (const unsigned char *) text;
1063         tlimit = t + nbytes;
1064
1065         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1066
1067         while (1) {
1068                 byte = *t++;
1069
1070                 if (byte & 0x80) {
1071                         /* highest bit set, non-ASCII character */
1072
1073                         if ((byte & 0xe0) == 0xc0) {
1074                                 /* 2-byte: should be 110..... 10...... ? */
1075
1076                                 if ((*t++ & 0xc0) == 0x80)
1077                                         ; /* valid 2-byte */
1078                                 else
1079                                         t--; /* invalid */
1080                         }
1081                         else if ((byte & 0xf0) == 0xe0) {
1082                                 /* 3-byte: should be 1110.... 10...... 10...... */
1083                                 /*                            ^t                */
1084
1085                                 if (t + 2 > tlimit)
1086                                         return len + 1; /* invalid, stop here */
1087
1088                                 if ((*t++ & 0xc0) == 0x80) {
1089                                         if ((*t++ & 0xc0) == 0x80)
1090                                                 ; /* valid 3-byte */
1091                                         else
1092                                                 t--; /* invalid */
1093                                 }
1094                                 else
1095                                         t--; /* invalid */
1096                         }
1097                         else if ((byte & 0xf8) == 0xf0) {
1098                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1099                                 /*                            ^t                         */
1100
1101                                 if (t + 3 > tlimit)
1102                                         return len + 1; /* invalid, stop here */
1103
1104                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1105                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1106                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1107                                                         /* valid 4-byte UTF-8? */
1108                                                         value = ((byte  & 0x07) << 18)
1109                                                                   | ((byte1 & 0x3f) << 12)
1110                                                                   | ((byte2 & 0x3f) <<  6)
1111                                                                   | ((byte3 & 0x3f)      );
1112
1113                                                         if (value > 0x10FFFF)
1114                                                                 ; /* invalid */
1115                                                         else if (value > 0xFFFF)
1116                                                                 len += 1; /* we need surrogates */
1117                                                         else
1118                                                                 ; /* 16bit suffice */
1119                                                 }
1120                                                 else
1121                                                         t--; /* invalid */
1122                                         }
1123                                         else
1124                                                 t--; /* invalid */
1125                                 }
1126                                 else
1127                                         t--; /* invalid */
1128                         }
1129                         else if ((byte & 0xfc) == 0xf8) {
1130                                 /* invalid 5-byte */
1131                                 if (t + 4 > tlimit)
1132                                         return len + 1; /* invalid, stop here */
1133
1134                                 skip = 4;
1135                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1136                                         t++;
1137                         }
1138                         else if ((byte & 0xfe) == 0xfc) {
1139                                 /* invalid 6-byte */
1140                                 if (t + 5 > tlimit)
1141                                         return len + 1; /* invalid, stop here */
1142
1143                                 skip = 5;
1144                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1145                                         t++;
1146                         }
1147                         else
1148                                 ; /* invalid */
1149                 }
1150                 else {
1151                         /* NUL */
1152
1153                         if (byte == 0)
1154                                 break;
1155
1156                         /* ASCII character, common case */
1157                 }
1158
1159                 len++;
1160         }
1161
1162         return len;
1163 }
1164
1165
1166 /* utf8_safe_convert_to_u2s ****************************************************
1167
1168    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1169    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1170    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1171
1172    This function is safe even for invalid UTF-8 strings.
1173
1174    IN:
1175       text..........zero-terminated(!) UTF-8 string (may be invalid)
1176                         must NOT be NULL
1177           nbytes........strlen(text). (This is needed to completely emulate
1178                                         the RI).
1179           buffer........a preallocated array of u2s to receive the decoded
1180                         string. Use utf8_safe_number_of_u2s to get the
1181                                         required number of u2s for allocating this.
1182
1183 *******************************************************************************/
1184
1185 #define UNICODE_REPLACEMENT  0xfffd
1186
1187 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1188         register const unsigned char *t;
1189         register s4 byte;
1190         register const unsigned char *tlimit;
1191         s4 byte1;
1192         s4 byte2;
1193         s4 byte3;
1194         s4 value;
1195         s4 skip;
1196
1197         assert(text);
1198         assert(nbytes >= 0);
1199
1200         t = (const unsigned char *) text;
1201         tlimit = t + nbytes;
1202
1203         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1204
1205         while (1) {
1206                 byte = *t++;
1207
1208                 if (byte & 0x80) {
1209                         /* highest bit set, non-ASCII character */
1210
1211                         if ((byte & 0xe0) == 0xc0) {
1212                                 /* 2-byte: should be 110..... 10...... */
1213
1214                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1215                                         /* valid 2-byte UTF-8 */
1216                                         *buffer++ = ((byte  & 0x1f) << 6)
1217                                                           | ((byte1 & 0x3f)     );
1218                                 }
1219                                 else {
1220                                         *buffer++ = UNICODE_REPLACEMENT;
1221                                         t--;
1222                                 }
1223                         }
1224                         else if ((byte & 0xf0) == 0xe0) {
1225                                 /* 3-byte: should be 1110.... 10...... 10...... */
1226
1227                                 if (t + 2 > tlimit) {
1228                                         *buffer++ = UNICODE_REPLACEMENT;
1229                                         return;
1230                                 }
1231
1232                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1233                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1234                                                 /* valid 3-byte UTF-8 */
1235                                                 *buffer++ = ((byte  & 0x0f) << 12)
1236                                                                   | ((byte1 & 0x3f) <<  6)
1237                                                                   | ((byte2 & 0x3f)      );
1238                                         }
1239                                         else {
1240                                                 *buffer++ = UNICODE_REPLACEMENT;
1241                                                 t--;
1242                                         }
1243                                 }
1244                                 else {
1245                                         *buffer++ = UNICODE_REPLACEMENT;
1246                                         t--;
1247                                 }
1248                         }
1249                         else if ((byte & 0xf8) == 0xf0) {
1250                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1251
1252                                 if (t + 3 > tlimit) {
1253                                         *buffer++ = UNICODE_REPLACEMENT;
1254                                         return;
1255                                 }
1256
1257                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1258                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1259                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1260                                                         /* valid 4-byte UTF-8? */
1261                                                         value = ((byte  & 0x07) << 18)
1262                                                                   | ((byte1 & 0x3f) << 12)
1263                                                                   | ((byte2 & 0x3f) <<  6)
1264                                                                   | ((byte3 & 0x3f)      );
1265
1266                                                         if (value > 0x10FFFF) {
1267                                                                 *buffer++ = UNICODE_REPLACEMENT;
1268                                                         }
1269                                                         else if (value > 0xFFFF) {
1270                                                                 /* we need surrogates */
1271                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1272                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1273                                                         }
1274                                                         else
1275                                                                 *buffer++ = value; /* 16bit suffice */
1276                                                 }
1277                                                 else {
1278                                                         *buffer++ = UNICODE_REPLACEMENT;
1279                                                         t--;
1280                                                 }
1281                                         }
1282                                         else {
1283                                                 *buffer++ = UNICODE_REPLACEMENT;
1284                                                 t--;
1285                                         }
1286                                 }
1287                                 else {
1288                                         *buffer++ = UNICODE_REPLACEMENT;
1289                                         t--;
1290                                 }
1291                         }
1292                         else if ((byte & 0xfc) == 0xf8) {
1293                                 if (t + 4 > tlimit) {
1294                                         *buffer++ = UNICODE_REPLACEMENT;
1295                                         return;
1296                                 }
1297
1298                                 skip = 4;
1299                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1300                                         t++;
1301                                 *buffer++ = UNICODE_REPLACEMENT;
1302                         }
1303                         else if ((byte & 0xfe) == 0xfc) {
1304                                 if (t + 5 > tlimit) {
1305                                         *buffer++ = UNICODE_REPLACEMENT;
1306                                         return;
1307                                 }
1308
1309                                 skip = 5;
1310                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1311                                         t++;
1312                                 *buffer++ = UNICODE_REPLACEMENT;
1313                         }
1314                         else
1315                                 *buffer++ = UNICODE_REPLACEMENT;
1316                 }
1317                 else {
1318                         /* NUL */
1319
1320                         if (byte == 0)
1321                                 break;
1322
1323                         /* ASCII character, common case */
1324
1325                         *buffer++ = byte;
1326                 }
1327         }
1328 }
1329
1330
1331 /* u2_utflength ****************************************************************
1332
1333    Returns the utf length in bytes of a u2 array.
1334
1335 *******************************************************************************/
1336
1337 u4 u2_utflength(u2 *text, u4 u2_length)
1338 {
1339         u4 result_len = 0;                  /* utf length in bytes                */
1340         u2 ch;                              /* current unicode character          */
1341         u4 len;
1342         
1343         for (len = 0; len < u2_length; len++) {
1344                 /* next unicode character */
1345                 ch = *text++;
1346           
1347                 /* determine bytes required to store unicode character as utf */
1348                 if (ch && (ch < 0x80)) 
1349                         result_len++;
1350                 else if (ch < 0x800)
1351                         result_len += 2;        
1352                 else 
1353                         result_len += 3;        
1354         }
1355
1356     return result_len;
1357 }
1358
1359
1360 /* utf_copy ********************************************************************
1361
1362    Copy the given utf string byte-for-byte to a buffer.
1363
1364    IN:
1365       buffer.......the buffer
1366           u............the utf string
1367
1368 *******************************************************************************/
1369
1370 void utf_copy(char *buffer, utf *u)
1371 {
1372         /* our utf strings are zero-terminated (done by utf_new) */
1373         MCOPY(buffer, u->text, char, u->blength + 1);
1374 }
1375
1376
1377 /* utf_cat *********************************************************************
1378
1379    Append the given utf string byte-for-byte to a buffer.
1380
1381    IN:
1382       buffer.......the buffer
1383           u............the utf string
1384
1385 *******************************************************************************/
1386
1387 void utf_cat(char *buffer, utf *u)
1388 {
1389         /* our utf strings are zero-terminated (done by utf_new) */
1390         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1391 }
1392
1393
1394 /* utf_copy_classname **********************************************************
1395
1396    Copy the given utf classname byte-for-byte to a buffer.
1397    '/' is replaced by '.'
1398
1399    IN:
1400       buffer.......the buffer
1401           u............the utf string
1402
1403 *******************************************************************************/
1404
1405 void utf_copy_classname(char *buffer, utf *u)
1406 {
1407         char *bufptr;
1408         char *srcptr;
1409         char *endptr;
1410         char ch;
1411
1412         bufptr = buffer;
1413         srcptr = u->text;
1414         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1415
1416         while (srcptr != endptr) {
1417                 ch = *srcptr++;
1418                 if (ch == '/')
1419                         ch = '.';
1420                 *bufptr++ = ch;
1421         }
1422 }
1423
1424
1425 /* utf_cat *********************************************************************
1426
1427    Append the given utf classname byte-for-byte to a buffer.
1428    '/' is replaced by '.'
1429
1430    IN:
1431       buffer.......the buffer
1432           u............the utf string
1433
1434 *******************************************************************************/
1435
1436 void utf_cat_classname(char *buffer, utf *u)
1437 {
1438         utf_copy_classname(buffer + strlen(buffer), u);
1439 }
1440
1441 /* utf_display_printable_ascii *************************************************
1442
1443    Write utf symbol to stdout (for debugging purposes).
1444    Non-printable and non-ASCII characters are printed as '?'.
1445
1446 *******************************************************************************/
1447
1448 void utf_display_printable_ascii(utf *u)
1449 {
1450         char *endpos;                       /* points behind utf string           */
1451         char *utf_ptr;                      /* current position in utf text       */
1452
1453         if (u == NULL) {
1454                 printf("NULL");
1455                 fflush(stdout);
1456                 return;
1457         }
1458
1459         endpos = UTF_END(u);
1460         utf_ptr = u->text;
1461
1462         while (utf_ptr < endpos) {
1463                 /* read next unicode character */
1464
1465                 u2 c = utf_nextu2(&utf_ptr);
1466
1467                 if ((c >= 32) && (c <= 127))
1468                         printf("%c", c);
1469                 else
1470                         printf("?");
1471         }
1472
1473         fflush(stdout);
1474 }
1475
1476
1477 /* utf_display_printable_ascii_classname ***************************************
1478
1479    Write utf symbol to stdout with `/' converted to `.' (for debugging
1480    purposes).
1481    Non-printable and non-ASCII characters are printed as '?'.
1482
1483 *******************************************************************************/
1484
1485 void utf_display_printable_ascii_classname(utf *u)
1486 {
1487         char *endpos;                       /* points behind utf string           */
1488         char *utf_ptr;                      /* current position in utf text       */
1489
1490         if (u == NULL) {
1491                 printf("NULL");
1492                 fflush(stdout);
1493                 return;
1494         }
1495
1496         endpos = UTF_END(u);
1497         utf_ptr = u->text;
1498
1499         while (utf_ptr < endpos) {
1500                 /* read next unicode character */
1501
1502                 u2 c = utf_nextu2(&utf_ptr);
1503
1504                 if (c == '/')
1505                         c = '.';
1506
1507                 if ((c >= 32) && (c <= 127))
1508                         printf("%c", c);
1509                 else
1510                         printf("?");
1511         }
1512
1513         fflush(stdout);
1514 }
1515
1516
1517 /* utf_sprint_convert_to_latin1 ************************************************
1518         
1519    Write utf symbol into c-string (for debugging purposes).
1520    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1521    invalid results.
1522
1523 *******************************************************************************/
1524
1525 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1526 {
1527         char *endpos;                       /* points behind utf string           */
1528         char *utf_ptr;                      /* current position in utf text       */
1529         u2 pos = 0;                         /* position in c-string               */
1530
1531         if (!u) {
1532                 strcpy(buffer, "NULL");
1533                 return;
1534         }
1535
1536         endpos = UTF_END(u);
1537         utf_ptr = u->text;
1538
1539         while (utf_ptr < endpos) 
1540                 /* copy next unicode character */       
1541                 buffer[pos++] = utf_nextu2(&utf_ptr);
1542
1543         /* terminate string */
1544         buffer[pos] = '\0';
1545 }
1546
1547
1548 /* utf_sprint_convert_to_latin1_classname **************************************
1549         
1550    Write utf symbol into c-string with `/' converted to `.' (for debugging
1551    purposes).
1552    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1553    invalid results.
1554
1555 *******************************************************************************/
1556
1557 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1558 {
1559         char *endpos;                       /* points behind utf string           */
1560         char *utf_ptr;                      /* current position in utf text       */
1561         u2 pos = 0;                         /* position in c-string               */
1562
1563         if (!u) {
1564                 strcpy(buffer, "NULL");
1565                 return;
1566         }
1567
1568         endpos = UTF_END(u);
1569         utf_ptr = u->text;
1570
1571         while (utf_ptr < endpos) {
1572                 /* copy next unicode character */       
1573                 u2 c = utf_nextu2(&utf_ptr);
1574                 if (c == '/') c = '.';
1575                 buffer[pos++] = c;
1576         }
1577
1578         /* terminate string */
1579         buffer[pos] = '\0';
1580 }
1581
1582
1583 /* utf_strcat_convert_to_latin1 ************************************************
1584         
1585    Like libc strcat, but uses an utf8 string.
1586    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1587    invalid results.
1588
1589 *******************************************************************************/
1590
1591 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1592 {
1593         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1594 }
1595
1596
1597 /* utf_strcat_convert_to_latin1_classname **************************************
1598         
1599    Like libc strcat, but uses an utf8 string.
1600    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1601    invalid results.
1602
1603 *******************************************************************************/
1604
1605 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1606 {
1607         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1608 }
1609
1610
1611 /* utf_fprint_printable_ascii **************************************************
1612         
1613    Write utf symbol into file.
1614    Non-printable and non-ASCII characters are printed as '?'.
1615
1616 *******************************************************************************/
1617
1618 void utf_fprint_printable_ascii(FILE *file, utf *u)
1619 {
1620         char *endpos;                       /* points behind utf string           */
1621         char *utf_ptr;                      /* current position in utf text       */
1622
1623         if (!u)
1624                 return;
1625
1626         endpos = UTF_END(u);
1627         utf_ptr = u->text;
1628
1629         while (utf_ptr < endpos) { 
1630                 /* read next unicode character */                
1631                 u2 c = utf_nextu2(&utf_ptr);                            
1632
1633                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1634                 else fprintf(file, "?");
1635         }
1636 }
1637
1638
1639 /* utf_fprint_printable_ascii_classname ****************************************
1640         
1641    Write utf symbol into file with `/' converted to `.'.
1642    Non-printable and non-ASCII characters are printed as '?'.
1643
1644 *******************************************************************************/
1645
1646 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1647 {
1648         char *endpos;                       /* points behind utf string           */
1649         char *utf_ptr;                      /* current position in utf text       */
1650
1651     if (!u)
1652                 return;
1653
1654         endpos = UTF_END(u);
1655         utf_ptr = u->text;
1656
1657         while (utf_ptr < endpos) { 
1658                 /* read next unicode character */                
1659                 u2 c = utf_nextu2(&utf_ptr);                            
1660                 if (c == '/') c = '.';
1661
1662                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1663                 else fprintf(file, "?");
1664         }
1665 }
1666
1667
/* is_valid_utf ****************************************************************

   Check whether a byte range holds a well-formed UTF-8 string in the
   flavour accepted for Java classfiles.

   Accepted are ASCII bytes 0x01..0x7f as well as two- and three-byte
   sequences.  Rejected are a plain 0x00 byte, stray continuation bytes,
   lead bytes of sequences longer than three bytes, truncated sequences
   and the three-byte encoding of zero.  The two-byte encoding of zero
   (0xc0 0x80) is accepted.  Overlong encodings, surrogates and the code
   points 0xfffe/0xffff are deliberately NOT rejected.

   utf_ptr...points to first character
   end_pos...points after last character

   RETURN VALUE:
       true.....the range is valid (an empty range is valid, too)
       false....the range is invalid or end_pos lies before utf_ptr

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	const unsigned char *cur;           /* read cursor                        */
	int                  remaining;     /* bytes not yet consumed             */

	if (end_pos < utf_ptr)
		return false;

	cur       = (const unsigned char *) utf_ptr;
	remaining = end_pos - utf_ptr;

	while (remaining > 0) {
		unsigned char lead = *cur++;
		unsigned long value;            /* decoded code point                 */
		int           extra;            /* number of continuation bytes       */
		int           k;

		remaining--;

		if (lead == 0x00)
			return false;               /* 0x00 is not allowed                */

		if (lead < 0x80)
			continue;                   /* ASCII                              */

		/* Only two- and three-byte sequences are legal here.  Everything
		   else (a continuation byte in lead position, the lead bytes of
		   longer sequences, 0xfe and 0xff) is refused. */

		if ((lead & 0xe0) == 0xc0) {    /* 110x xxxx */
			extra = 1;
			value = lead & 0x1f;
		}
		else if ((lead & 0xf0) == 0xe0) {   /* 1110 xxxx */
			extra = 2;
			value = lead & 0x0f;
		}
		else {
			return false;
		}

		remaining -= extra;

		if (remaining < 0)
			return false;               /* missing bytes                      */

		for (k = 0; k < extra; k++) {
			unsigned char cont = *cur++;

			if ((cont & 0xc0) != 0x80)  /* 10xx xxxx */
				return false;

			value = (value << 6) | (cont & 0x3f);
		}

		/* Java special: zero may only be spelled with two bytes.  Other
		   overlong forms are tolerated, as are surrogates and the code
		   points 0xfffe and 0xffff. */

		if (value == 0 && extra != 1)
			return false;
	}

	return true;
}
1733
1734
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the two-byte encoding of zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

   RETURN VALUE:
       true.....the string is acceptable as a name
       false....the string is empty or contains a disallowed character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names               */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters        */

		/* 0xc0 0x80 is the encoding of zero.  The second byte is only
		   inspected when it lies inside the range, so that a trailing
		   0xc0 neither causes a read behind end_pos nor lets a byte
		   outside of the string decide about the result. */

		if (c == 0xc0 && utf_ptr < end_pos &&
			(unsigned char) *utf_ptr == 0x80)
			return false;               /* disallow zero                      */
	}

	return true;
}
1762
1763 bool is_valid_name_utf(utf *u)
1764 {
1765         return is_valid_name(u->text, UTF_END(u));
1766 }
1767
1768
1769 /* utf_show ********************************************************************
1770
1771    Writes the utf symbols in the utfhash to stdout and displays the
1772    number of external hash chains grouped according to the chainlength
1773    (for debugging purposes).
1774
1775 *******************************************************************************/
1776
1777 #if !defined(NDEBUG)
1778 void utf_show(void)
1779 {
1780
1781 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1782
1783         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1784         u4 max_chainlength = 0;      /* maximum length of the chains */
1785         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1786         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1787         u4 i;
1788
1789         printf("UTF-HASH:\n");
1790
1791         /* show element of utf-hashtable */
1792
1793         for (i = 0; i < hashtable_utf->size; i++) {
1794                 utf *u = hashtable_utf->ptr[i];
1795
1796                 if (u) {
1797                         printf("SLOT %d: ", (int) i);
1798
1799                         while (u) {
1800                                 printf("'");
1801                                 utf_display_printable_ascii(u);
1802                                 printf("' ");
1803                                 u = u->hashlink;
1804                         }       
1805                         printf("\n");
1806                 }
1807         }
1808
1809         printf("UTF-HASH: %d slots for %d entries\n", 
1810                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1811
1812         if (hashtable_utf->entries == 0)
1813                 return;
1814
1815         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1816
1817         for (i=0;i<CHAIN_LIMIT;i++)
1818                 chain_count[i]=0;
1819
1820         /* count numbers of hashchains according to their length */
1821         for (i=0; i<hashtable_utf->size; i++) {
1822                   
1823                 utf *u = (utf*) hashtable_utf->ptr[i];
1824                 u4 chain_length = 0;
1825
1826                 /* determine chainlength */
1827                 while (u) {
1828                         u = u->hashlink;
1829                         chain_length++;
1830                 }
1831
1832                 /* update sum of all chainlengths */
1833                 sum_chainlength+=chain_length;
1834
1835                 /* determine the maximum length of the chains */
1836                 if (chain_length>max_chainlength)
1837                         max_chainlength = chain_length;
1838
1839                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1840                 if (chain_length>=CHAIN_LIMIT) {
1841                         beyond_limit+=chain_length;
1842                         chain_length=CHAIN_LIMIT-1;
1843                 }
1844
1845                 /* update number of hashchains of current length */
1846                 chain_count[chain_length]++;
1847         }
1848
1849         /* display results */  
1850         for (i=1;i<CHAIN_LIMIT-1;i++) 
1851                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1852           
1853         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1854
1855
1856         printf("max. chainlength:%5d\n",max_chainlength);
1857
1858         /* avg. chainlength = sum of chainlengths / number of chains */
1859         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1860 }
1861 #endif /* !defined(NDEBUG) */
1862
1863
1864 /*
1865  * These are local overrides for various environment variables in Emacs.
1866  * Please do not remove this and leave it at the end of the file, where
1867  * Emacs will automagically detect them.
1868  * ---------------------------------------------------------------------
1869  * Local variables:
1870  * mode: c
1871  * indent-tabs-mode: t
1872  * c-basic-offset: 4
1873  * tab-width: 4
1874  * End:
1875  * vim:noexpandtab:sw=4:ts=4:
1876  */