* src/vm/suck.c (suck_add): Use vm_abort instead of
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf8.c - utf8 string functions
2
3    Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6    J. Wenninger, Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23    02110-1301, USA.
24
25    Contact: cacao@cacaojvm.org
26
27    Authors: Reinhard Grafl
28             Mark Probst
29             Andreas Krall
30             Christian Thalinger
31             Edwin Steiner
32
33    $Id: utf8.c 6286 2007-01-10 10:03:38Z twisti $
34
35 */
36
37
38 #include "config.h"
39
40 #include <string.h>
41 #include <assert.h>
42
43 #include "vm/types.h"
44
45 #include "mm/memory.h"
46
47 #if defined(ENABLE_THREADS)
48 # include "threads/native/lock.h"
49 #else
50 # include "threads/none/lock.h"
51 #endif
52
53 #include "vm/builtin.h"
54 #include "vm/exceptions.h"
55 #include "vm/hashtable.h"
56 #include "vm/options.h"
57 #include "vm/statistics.h"
58 #include "vm/stringlocal.h"
59 #include "vm/utf8.h"
60
61
62 /* global variables ***********************************************************/
63
64 /* hashsize must be power of 2 */
65
66 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
67
68 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
69
70
71 /* utf-symbols for pointer comparison of frequently used strings **************/
72
73 utf *utf_java_lang_Object;
74
75 utf *utf_java_lang_Class;
76 utf *utf_java_lang_ClassLoader;
77 utf *utf_java_lang_Cloneable;
78 utf *utf_java_lang_SecurityManager;
79 utf *utf_java_lang_String;
80 utf *utf_java_lang_System;
81 utf *utf_java_lang_ThreadGroup;
82 utf *utf_java_io_Serializable;
83
84 utf *utf_java_lang_Throwable;
85 utf *utf_java_lang_Error;
86 utf *utf_java_lang_LinkageError;
87 utf *utf_java_lang_NoClassDefFoundError;
88 utf *utf_java_lang_OutOfMemoryError;
89 utf *utf_java_lang_VirtualMachineError;
90
91 #if defined(ENABLE_JAVASE)
92 utf *utf_java_lang_AbstractMethodError;
93 utf *utf_java_lang_NoSuchMethodError;
94 #endif
95
96 #if defined(WITH_CLASSPATH_GNU)
97 utf *utf_java_lang_VMThrowable;
98 #endif
99
100 utf *utf_java_lang_Exception;
101 utf *utf_java_lang_ClassCastException;
102 utf *utf_java_lang_ClassNotFoundException;
103 utf *utf_java_lang_IllegalArgumentException;
104 utf *utf_java_lang_IllegalMonitorStateException;
105
106 utf *utf_java_lang_NullPointerException;
107
108 #if defined(ENABLE_JAVASE)
109 utf* utf_java_lang_Void;
110 #endif
111
112 utf* utf_java_lang_Boolean;
113 utf* utf_java_lang_Byte;
114 utf* utf_java_lang_Character;
115 utf* utf_java_lang_Short;
116 utf* utf_java_lang_Integer;
117 utf* utf_java_lang_Long;
118 utf* utf_java_lang_Float;
119 utf* utf_java_lang_Double;
120
121 #if defined(ENABLE_JAVASE)
122 utf *utf_java_lang_StackTraceElement;
123 utf *utf_java_lang_reflect_Constructor;
124 utf *utf_java_lang_reflect_Field;
125 utf *utf_java_lang_reflect_Method;
126 utf *utf_java_util_Vector;
127 #endif
128
129 utf *utf_InnerClasses;                  /* InnerClasses                       */
130 utf *utf_ConstantValue;                 /* ConstantValue                      */
131 utf *utf_Code;                          /* Code                               */
132 utf *utf_Exceptions;                    /* Exceptions                         */
133 utf *utf_LineNumberTable;               /* LineNumberTable                    */
134 utf *utf_SourceFile;                    /* SourceFile                         */
135
136 #if defined(ENABLE_JAVASE)
137 utf *utf_EnclosingMethod;
138 utf *utf_Signature;
139 utf *utf_RuntimeVisibleAnnotations;
140 utf *utf_StackMapTable;
141 #endif
142
143 utf *utf_init;                          /* <init>                             */
144 utf *utf_clinit;                        /* <clinit>                           */
145 utf *utf_clone;                         /* clone                              */
146 utf *utf_finalize;                      /* finalize                           */
147 utf *utf_run;                           /* run                                */
148
149 utf *utf_add;
150 utf *utf_remove;
151 utf *utf_addThread;
152 utf *utf_removeThread;
153 utf *utf_put;
154 utf *utf_get;
155 utf *utf_value;
156
157 utf *utf_fillInStackTrace;
158 utf *utf_getSystemClassLoader;
159 utf *utf_loadClass;
160 utf *utf_printStackTrace;
161
162 utf *utf_Z;                             /* Z                                  */
163 utf *utf_B;                             /* B                                  */
164 utf *utf_C;                             /* C                                  */
165 utf *utf_S;                             /* S                                  */
166 utf *utf_I;                             /* I                                  */
167 utf *utf_J;                             /* J                                  */
168 utf *utf_F;                             /* F                                  */
169 utf *utf_D;                             /* D                                  */
170
171 utf *utf_void__void;                    /* ()V                                */
172 utf *utf_boolean__void;                 /* (Z)V                               */
173 utf *utf_byte__void;                    /* (B)V                               */
174 utf *utf_char__void;                    /* (C)V                               */
175 utf *utf_short__void;                   /* (S)V                               */
176 utf *utf_int__void;                     /* (I)V                               */
177 utf *utf_long__void;                    /* (J)V                               */
178 utf *utf_float__void;                   /* (F)V                               */
179 utf *utf_double__void;                  /* (D)V                               */
180
181 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
182 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
183 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
184 utf *utf_java_lang_Object__java_lang_Object;
185 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
186 utf *utf_java_lang_String__java_lang_Class;
187 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
188 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
189
190 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
191 utf *utf_null;
192 utf *array_packagename;
193
194
195 /* utf_init ********************************************************************
196
197    Initializes the utf8 subsystem.
198
199 *******************************************************************************/
200
201 bool utf8_init(void)
202 {
203         /* create utf8 hashtable */
204
205         hashtable_utf = NEW(hashtable);
206
207         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
208
209 #if defined(ENABLE_STATISTICS)
210         if (opt_stat)
211                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
212 #endif
213
214         /* create utf-symbols for pointer comparison of frequently used strings */
215
216         utf_java_lang_Object           = utf_new_char("java/lang/Object");
217
218         utf_java_lang_Class            = utf_new_char("java/lang/Class");
219         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
220         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
221         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
222         utf_java_lang_String           = utf_new_char("java/lang/String");
223         utf_java_lang_System           = utf_new_char("java/lang/System");
224         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
225         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
226
227         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
228         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
229
230         utf_java_lang_LinkageError =
231                 utf_new_char(string_java_lang_LinkageError);
232
233         utf_java_lang_NoClassDefFoundError =
234                 utf_new_char(string_java_lang_NoClassDefFoundError);
235
236         utf_java_lang_OutOfMemoryError =
237                 utf_new_char(string_java_lang_OutOfMemoryError);
238
239         utf_java_lang_VirtualMachineError =
240                 utf_new_char(string_java_lang_VirtualMachineError);
241
242 #if defined(ENABLE_JAVASE)
243         utf_java_lang_AbstractMethodError =
244                 utf_new_char(string_java_lang_AbstractMethodError);
245
246         utf_java_lang_NoSuchMethodError =
247                 utf_new_char(string_java_lang_NoSuchMethodError);
248 #endif
249
250 #if defined(WITH_CLASSPATH_GNU)
251         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
252 #endif
253
254         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
255
256         utf_java_lang_ClassCastException =
257                 utf_new_char(string_java_lang_ClassCastException);
258
259         utf_java_lang_ClassNotFoundException =
260                 utf_new_char(string_java_lang_ClassNotFoundException);
261
262         utf_java_lang_IllegalArgumentException =
263                 utf_new_char(string_java_lang_IllegalArgumentException);
264
265         utf_java_lang_IllegalMonitorStateException =
266                 utf_new_char(string_java_lang_IllegalMonitorStateException);
267
268         utf_java_lang_NullPointerException =
269                 utf_new_char(string_java_lang_NullPointerException);
270
271 #if defined(ENABLE_JAVASE)
272         utf_java_lang_Void             = utf_new_char("java/lang/Void");
273 #endif
274
275         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
276         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
277         utf_java_lang_Character        = utf_new_char("java/lang/Character");
278         utf_java_lang_Short            = utf_new_char("java/lang/Short");
279         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
280         utf_java_lang_Long             = utf_new_char("java/lang/Long");
281         utf_java_lang_Float            = utf_new_char("java/lang/Float");
282         utf_java_lang_Double           = utf_new_char("java/lang/Double");
283
284 #if defined(ENABLE_JAVASE)
285         utf_java_lang_StackTraceElement =
286                 utf_new_char("java/lang/StackTraceElement");
287
288         utf_java_lang_reflect_Constructor =
289                 utf_new_char("java/lang/reflect/Constructor");
290
291         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
292         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
293         utf_java_util_Vector           = utf_new_char("java/util/Vector");
294 #endif
295
296         utf_InnerClasses               = utf_new_char("InnerClasses");
297         utf_ConstantValue              = utf_new_char("ConstantValue");
298         utf_Code                       = utf_new_char("Code");
299         utf_Exceptions                 = utf_new_char("Exceptions");
300         utf_LineNumberTable            = utf_new_char("LineNumberTable");
301         utf_SourceFile                 = utf_new_char("SourceFile");
302
303 #if defined(ENABLE_JAVASE)
304         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
305         utf_Signature                  = utf_new_char("Signature");
306         utf_RuntimeVisibleAnnotations  = utf_new_char("RuntimeVisibleAnnotations");
307         utf_StackMapTable              = utf_new_char("StackMapTable");
308 #endif
309
310         utf_init                           = utf_new_char("<init>");
311         utf_clinit                         = utf_new_char("<clinit>");
312         utf_clone                      = utf_new_char("clone");
313         utf_finalize                   = utf_new_char("finalize");
314         utf_run                        = utf_new_char("run");
315
316         utf_add                        = utf_new_char("add");
317         utf_remove                     = utf_new_char("remove");
318         utf_addThread                  = utf_new_char("addThread");
319         utf_removeThread               = utf_new_char("removeThread");
320         utf_put                        = utf_new_char("put");
321         utf_get                        = utf_new_char("get");
322         utf_value                      = utf_new_char("value");
323
324         utf_printStackTrace            = utf_new_char("printStackTrace");
325         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
326         utf_loadClass                  = utf_new_char("loadClass");
327         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
328
329         utf_Z                          = utf_new_char("Z");
330         utf_B                          = utf_new_char("B");
331         utf_C                          = utf_new_char("C");
332         utf_S                          = utf_new_char("S");
333         utf_I                          = utf_new_char("I");
334         utf_J                          = utf_new_char("J");
335         utf_F                          = utf_new_char("F");
336         utf_D                          = utf_new_char("D");
337
338         utf_void__void                 = utf_new_char("()V");
339         utf_boolean__void              = utf_new_char("(Z)V");
340         utf_byte__void                 = utf_new_char("(B)V");
341         utf_char__void                 = utf_new_char("(C)V");
342         utf_short__void                = utf_new_char("(S)V");
343         utf_int__void                  = utf_new_char("(I)V");
344         utf_long__void                 = utf_new_char("(J)V");
345         utf_float__void                = utf_new_char("(F)V");
346         utf_double__void               = utf_new_char("(D)V");
347         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
348         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
349
350         utf_void__java_lang_ClassLoader =
351                 utf_new_char("()Ljava/lang/ClassLoader;");
352
353         utf_java_lang_Object__java_lang_Object =
354                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
355
356         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
357
358         utf_java_lang_String__java_lang_Class =
359                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
360
361         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
362         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
363
364         utf_null                       = utf_new_char("null");
365         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
366         array_packagename              = utf_new_char("\t<the array package>");
367
368         /* everything's ok */
369
370         return true;
371 }
372
373
/* utf_hashkey *****************************************************************

   Computes the hashkey of a utf-text from a small, fixed selection of
   its bytes (at most nine of them).

   Texts of up to 8 bytes contribute every byte.  Texts of 9 to 15
   bytes contribute a hand-picked subset that depends on the length.
   For longer texts 3 bytes are taken from the beginning, 2 from the
   middle and 3 from the end.

   Every selected byte is converted to u4, shifted left by a
   per-position amount and XORed into the key.  The byte/shift pairs
   are the ones the former nbs()/fbs() expressions yield when their
   operands are evaluated from left to right.  Those expressions
   incremented `text' several times without an intervening sequence
   point, which is undefined behaviour; indexing the text explicitly
   makes the result independent of the compiler's evaluation order.

   IN:
      text.........the utf-text (need not be zero-terminated)
      length.......number of bytes of the text

   OUT:
      the hashkey, 0 for the empty text

*******************************************************************************/

/* Kept because they used to stay defined after this function.  nbs()
   modifies `text' and must not be used more than once per expression. */

#define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
#define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */

/* byte at index idx of the text, converted to u4 and shifted left by val */

#define UTF_HASHKEY_BYTE(idx, val) ((u4) text[(idx)] << (val))

u4 utf_hashkey(const char *text, u4 length)
{
	u4 mid;                             /* index of the first "middle" byte   */

	switch (length) {
	case 0: /* empty string */
		return 0;

	case 1:
		return UTF_HASHKEY_BYTE(0, 0);

	case 2:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 3);

	case 3:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 3)
			^ UTF_HASHKEY_BYTE(2, 5);

	case 4:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 2)
			^ UTF_HASHKEY_BYTE(2, 4) ^ UTF_HASHKEY_BYTE(3, 6);

	case 5:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 2)
			^ UTF_HASHKEY_BYTE(2, 3) ^ UTF_HASHKEY_BYTE(3, 4)
			^ UTF_HASHKEY_BYTE(4, 6);

	case 6:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
			^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(3, 3)
			^ UTF_HASHKEY_BYTE(4, 5) ^ UTF_HASHKEY_BYTE(5, 6);

	case 7:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
			^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(3, 3)
			^ UTF_HASHKEY_BYTE(4, 4) ^ UTF_HASHKEY_BYTE(5, 5)
			^ UTF_HASHKEY_BYTE(6, 6);

	case 8:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
			^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(3, 3)
			^ UTF_HASHKEY_BYTE(4, 4) ^ UTF_HASHKEY_BYTE(5, 5)
			^ UTF_HASHKEY_BYTE(6, 6) ^ UTF_HASHKEY_BYTE(7, 7);

	case 9: /* byte 3 is skipped */
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
			^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(4, 4)
			^ UTF_HASHKEY_BYTE(5, 5) ^ UTF_HASHKEY_BYTE(6, 6)
			^ UTF_HASHKEY_BYTE(7, 7) ^ UTF_HASHKEY_BYTE(8, 8);

	case 10: /* bytes 1 and 5 are skipped */
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(2, 2)
			^ UTF_HASHKEY_BYTE(3, 3) ^ UTF_HASHKEY_BYTE(4, 4)
			^ UTF_HASHKEY_BYTE(6, 6) ^ UTF_HASHKEY_BYTE(7, 7)
			^ UTF_HASHKEY_BYTE(8, 8) ^ UTF_HASHKEY_BYTE(9, 9);

	case 11: /* bytes 1 and 5 are skipped */
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(2, 2)
			^ UTF_HASHKEY_BYTE(3, 3) ^ UTF_HASHKEY_BYTE(4, 4)
			^ UTF_HASHKEY_BYTE(6, 6) ^ UTF_HASHKEY_BYTE(7, 7)
			^ UTF_HASHKEY_BYTE(8, 8) ^ UTF_HASHKEY_BYTE(9, 9)
			^ UTF_HASHKEY_BYTE(10, 10);

	case 12:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(3, 2)
			^ UTF_HASHKEY_BYTE(4, 3) ^ UTF_HASHKEY_BYTE(6, 5)
			^ UTF_HASHKEY_BYTE(7, 6) ^ UTF_HASHKEY_BYTE(8, 7)
			^ UTF_HASHKEY_BYTE(10, 9) ^ UTF_HASHKEY_BYTE(11, 10);

	case 13:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
			^ UTF_HASHKEY_BYTE(3, 3) ^ UTF_HASHKEY_BYTE(4, 4)
			^ UTF_HASHKEY_BYTE(7, 7) ^ UTF_HASHKEY_BYTE(8, 8)
			^ UTF_HASHKEY_BYTE(11, 9) ^ UTF_HASHKEY_BYTE(12, 10);

	case 14:
	case 15: /* same selection as for 14 bytes, byte 14 is not used */
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(3, 3)
			^ UTF_HASHKEY_BYTE(4, 4) ^ UTF_HASHKEY_BYTE(7, 7)
			^ UTF_HASHKEY_BYTE(8, 8) ^ UTF_HASHKEY_BYTE(11, 9)
			^ UTF_HASHKEY_BYTE(12, 10) ^ UTF_HASHKEY_BYTE(13, 11);

	default:
		mid = length / 2;

		/* 3 bytes from the beginning, 2 from the middle, 3 from the end */

		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(3, 3)
			^ UTF_HASHKEY_BYTE(4, 4)
			^ UTF_HASHKEY_BYTE(mid, 5) ^ UTF_HASHKEY_BYTE(mid + 3, 6)
			^ UTF_HASHKEY_BYTE(length - 4, 7)
			^ UTF_HASHKEY_BYTE(length - 2, 10)
			^ UTF_HASHKEY_BYTE(length - 1, 11);
	}
}

#undef UTF_HASHKEY_BYTE
496
/* utf_full_hashkey ************************************************************

   Computes a hash value from every byte of the given text.

   This is Bob Jenkins' "One-at-a-time" hash, see
   http://burtleburtle.net/bob/hash/doobs.html.  The bytes are read as
   unsigned values.

   IN:
      text.........the text to hash (need not be zero-terminated)
      length.......number of bytes of the text

   OUT:
      the hash value, 0 for the empty text

*******************************************************************************/

u4 utf_full_hashkey(const char *text, u4 length)
{
	const unsigned char *bytes = (const unsigned char *) text;
	u4                   key   = 0;
	u4                   n;

	/* mix in one byte per round */

	for (n = 0; n < length; n++) {
		key += bytes[n];
		key += (key << 10);
		key ^= (key >> 6);
	}

	/* final avalanche */

	key += (key << 3);
	key ^= (key >> 11);
	key += (key << 15);

	return key;
}
525
526 /* unicode_hashkey *************************************************************
527
528    Compute the hashkey of a unicode string.
529
530 *******************************************************************************/
531
532 u4 unicode_hashkey(u2 *text, u2 len)
533 {
534         return utf_hashkey((char *) text, len);
535 }
536
537
538 /* utf_new *********************************************************************
539
540    Creates a new utf-symbol, the text of the symbol is passed as a
541    u1-array. The function searches the utf-hashtable for a utf-symbol
542    with this text. On success the element returned, otherwise a new
543    hashtable element is created.
544
545    If the number of entries in the hashtable exceeds twice the size of
546    the hashtable slots a reorganization of the hashtable is done and
547    the utf symbols are copied to a new hashtable with doubled size.
548
549 *******************************************************************************/
550
551 utf *utf_new(const char *text, u2 length)
552 {
553         u4 key;                             /* hashkey computed from utf-text     */
554         u4 slot;                            /* slot in hashtable                  */
555         utf *u;                             /* hashtable element                  */
556         u2 i;
557
558         LOCK_MONITOR_ENTER(hashtable_utf->header);
559
560 #if defined(ENABLE_STATISTICS)
561         if (opt_stat)
562                 count_utf_new++;
563 #endif
564
565         key  = utf_hashkey(text, length);
566         slot = key & (hashtable_utf->size - 1);
567         u    = hashtable_utf->ptr[slot];
568
569         /* search external hash chain for utf-symbol */
570
571         while (u) {
572                 if (u->blength == length) {
573                         /* compare text of hashtable elements */
574
575                         for (i = 0; i < length; i++)
576                                 if (text[i] != u->text[i])
577                                         goto nomatch;
578                         
579 #if defined(ENABLE_STATISTICS)
580                         if (opt_stat)
581                                 count_utf_new_found++;
582 #endif
583
584                         /* symbol found in hashtable */
585
586                         LOCK_MONITOR_EXIT(hashtable_utf->header);
587
588                         return u;
589                 }
590
591         nomatch:
592                 u = u->hashlink; /* next element in external chain */
593         }
594
595 #if defined(ENABLE_STATISTICS)
596         if (opt_stat)
597                 count_utf_len += sizeof(utf) + length + 1;
598 #endif
599
600         /* location in hashtable found, create new utf element */
601         u = NEW(utf);
602         u->blength  = length;               /* length in bytes of utfstring       */
603         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
604         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
605
606         memcpy(u->text, text, length);      /* copy utf-text                      */
607         u->text[length] = '\0';
608
609         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
610         hashtable_utf->entries++;           /* update number of entries           */
611
612         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
613
614         /* reorganization of hashtable, average length of the external
615            chains is approx. 2 */
616
617                 hashtable *newhash;                              /* the new hashtable */
618                 u4         i;
619                 utf       *u;
620                 utf       *nextu;
621                 u4         slot;
622
623                 /* create new hashtable, double the size */
624
625                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
626
627 #if defined(ENABLE_STATISTICS)
628                 if (opt_stat)
629                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
630 #endif
631
632                 /* transfer elements to new hashtable */
633
634                 for (i = 0; i < hashtable_utf->size; i++) {
635                         u = hashtable_utf->ptr[i];
636
637                         while (u) {
638                                 nextu = u->hashlink;
639                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
640                                                 
641                                 u->hashlink = (utf *) newhash->ptr[slot];
642                                 newhash->ptr[slot] = u;
643
644                                 /* follow link in external hash chain */
645
646                                 u = nextu;
647                         }
648                 }
649         
650                 /* dispose old table */
651
652                 hashtable_free(hashtable_utf);
653
654                 hashtable_utf = newhash;
655         }
656
657         LOCK_MONITOR_EXIT(hashtable_utf->header);
658
659         return u;
660 }
661
662
663 /* utf_new_u2 ******************************************************************
664
665    Make utf symbol from u2 array, if isclassname is true '.' is
666    replaced by '/'.
667
668 *******************************************************************************/
669
670 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
671 {
672         char *buffer;                   /* memory buffer for  unicode characters  */
673         char *pos;                      /* pointer to current position in buffer  */
674         u4 left;                        /* unicode characters left                */
675         u4 buflength;                   /* utf length in bytes of the u2 array    */
676         utf *result;                    /* resulting utf-string                   */
677         int i;          
678
679         /* determine utf length in bytes and allocate memory */
680
681         buflength = u2_utflength(unicode_pos, unicode_length); 
682         buffer    = MNEW(char, buflength);
683  
684         left = buflength;
685         pos  = buffer;
686
687         for (i = 0; i++ < unicode_length; unicode_pos++) {
688                 /* next unicode character */
689                 u2 c = *unicode_pos;
690                 
691                 if ((c != 0) && (c < 0x80)) {
692                         /* 1 character */       
693                         left--;
694                 if ((int) left < 0) break;
695                         /* convert classname */
696                         if (isclassname && c == '.')
697                                 *pos++ = '/';
698                         else
699                                 *pos++ = (char) c;
700
701                 } else if (c < 0x800) {             
702                         /* 2 characters */                              
703                 unsigned char high = c >> 6;
704                 unsigned char low  = c & 0x3F;
705                         left = left - 2;
706                 if ((int) left < 0) break;
707                 *pos++ = high | 0xC0; 
708                 *pos++ = low  | 0x80;     
709
710                 } else {         
711                 /* 3 characters */                              
712                 char low  = c & 0x3f;
713                 char mid  = (c >> 6) & 0x3F;
714                 char high = c >> 12;
715                         left = left - 3;
716                 if ((int) left < 0) break;
717                 *pos++ = high | 0xE0; 
718                 *pos++ = mid  | 0x80;  
719                 *pos++ = low  | 0x80;   
720                 }
721         }
722         
723         /* insert utf-string into symbol-table */
724         result = utf_new(buffer,buflength);
725
726         MFREE(buffer, char, buflength);
727
728         return result;
729 }
730
731
732 /* utf_new_char ****************************************************************
733
734    Creates a new utf symbol, the text for this symbol is passed as a
735    c-string ( = char* ).
736
737 *******************************************************************************/
738
739 utf *utf_new_char(const char *text)
740 {
741         return utf_new(text, strlen(text));
742 }
743
744
745 /* utf_new_char_classname ******************************************************
746
747    Creates a new utf symbol, the text for this symbol is passed as a
748    c-string ( = char* ) "." characters are going to be replaced by
749    "/". Since the above function is used often, this is a separte
750    function, instead of an if.
751
752 *******************************************************************************/
753
754 utf *utf_new_char_classname(const char *text)
755 {
756         if (strchr(text, '.')) {
757                 char *txt = strdup(text);
758                 char *end = txt + strlen(txt);
759                 char *c;
760                 utf *tmpRes;
761
762                 for (c = txt; c < end; c++)
763                         if (*c == '.') *c = '/';
764
765                 tmpRes = utf_new(txt, strlen(txt));
766                 FREE(txt, 0);
767
768                 return tmpRes;
769
770         } else
771                 return utf_new(text, strlen(text));
772 }
773
774
/* utf_nextu2 ******************************************************************

   Reads the next unicode character from the utf string and advances
   the utf-string pointer past the bytes that were consumed.

   The upper four bits of the first byte select the form:
      0xC, 0xD.....2 byte sequence
      0xE..........3 byte sequence
      other........the byte itself is returned, 1 byte is consumed

   If the second byte of a multi-byte sequence is not a continuation
   byte (10xxxxxx), 0 is returned and the pointer is NOT advanced.  If
   only the third byte of a 3 byte sequence is malformed, 0 is returned
   and 2 bytes are consumed.

   CAUTION: This function is unsafe for input that was not checked
            by is_valid_utf!

   IN:
      utf_ptr......address of the current position in the utf text

   OUT:
      the decoded character; *utf_ptr is updated

*******************************************************************************/

u2 utf_nextu2(char **utf_ptr)
{
	unsigned char *src      = (unsigned char *) (*utf_ptr);
	unsigned char  b1       = src[0];   /* bytes of the encoded character     */
	unsigned char  b2;
	unsigned char  b3;
	int            lead     = b1 >> 4;  /* selects the form of the sequence   */
	int            consumed = 0;        /* number of bytes to skip            */
	u2             value    = 0;        /* decoded unicode character          */

	if ((lead != 0xC) && (lead != 0xD) && (lead != 0xE)) {
		/* 1 byte */

		(*utf_ptr)++;

		return (u2) b1;
	}

	b2 = src[1];

	if ((b2 & 0xC0) == 0x80) {
		if (lead != 0xE) {
			/* 2 bytes */

			value    = ((b1 & 0x1F) << 6) + (b2 & 0x3F);
			consumed = 2;
		}
		else {
			/* 2 or 3 bytes */

			b3 = src[2];

			if ((b3 & 0xC0) == 0x80) {
				value    = ((((b1 & 0x0F) << 6) + (b2 & 0x3F)) << 6)
					+ (b3 & 0x3F);
				consumed = 3;
			}
			else {
				consumed = 2;
			}
		}
	}

	/* update position in utf-text */

	*utf_ptr = (char *) (src + consumed);

	return value;
}
829
830
831 /* utf_bytes *******************************************************************
832
833    Determine number of bytes (aka. octets) in the utf string.
834
835    IN:
836       u............utf string
837
838    OUT:
839       The number of octets of this utf string.
840           There is _no_ terminating zero included in this count.
841
842 *******************************************************************************/
843
844 u4 utf_bytes(utf *u)
845 {
846         return u->blength;
847 }
848
849 /* utf_get_number_of_u2s_for_buffer ********************************************
850
851    Determine number of UTF-16 u2s in the given UTF-8 buffer
852
853    CAUTION: This function is unsafe for input that was not checked 
854             by is_valid_utf!
855
856    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
857    to an array of u2s (UTF-16) and want to know how many of them you will get.
858    All other uses of this function are probably wrong.
859
860    IN:
861       buffer........points to first char in buffer
862           blength.......number of _bytes_ in the buffer
863
864    OUT:
865       the number of u2s needed to hold this string in UTF-16 encoding.
866           There is _no_ terminating zero included in this count.
867
868    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
869    exception.
870
871 *******************************************************************************/
872
873 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
874 {
875         const char *endpos;                 /* points behind utf string           */
876         const char *utf_ptr;                /* current position in utf text       */
877         u4 len = 0;                         /* number of unicode characters       */
878
879         utf_ptr = buffer;
880         endpos = utf_ptr + blength;
881
882         while (utf_ptr < endpos) {
883                 len++;
884                 /* next unicode character */
885                 utf_nextu2((char **)&utf_ptr);
886         }
887
888         assert(utf_ptr == endpos);
889
890         return len;
891 }
892
893
894 /* utf_get_number_of_u2s *******************************************************
895
896    Determine number of UTF-16 u2s in the utf string.
897
898    CAUTION: This function is unsafe for input that was not checked 
899             by is_valid_utf!
900
901    CAUTION: Use this function *only* when you want to convert a utf string
902    to an array of u2s and want to know how many of them you will get.
903    All other uses of this function are probably wrong.
904
905    IN:
906       u............utf string
907
908    OUT:
909       the number of u2s needed to hold this string in UTF-16 encoding.
910           There is _no_ terminating zero included in this count.
911           XXX 0 if a NullPointerException has been thrown (see below)
912
913 *******************************************************************************/
914
915 u4 utf_get_number_of_u2s(utf *u)
916 {
917         char *endpos;                       /* points behind utf string           */
918         char *utf_ptr;                      /* current position in utf text       */
919         u4 len = 0;                         /* number of unicode characters       */
920
921         /* XXX this is probably not checked by most callers! Review this after */
922         /* the invalid uses of this function have been eliminated */
923         if (u == NULL) {
924                 exceptions_throw_nullpointerexception();
925                 return 0;
926         }
927
928         endpos = UTF_END(u);
929         utf_ptr = u->text;
930
931         while (utf_ptr < endpos) {
932                 len++;
933                 /* next unicode character */
934                 utf_nextu2(&utf_ptr);
935         }
936
937         if (utf_ptr != endpos) {
938                 /* string ended abruptly */
939                 exceptions_throw_internalerror("Illegal utf8 string");
940                 return NULL;
941         }
942
943         return len;
944 }
945
946
/* utf8_safe_number_of_u2s *****************************************************

   Count the UTF-16 u2s that utf8_safe_convert_to_u2s will write when
   decoding the given UTF-8 string. Every invalid sequence counts as one
   u2 (the U+fffd replacement character).

   This function is safe even for invalid UTF-8 strings.

   IN:
      text..........zero-terminated(!) UTF-8 string (may be invalid)
                    must NOT be NULL
      nbytes........strlen(text). (This is needed to completely emulate
                    the RI).

   OUT:
      the number of u2s needed to hold this string in UTF-16 encoding.
      There is _no_ terminating zero included in this count.

*******************************************************************************/

s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
	const unsigned char *cur;           /* next byte to be read               */
	const unsigned char *limit;         /* text + nbytes                      */
	s4 count;                           /* u2s counted so far                 */
	s4 lead;                            /* first byte of current sequence     */
	s4 expected;                        /* continuation bytes the lead wants  */
	s4 consumed;                        /* continuation bytes really found    */
	s4 codepoint;                       /* decoded value of a 4-byte sequence */

	assert(text);
	assert(nbytes >= 0);

	count = 0;
	cur   = (const unsigned char *) text;
	limit = cur + nbytes;

	/* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */

	for (;;) {
		lead = *cur++;

		/* NUL terminates the string */

		if (lead == 0)
			break;

		if (lead & 0x80) {
			/* highest bit set, non-ASCII character */

			if ((lead & 0xe0) == 0xc0)
				expected = 1;           /* 110..... 10......                  */
			else if ((lead & 0xf0) == 0xe0)
				expected = 2;           /* 1110.... 10...... 10......         */
			else if ((lead & 0xf8) == 0xf0)
				expected = 3;           /* 11110... and three 10......        */
			else if ((lead & 0xfc) == 0xf8)
				expected = 4;           /* invalid 5-byte form                */
			else if ((lead & 0xfe) == 0xfc)
				expected = 5;           /* invalid 6-byte form                */
			else
				expected = 0;           /* stray continuation / invalid lead  */

			/* a sequence of 3 or more bytes that cannot fit into the
			   remaining text is invalid and ends the decoding */

			if ((expected >= 2) && (cur + expected > limit))
				return count + 1;

			/* swallow continuation bytes, but stop in front of the first
			   byte that is not one, so it starts the next character */

			for (consumed = 0;
				 (consumed < expected) && ((*cur & 0xc0) == 0x80);
				 consumed++)
				cur++;

			if ((expected == 3) && (consumed == 3)) {
				/* complete 4-byte sequence: check the decoded value */

				codepoint = ((lead    & 0x07) << 18)
						  | ((cur[-3] & 0x3f) << 12)
						  | ((cur[-2] & 0x3f) <<  6)
						  | ((cur[-1] & 0x3f)      );

				/* values above 0x10FFFF are invalid (one u2), values up
				   to 0xFFFF fit into one u2; everything in between needs
				   a surrogate pair, i.e. one additional u2 */

				if ((codepoint > 0xFFFF) && (codepoint <= 0x10FFFF))
					count++;
			}
		}

		/* ASCII, decoded character or replacement character */

		count++;
	}

	return count;
}
1083
1084
/* utf8_safe_convert_to_u2s ****************************************************

   Decode the given UTF-8 string into a caller-supplied array of UTF-16
   u2s. Every invalid sequence is replaced by one U+fffd replacement
   character. No terminating zero is written.

   This function is safe even for invalid UTF-8 strings.

   IN:
      text..........zero-terminated(!) UTF-8 string (may be invalid)
                    must NOT be NULL
      nbytes........strlen(text). (This is needed to completely emulate
                    the RI).
      buffer........a preallocated array of u2s to receive the decoded
                    string. Use utf8_safe_number_of_u2s to get the
                    required number of u2s for allocating this.

*******************************************************************************/

#define UNICODE_REPLACEMENT  0xfffd

void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
	const unsigned char *cur;           /* next byte to be read               */
	const unsigned char *limit;         /* text + nbytes                      */
	s4 lead;                            /* first byte of current sequence     */
	s4 expected;                        /* continuation bytes the lead wants  */
	s4 consumed;                        /* continuation bytes really found    */
	s4 codepoint;                       /* decoded value of a 4-byte sequence */

	assert(text);
	assert(nbytes >= 0);

	cur   = (const unsigned char *) text;
	limit = cur + nbytes;

	/* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */

	for (;;) {
		lead = *cur++;

		/* NUL terminates the string */

		if (lead == 0)
			break;

		/* ASCII character, common case */

		if ((lead & 0x80) == 0) {
			*buffer++ = lead;
			continue;
		}

		/* highest bit set, non-ASCII character */

		if ((lead & 0xe0) == 0xc0)
			expected = 1;               /* 110..... 10......                  */
		else if ((lead & 0xf0) == 0xe0)
			expected = 2;               /* 1110.... 10...... 10......         */
		else if ((lead & 0xf8) == 0xf0)
			expected = 3;               /* 11110... and three 10......        */
		else if ((lead & 0xfc) == 0xf8)
			expected = 4;               /* invalid 5-byte form                */
		else if ((lead & 0xfe) == 0xfc)
			expected = 5;               /* invalid 6-byte form                */
		else
			expected = 0;               /* stray continuation / invalid lead  */

		/* a sequence of 3 or more bytes that cannot fit into the
		   remaining text is invalid and ends the decoding */

		if ((expected >= 2) && (cur + expected > limit)) {
			*buffer++ = UNICODE_REPLACEMENT;
			return;
		}

		/* swallow continuation bytes, but stop in front of the first byte
		   that is not one, so it starts the next character */

		for (consumed = 0;
			 (consumed < expected) && ((*cur & 0xc0) == 0x80);
			 consumed++)
			cur++;

		if (consumed != expected) {
			/* sequence was cut short */

			*buffer++ = UNICODE_REPLACEMENT;
			continue;
		}

		switch (expected) {
		case 1:
			/* valid 2-byte UTF-8 */

			*buffer++ = ((lead    & 0x1f) << 6)
					  | ((cur[-1] & 0x3f)     );
			break;

		case 2:
			/* valid 3-byte UTF-8 */

			*buffer++ = ((lead    & 0x0f) << 12)
					  | ((cur[-2] & 0x3f) <<  6)
					  | ((cur[-1] & 0x3f)      );
			break;

		case 3:
			/* valid 4-byte UTF-8? */

			codepoint = ((lead    & 0x07) << 18)
					  | ((cur[-3] & 0x3f) << 12)
					  | ((cur[-2] & 0x3f) <<  6)
					  | ((cur[-1] & 0x3f)      );

			if (codepoint > 0x10FFFF) {
				*buffer++ = UNICODE_REPLACEMENT;
			}
			else if (codepoint > 0xFFFF) {
				/* we need surrogates */

				*buffer++ = 0xd800 | ((codepoint >> 10) - 0x40);
				*buffer++ = 0xdc00 | (codepoint & 0x03ff);
			}
			else {
				/* 16bit suffice */

				*buffer++ = codepoint;
			}
			break;

		default:
			/* invalid lead byte, or one of the invalid 5- and 6-byte
			   forms */

			*buffer++ = UNICODE_REPLACEMENT;
			break;
		}
	}
}
1248
1249
/* u2_utflength ****************************************************************

   Compute how many bytes the given u2 array occupies when it is encoded
   as a utf string.

   IN:
      text.........points to the first u2
      u2_length....number of u2s to look at

   OUT:
      the utf length in bytes: 1 byte for 0x0001..0x007f, 2 bytes for
      0x0000 and 0x0080..0x07ff, 3 bytes for everything else.

*******************************************************************************/

u4 u2_utflength(u2 *text, u4 u2_length)
{
	const u2 *cursor = text;                /* next character to inspect      */
	const u2 *stop   = text + u2_length;    /* points behind the last one     */
	u4        nbytes = 0;                   /* utf length in bytes            */

	while (cursor != stop) {
		u2 ch = *cursor++;

		if (ch >= 0x800)
			nbytes += 3;
		else if ((ch >= 0x80) || (ch == 0))
			nbytes += 2;                    /* note: NUL takes two bytes      */
		else
			nbytes += 1;
	}

	return nbytes;
}
1277
1278
1279 /* utf_copy ********************************************************************
1280
1281    Copy the given utf string byte-for-byte to a buffer.
1282
1283    IN:
1284       buffer.......the buffer
1285           u............the utf string
1286
1287 *******************************************************************************/
1288
1289 void utf_copy(char *buffer, utf *u)
1290 {
1291         /* our utf strings are zero-terminated (done by utf_new) */
1292         MCOPY(buffer, u->text, char, u->blength + 1);
1293 }
1294
1295
1296 /* utf_cat *********************************************************************
1297
1298    Append the given utf string byte-for-byte to a buffer.
1299
1300    IN:
1301       buffer.......the buffer
1302           u............the utf string
1303
1304 *******************************************************************************/
1305
1306 void utf_cat(char *buffer, utf *u)
1307 {
1308         /* our utf strings are zero-terminated (done by utf_new) */
1309         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1310 }
1311
1312
1313 /* utf_copy_classname **********************************************************
1314
1315    Copy the given utf classname byte-for-byte to a buffer.
1316    '/' is replaced by '.'
1317
1318    IN:
1319       buffer.......the buffer
1320           u............the utf string
1321
1322 *******************************************************************************/
1323
1324 void utf_copy_classname(char *buffer, utf *u)
1325 {
1326         char *bufptr;
1327         char *srcptr;
1328         char *endptr;
1329         char ch;
1330
1331         bufptr = buffer;
1332         srcptr = u->text;
1333         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1334
1335         while (srcptr != endptr) {
1336                 ch = *srcptr++;
1337                 if (ch == '/')
1338                         ch = '.';
1339                 *bufptr++ = ch;
1340         }
1341 }
1342
1343
1344 /* utf_cat *********************************************************************
1345
1346    Append the given utf classname byte-for-byte to a buffer.
1347    '/' is replaced by '.'
1348
1349    IN:
1350       buffer.......the buffer
1351           u............the utf string
1352
1353 *******************************************************************************/
1354
1355 void utf_cat_classname(char *buffer, utf *u)
1356 {
1357         utf_copy_classname(buffer + strlen(buffer), u);
1358 }
1359
1360 /* utf_display_printable_ascii *************************************************
1361
1362    Write utf symbol to stdout (for debugging purposes).
1363    Non-printable and non-ASCII characters are printed as '?'.
1364
1365 *******************************************************************************/
1366
1367 void utf_display_printable_ascii(utf *u)
1368 {
1369         char *endpos;                       /* points behind utf string           */
1370         char *utf_ptr;                      /* current position in utf text       */
1371
1372         if (u == NULL) {
1373                 printf("NULL");
1374                 fflush(stdout);
1375                 return;
1376         }
1377
1378         endpos = UTF_END(u);
1379         utf_ptr = u->text;
1380
1381         while (utf_ptr < endpos) {
1382                 /* read next unicode character */
1383
1384                 u2 c = utf_nextu2(&utf_ptr);
1385
1386                 if ((c >= 32) && (c <= 127))
1387                         printf("%c", c);
1388                 else
1389                         printf("?");
1390         }
1391
1392         fflush(stdout);
1393 }
1394
1395
1396 /* utf_display_printable_ascii_classname ***************************************
1397
1398    Write utf symbol to stdout with `/' converted to `.' (for debugging
1399    purposes).
1400    Non-printable and non-ASCII characters are printed as '?'.
1401
1402 *******************************************************************************/
1403
1404 void utf_display_printable_ascii_classname(utf *u)
1405 {
1406         char *endpos;                       /* points behind utf string           */
1407         char *utf_ptr;                      /* current position in utf text       */
1408
1409         if (u == NULL) {
1410                 printf("NULL");
1411                 fflush(stdout);
1412                 return;
1413         }
1414
1415         endpos = UTF_END(u);
1416         utf_ptr = u->text;
1417
1418         while (utf_ptr < endpos) {
1419                 /* read next unicode character */
1420
1421                 u2 c = utf_nextu2(&utf_ptr);
1422
1423                 if (c == '/')
1424                         c = '.';
1425
1426                 if ((c >= 32) && (c <= 127))
1427                         printf("%c", c);
1428                 else
1429                         printf("?");
1430         }
1431
1432         fflush(stdout);
1433 }
1434
1435
1436 /* utf_sprint_convert_to_latin1 ************************************************
1437         
1438    Write utf symbol into c-string (for debugging purposes).
1439    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1440    invalid results.
1441
1442 *******************************************************************************/
1443
1444 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1445 {
1446         char *endpos;                       /* points behind utf string           */
1447         char *utf_ptr;                      /* current position in utf text       */
1448         u2 pos = 0;                         /* position in c-string               */
1449
1450         if (!u) {
1451                 strcpy(buffer, "NULL");
1452                 return;
1453         }
1454
1455         endpos = UTF_END(u);
1456         utf_ptr = u->text;
1457
1458         while (utf_ptr < endpos) 
1459                 /* copy next unicode character */       
1460                 buffer[pos++] = utf_nextu2(&utf_ptr);
1461
1462         /* terminate string */
1463         buffer[pos] = '\0';
1464 }
1465
1466
1467 /* utf_sprint_convert_to_latin1_classname **************************************
1468         
1469    Write utf symbol into c-string with `/' converted to `.' (for debugging
1470    purposes).
1471    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1472    invalid results.
1473
1474 *******************************************************************************/
1475
1476 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1477 {
1478         char *endpos;                       /* points behind utf string           */
1479         char *utf_ptr;                      /* current position in utf text       */
1480         u2 pos = 0;                         /* position in c-string               */
1481
1482         if (!u) {
1483                 strcpy(buffer, "NULL");
1484                 return;
1485         }
1486
1487         endpos = UTF_END(u);
1488         utf_ptr = u->text;
1489
1490         while (utf_ptr < endpos) {
1491                 /* copy next unicode character */       
1492                 u2 c = utf_nextu2(&utf_ptr);
1493                 if (c == '/') c = '.';
1494                 buffer[pos++] = c;
1495         }
1496
1497         /* terminate string */
1498         buffer[pos] = '\0';
1499 }
1500
1501
1502 /* utf_strcat_convert_to_latin1 ************************************************
1503         
1504    Like libc strcat, but uses an utf8 string.
1505    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1506    invalid results.
1507
1508 *******************************************************************************/
1509
1510 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1511 {
1512         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1513 }
1514
1515
1516 /* utf_strcat_convert_to_latin1_classname **************************************
1517         
1518    Like libc strcat, but uses an utf8 string.
1519    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1520    invalid results.
1521
1522 *******************************************************************************/
1523
1524 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1525 {
1526         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1527 }
1528
1529
1530 /* utf_fprint_printable_ascii **************************************************
1531         
1532    Write utf symbol into file.
1533    Non-printable and non-ASCII characters are printed as '?'.
1534
1535 *******************************************************************************/
1536
1537 void utf_fprint_printable_ascii(FILE *file, utf *u)
1538 {
1539         char *endpos;                       /* points behind utf string           */
1540         char *utf_ptr;                      /* current position in utf text       */
1541
1542         if (!u)
1543                 return;
1544
1545         endpos = UTF_END(u);
1546         utf_ptr = u->text;
1547
1548         while (utf_ptr < endpos) { 
1549                 /* read next unicode character */                
1550                 u2 c = utf_nextu2(&utf_ptr);                            
1551
1552                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1553                 else fprintf(file, "?");
1554         }
1555 }
1556
1557
1558 /* utf_fprint_printable_ascii_classname ****************************************
1559         
1560    Write utf symbol into file with `/' converted to `.'.
1561    Non-printable and non-ASCII characters are printed as '?'.
1562
1563 *******************************************************************************/
1564
1565 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1566 {
1567         char *endpos;                       /* points behind utf string           */
1568         char *utf_ptr;                      /* current position in utf text       */
1569
1570     if (!u)
1571                 return;
1572
1573         endpos = UTF_END(u);
1574         utf_ptr = u->text;
1575
1576         while (utf_ptr < endpos) { 
1577                 /* read next unicode character */                
1578                 u2 c = utf_nextu2(&utf_ptr);                            
1579                 if (c == '/') c = '.';
1580
1581                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1582                 else fprintf(file, "?");
1583         }
1584 }
1585
1586
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string in the sense
   accepted for Java class files:

   - a 0x00 byte is never allowed; the zero character has to be encoded
     as the two-byte sequence 0xc0 0x80 (any longer encoding of zero is
     rejected),
   - only sequences with one or two continuation bytes are accepted
     (Java limitation); longer sequences and invalid leading bytes are
     rejected,
   - every continuation byte must have the form 10xx xxxx and must lie
     before end_pos.

   Deliberately NOT rejected, because Sun Java seems to allow them:
   overlong encodings of non-zero values, surrogates (0xd800..0xdfff)
   and the code points 0xfffe/0xffff.

   utf_ptr...points to first character
   end_pos...points after last character

   Returns false if end_pos lies before utf_ptr; an empty string
   (utf_ptr == end_pos) is valid.

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	unsigned char c;                    /* current byte                       */
	unsigned long v;                    /* decoded value of current sequence  */
	int           len;                  /* number of continuation bytes       */
	int           i;

	if (end_pos < utf_ptr)
		return false;

	/* The remaining length is always derived from the pointers
	   themselves, so it cannot be truncated by a narrower counter. */

	while (utf_ptr < end_pos) {
		c = (unsigned char) *utf_ptr++;

		if (c == 0x00)
			return false;                     /* 0x00 is not allowed */

		if ((c & 0x80) == 0)
			continue;                         /* ASCII */

		if ((c & 0xe0) == 0xc0)
			len = 1;                          /* 110x xxxx */
		else if ((c & 0xf0) == 0xe0)
			len = 2;                          /* 1110 xxxx */
		else
			return false;                     /* invalid leading byte or
			                                     sequence too long for Java */

		if (end_pos - utf_ptr < len)
			return false;                     /* missing bytes */

		/* payload bits of the leading byte: 5 for len 1, 4 for len 2 */

		v = c & (0x3f >> len);

		for (i = 0; i < len; i++) {
			c = (unsigned char) *utf_ptr++;

			if ((c & 0xc0) != 0x80)           /* 10xx xxxx */
				return false;

			v = (v << 6) | (c & 0x3f);
		}

		/* Java special: zero may only be encoded as 0xc0 0x80 */

		if (v == 0 && len != 1)
			return false;
	}

	return true;
}
1652
1653
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters below 0x20 and the encoded zero character 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!
         Should it not have, this function still never reads a byte at
         or behind end_pos.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	unsigned char c;                    /* current byte                       */

	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names */

	while (utf_ptr < end_pos) {
		c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters */

		/* disallow zero (0xc0 0x80); the second byte is only inspected
		   if it still belongs to the string, so a trailing 0xc0 cannot
		   cause a read behind end_pos */

		if (c == 0xc0 && utf_ptr < end_pos
			&& (unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
1681
1682 bool is_valid_name_utf(utf *u)
1683 {
1684         return is_valid_name(u->text, UTF_END(u));
1685 }
1686
1687
1688 /* utf_show ********************************************************************
1689
1690    Writes the utf symbols in the utfhash to stdout and displays the
1691    number of external hash chains grouped according to the chainlength
1692    (for debugging purposes).
1693
1694 *******************************************************************************/
1695
1696 #if !defined(NDEBUG)
1697 void utf_show(void)
1698 {
1699
1700 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1701
1702         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1703         u4 max_chainlength = 0;      /* maximum length of the chains */
1704         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1705         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1706         u4 i;
1707
1708         printf("UTF-HASH:\n");
1709
1710         /* show element of utf-hashtable */
1711
1712         for (i = 0; i < hashtable_utf->size; i++) {
1713                 utf *u = hashtable_utf->ptr[i];
1714
1715                 if (u) {
1716                         printf("SLOT %d: ", (int) i);
1717
1718                         while (u) {
1719                                 printf("'");
1720                                 utf_display_printable_ascii(u);
1721                                 printf("' ");
1722                                 u = u->hashlink;
1723                         }       
1724                         printf("\n");
1725                 }
1726         }
1727
1728         printf("UTF-HASH: %d slots for %d entries\n", 
1729                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1730
1731         if (hashtable_utf->entries == 0)
1732                 return;
1733
1734         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1735
1736         for (i=0;i<CHAIN_LIMIT;i++)
1737                 chain_count[i]=0;
1738
1739         /* count numbers of hashchains according to their length */
1740         for (i=0; i<hashtable_utf->size; i++) {
1741                   
1742                 utf *u = (utf*) hashtable_utf->ptr[i];
1743                 u4 chain_length = 0;
1744
1745                 /* determine chainlength */
1746                 while (u) {
1747                         u = u->hashlink;
1748                         chain_length++;
1749                 }
1750
1751                 /* update sum of all chainlengths */
1752                 sum_chainlength+=chain_length;
1753
1754                 /* determine the maximum length of the chains */
1755                 if (chain_length>max_chainlength)
1756                         max_chainlength = chain_length;
1757
1758                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1759                 if (chain_length>=CHAIN_LIMIT) {
1760                         beyond_limit+=chain_length;
1761                         chain_length=CHAIN_LIMIT-1;
1762                 }
1763
1764                 /* update number of hashchains of current length */
1765                 chain_count[chain_length]++;
1766         }
1767
1768         /* display results */  
1769         for (i=1;i<CHAIN_LIMIT-1;i++) 
1770                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1771           
1772         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1773
1774
1775         printf("max. chainlength:%5d\n",max_chainlength);
1776
1777         /* avg. chainlength = sum of chainlengths / number of chains */
1778         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1779 }
1780 #endif /* !defined(NDEBUG) */
1781
1782
1783 /*
1784  * These are local overrides for various environment variables in Emacs.
1785  * Please do not remove this and leave it at the end of the file, where
1786  * Emacs will automagically detect them.
1787  * ---------------------------------------------------------------------
1788  * Local variables:
1789  * mode: c
1790  * indent-tabs-mode: t
1791  * c-basic-offset: 4
1792  * tab-width: 4
1793  * End:
1794  * vim:noexpandtab:sw=4:ts=4:
1795  */