* src/vm/utf8.c (utf_display): Some reformatting.
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf.c - utf functions
2
3    Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6    J. Wenninger, Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23    02110-1301, USA.
24
25    Contact: cacao@cacaojvm.org
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 4500 2006-02-13 10:53:49Z twisti $
34
35 */
36
37
38 #include <string.h>
39 #include <assert.h>
40
41 #include "config.h"
42 #include "vm/types.h"
43
44 #include "mm/memory.h"
45
46 #if defined(USE_THREADS)
47 # if defined(NATIVE_THREADS)
48 #  include "threads/native/threads.h"
49 # else
50 #  include "threads/green/threads.h"
51 # endif
52 #endif
53
54 #include "vm/builtin.h"
55 #include "vm/exceptions.h"
56 #include "vm/hashtable.h"
57 #include "vm/options.h"
58 #include "vm/statistics.h"
59 #include "vm/stringlocal.h"
60 #include "vm/utf8.h"
61
62 /* global variables ***********************************************************/
63
64 /* hashsize must be power of 2 */
65
66 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
67
68 hashtable hashtable_utf;                /* hashtable for utf8-symbols         */
69
70 #if defined(USE_THREADS)
71 static java_objectheader *lock_hashtable_utf;
72 #endif
73
74
75 /* utf-symbols for pointer comparison of frequently used strings **************/
76
77 utf *utf_java_lang_Object;              /* java/lang/Object                   */
78
79 utf *utf_java_lang_Class;
80 utf *utf_java_lang_ClassLoader;
81 utf *utf_java_lang_Cloneable;
82 utf *utf_java_lang_SecurityManager;
83 utf *utf_java_lang_String;
84 utf *utf_java_lang_System;
85 utf *utf_java_lang_ThreadGroup;
86 utf *utf_java_io_Serializable;
87
88 utf *utf_java_lang_Throwable;
89 utf *utf_java_lang_VMThrowable;
90 utf *utf_java_lang_Error;
91 utf *utf_java_lang_NoClassDefFoundError;
92 utf *utf_java_lang_LinkageError;
93 utf *utf_java_lang_NoSuchMethodError;
94 utf *utf_java_lang_OutOfMemoryError;
95
96 utf *utf_java_lang_Exception;
97 utf *utf_java_lang_ClassNotFoundException;
98 utf *utf_java_lang_IllegalArgumentException;
99 utf *utf_java_lang_IllegalMonitorStateException;
100
101 utf *utf_java_lang_NullPointerException;
102
103 utf* utf_java_lang_Void;
104 utf* utf_java_lang_Boolean;
105 utf* utf_java_lang_Byte;
106 utf* utf_java_lang_Character;
107 utf* utf_java_lang_Short;
108 utf* utf_java_lang_Integer;
109 utf* utf_java_lang_Long;
110 utf* utf_java_lang_Float;
111 utf* utf_java_lang_Double;
112
113 utf *utf_java_lang_StackTraceElement;
114 utf *utf_java_lang_reflect_Constructor;
115 utf *utf_java_lang_reflect_Field;
116 utf *utf_java_lang_reflect_Method;
117 utf *utf_java_util_Vector;
118
119 utf *utf_InnerClasses;                  /* InnerClasses                       */
120 utf *utf_ConstantValue;                 /* ConstantValue                      */
121 utf *utf_Code;                          /* Code                               */
122 utf *utf_Exceptions;                    /* Exceptions                         */
123 utf *utf_LineNumberTable;               /* LineNumberTable                    */
124 utf *utf_SourceFile;                    /* SourceFile                         */
125
126 utf *utf_init;                          /* <init>                             */
127 utf *utf_clinit;                        /* <clinit>                           */
128 utf *utf_clone;                         /* clone                              */
129 utf *utf_finalize;                      /* finalize                           */
130 utf *utf_run;                           /* run                                */
131
132 utf *utf_add;                           /* add                                */
133 utf *utf_remove;                        /* remove                             */
134 utf *utf_put;                           /* put                                */
135 utf *utf_get;                           /* get                                */
136 utf *utf_value;                         /* value                              */
137
138 utf *utf_fillInStackTrace;
139 utf *utf_getSystemClassLoader;
140 utf *utf_loadClass;
141 utf *utf_printStackTrace;
142
143 utf *utf_Z;                             /* Z                                  */
144 utf *utf_B;                             /* B                                  */
145 utf *utf_C;                             /* C                                  */
146 utf *utf_S;                             /* S                                  */
147 utf *utf_I;                             /* I                                  */
148 utf *utf_J;                             /* J                                  */
149 utf *utf_F;                             /* F                                  */
150 utf *utf_D;                             /* D                                  */
151
152 utf *utf_void__void;                    /* ()V                                */
153 utf *utf_boolean__void;                 /* (Z)V                               */
154 utf *utf_byte__void;                    /* (B)V                               */
155 utf *utf_char__void;                    /* (C)V                               */
156 utf *utf_short__void;                   /* (S)V                               */
157 utf *utf_int__void;                     /* (I)V                               */
158 utf *utf_long__void;                    /* (J)V                               */
159 utf *utf_float__void;                   /* (F)V                               */
160 utf *utf_double__void;                  /* (D)V                               */
161
162 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
163 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
164 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
165 utf *utf_java_lang_Object__java_lang_Object;
166 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
167 utf *utf_java_lang_String__java_lang_Class;
168 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
169
170 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
171
172 utf *array_packagename;
173
174
175 /* utf_init ********************************************************************
176
177    Initializes the utf8 subsystem.
178
179 *******************************************************************************/
180
181 bool utf8_init(void)
182 {
183         /* create utf8 hashtable */
184
185         hashtable_create(&hashtable_utf, HASHTABLE_UTF_SIZE);
186
187 #if defined(ENABLE_STATISTICS)
188         if (opt_stat)
189                 count_utf_len += sizeof(utf*) * hashtable_utf.size;
190 #endif
191
192 #if defined(USE_THREADS)
193         /* create utf hashtable lock object */
194
195         lock_hashtable_utf = NEW(java_objectheader);
196
197 # if defined(NATIVE_THREADS)
198         initObjectLock(lock_hashtable_utf);
199 # endif
200 #endif
201
202         /* create utf-symbols for pointer comparison of frequently used strings */
203
204         utf_java_lang_Object           = utf_new_char("java/lang/Object");
205
206         utf_java_lang_Class            = utf_new_char("java/lang/Class");
207         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
208         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
209         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
210         utf_java_lang_String           = utf_new_char("java/lang/String");
211         utf_java_lang_System           = utf_new_char("java/lang/System");
212         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
213         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
214
215         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
216         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
217         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
218
219         utf_java_lang_NoClassDefFoundError =
220                 utf_new_char(string_java_lang_NoClassDefFoundError);
221
222         utf_java_lang_LinkageError =
223                 utf_new_char(string_java_lang_LinkageError);
224
225         utf_java_lang_NoSuchMethodError =
226                 utf_new_char(string_java_lang_NoSuchMethodError);
227
228         utf_java_lang_OutOfMemoryError =
229                 utf_new_char(string_java_lang_OutOfMemoryError);
230
231         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
232
233         utf_java_lang_ClassNotFoundException =
234                 utf_new_char(string_java_lang_ClassNotFoundException);
235
236         utf_java_lang_IllegalArgumentException =
237                 utf_new_char(string_java_lang_IllegalArgumentException);
238
239         utf_java_lang_IllegalMonitorStateException =
240                 utf_new_char(string_java_lang_IllegalMonitorStateException);
241
242         utf_java_lang_NullPointerException =
243                 utf_new_char(string_java_lang_NullPointerException);
244
245         utf_java_lang_Void             = utf_new_char("java/lang/Void");
246         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
247         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
248         utf_java_lang_Character        = utf_new_char("java/lang/Character");
249         utf_java_lang_Short            = utf_new_char("java/lang/Short");
250         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
251         utf_java_lang_Long             = utf_new_char("java/lang/Long");
252         utf_java_lang_Float            = utf_new_char("java/lang/Float");
253         utf_java_lang_Double           = utf_new_char("java/lang/Double");
254
255         utf_java_lang_StackTraceElement =
256                 utf_new_char("java/lang/StackTraceElement");
257
258         utf_java_lang_reflect_Constructor =
259                 utf_new_char("java/lang/reflect/Constructor");
260
261         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
262         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
263         utf_java_util_Vector           = utf_new_char("java/util/Vector");
264
265         utf_InnerClasses               = utf_new_char("InnerClasses");
266         utf_ConstantValue              = utf_new_char("ConstantValue");
267         utf_Code                       = utf_new_char("Code");
268         utf_Exceptions                 = utf_new_char("Exceptions");
269         utf_LineNumberTable            = utf_new_char("LineNumberTable");
270         utf_SourceFile                 = utf_new_char("SourceFile");
271
272         utf_init                           = utf_new_char("<init>");
273         utf_clinit                         = utf_new_char("<clinit>");
274         utf_clone                      = utf_new_char("clone");
275         utf_finalize                   = utf_new_char("finalize");
276         utf_run                        = utf_new_char("run");
277
278         utf_add                        = utf_new_char("add");
279         utf_remove                     = utf_new_char("remove");
280         utf_put                        = utf_new_char("put");
281         utf_get                        = utf_new_char("get");
282         utf_value                      = utf_new_char("value");
283
284         utf_printStackTrace            = utf_new_char("printStackTrace");
285         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
286         utf_loadClass                  = utf_new_char("loadClass");
287         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
288
289         utf_Z                          = utf_new_char("Z");
290         utf_B                          = utf_new_char("B");
291         utf_C                          = utf_new_char("C");
292         utf_S                          = utf_new_char("S");
293         utf_I                          = utf_new_char("I");
294         utf_J                          = utf_new_char("J");
295         utf_F                          = utf_new_char("F");
296         utf_D                          = utf_new_char("D");
297
298         utf_void__void                 = utf_new_char("()V");
299         utf_boolean__void              = utf_new_char("(Z)V");
300         utf_byte__void                 = utf_new_char("(B)V");
301         utf_char__void                 = utf_new_char("(C)V");
302         utf_short__void                = utf_new_char("(S)V");
303         utf_int__void                  = utf_new_char("(I)V");
304         utf_long__void                 = utf_new_char("(J)V");
305         utf_float__void                = utf_new_char("(F)V");
306         utf_double__void               = utf_new_char("(D)V");
307         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
308         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
309
310         utf_void__java_lang_ClassLoader =
311                 utf_new_char("()Ljava/lang/ClassLoader;");
312
313         utf_java_lang_Object__java_lang_Object =
314                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
315
316         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
317
318         utf_java_lang_String__java_lang_Class =
319                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
320
321         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
322
323         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
324
325         array_packagename              = utf_new_char("\t<the array package>");
326
327         /* everything's ok */
328
329         return true;
330 }
331
332
333 /* utf_hashkey *****************************************************************
334
335    The hashkey is computed from the utf-text by using up to 8
336    characters.  For utf-symbols longer than 15 characters 3 characters
337    are taken from the beginning and the end, 2 characters are taken
338    from the middle.
339
340 *******************************************************************************/
341
342 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
343 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
344
345 u4 utf_hashkey(const char *text, u4 length)
346 {
347         const char *start_pos = text;       /* pointer to utf text                */
348         u4 a;
349
350         switch (length) {
351         case 0: /* empty string */
352                 return 0;
353
354         case 1: return fbs(0);
355         case 2: return fbs(0) ^ nbs(3);
356         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
357         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
358         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
359         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
360         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
361         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
362
363         case 9:
364                 a = fbs(0);
365                 a ^= nbs(1);
366                 a ^= nbs(2);
367                 text++;
368                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
369
370         case 10:
371                 a = fbs(0);
372                 text++;
373                 a ^= nbs(2);
374                 a ^= nbs(3);
375                 a ^= nbs(4);
376                 text++;
377                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
378
379         case 11:
380                 a = fbs(0);
381                 text++;
382                 a ^= nbs(2);
383                 a ^= nbs(3);
384                 a ^= nbs(4);
385                 text++;
386                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
387
388         case 12:
389                 a = fbs(0);
390                 text += 2;
391                 a ^= nbs(2);
392                 a ^= nbs(3);
393                 text++;
394                 a ^= nbs(5);
395                 a ^= nbs(6);
396                 a ^= nbs(7);
397                 text++;
398                 return a ^ nbs(9) ^ nbs(10);
399
400         case 13:
401                 a = fbs(0);
402                 a ^= nbs(1);
403                 text++;
404                 a ^= nbs(3);
405                 a ^= nbs(4);
406                 text += 2;      
407                 a ^= nbs(7);
408                 a ^= nbs(8);
409                 text += 2;
410                 return a ^ nbs(9) ^ nbs(10);
411
412         case 14:
413                 a = fbs(0);
414                 text += 2;      
415                 a ^= nbs(3);
416                 a ^= nbs(4);
417                 text += 2;      
418                 a ^= nbs(7);
419                 a ^= nbs(8);
420                 text += 2;
421                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
422
423         case 15:
424                 a = fbs(0);
425                 text += 2;      
426                 a ^= nbs(3);
427                 a ^= nbs(4);
428                 text += 2;      
429                 a ^= nbs(7);
430                 a ^= nbs(8);
431                 text += 2;
432                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
433
434         default:  /* 3 characters from beginning */
435                 a = fbs(0);
436                 text += 2;
437                 a ^= nbs(3);
438                 a ^= nbs(4);
439
440                 /* 2 characters from middle */
441                 text = start_pos + (length / 2);
442                 a ^= fbs(5);
443                 text += 2;
444                 a ^= nbs(6);    
445
446                 /* 3 characters from end */
447                 text = start_pos + length - 4;
448
449                 a ^= fbs(7);
450                 text++;
451
452                 return a ^ nbs(10) ^ nbs(11);
453     }
454 }
455
456
457 /* utf_hashkey *****************************************************************
458
459    Compute the hashkey of a unicode string.
460
461 *******************************************************************************/
462
463 u4 unicode_hashkey(u2 *text, u2 len)
464 {
465         return utf_hashkey((char *) text, len);
466 }
467
468
469 /* utf_new *********************************************************************
470
471    Creates a new utf-symbol, the text of the symbol is passed as a
472    u1-array. The function searches the utf-hashtable for a utf-symbol
473    with this text. On success the element returned, otherwise a new
474    hashtable element is created.
475
476    If the number of entries in the hashtable exceeds twice the size of
477    the hashtable slots a reorganization of the hashtable is done and
478    the utf symbols are copied to a new hashtable with doubled size.
479
480 *******************************************************************************/
481
482 utf *utf_new(const char *text, u2 length)
483 {
484         u4 key;                             /* hashkey computed from utf-text     */
485         u4 slot;                            /* slot in hashtable                  */
486         utf *u;                             /* hashtable element                  */
487         u2 i;
488
489 #if defined(USE_THREADS)
490         builtin_monitorenter(lock_hashtable_utf);
491 #endif
492
493 #if defined(ENABLE_STATISTICS)
494         if (opt_stat)
495                 count_utf_new++;
496 #endif
497
498         key  = utf_hashkey(text, length);
499         slot = key & (hashtable_utf.size - 1);
500         u    = hashtable_utf.ptr[slot];
501
502         /* search external hash chain for utf-symbol */
503
504         while (u) {
505                 if (u->blength == length) {
506                         /* compare text of hashtable elements */
507
508                         for (i = 0; i < length; i++)
509                                 if (text[i] != u->text[i])
510                                         goto nomatch;
511                         
512 #if defined(ENABLE_STATISTICS)
513                         if (opt_stat)
514                                 count_utf_new_found++;
515 #endif
516
517                         /* symbol found in hashtable */
518
519 #if defined(USE_THREADS)
520                         builtin_monitorexit(lock_hashtable_utf);
521 #endif
522
523                         return u;
524                 }
525
526         nomatch:
527                 u = u->hashlink; /* next element in external chain */
528         }
529
530 #if defined(ENABLE_STATISTICS)
531         if (opt_stat)
532                 count_utf_len += sizeof(utf) + length + 1;
533 #endif
534
535         /* location in hashtable found, create new utf element */
536         u = NEW(utf);
537         u->blength  = length;               /* length in bytes of utfstring       */
538         u->hashlink = hashtable_utf.ptr[slot]; /* link in external hashchain      */
539         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
540
541         memcpy(u->text, text, length);      /* copy utf-text                      */
542         u->text[length] = '\0';
543
544         hashtable_utf.ptr[slot] = u;        /* insert symbol into table           */
545         hashtable_utf.entries++;            /* update number of entries           */
546
547         if (hashtable_utf.entries > (hashtable_utf.size * 2)) {
548
549         /* reorganization of hashtable, average length of the external
550            chains is approx. 2 */
551
552                 hashtable  newhash;                              /* the new hashtable */
553                 u4         i;
554                 utf       *u;
555                 utf       *nextu;
556                 u4         slot;
557
558                 /* create new hashtable, double the size */
559
560                 hashtable_create(&newhash, hashtable_utf.size * 2);
561                 newhash.entries = hashtable_utf.entries;
562
563 #if defined(ENABLE_STATISTICS)
564                 if (opt_stat)
565                         count_utf_len += sizeof(utf*) * hashtable_utf.size;
566 #endif
567
568                 /* transfer elements to new hashtable */
569
570                 for (i = 0; i < hashtable_utf.size; i++) {
571                         u = hashtable_utf.ptr[i];
572
573                         while (u) {
574                                 nextu = u->hashlink;
575                                 slot  = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
576                                                 
577                                 u->hashlink = (utf *) newhash.ptr[slot];
578                                 newhash.ptr[slot] = u;
579
580                                 /* follow link in external hash chain */
581
582                                 u = nextu;
583                         }
584                 }
585         
586                 /* dispose old table */
587
588                 MFREE(hashtable_utf.ptr, void*, hashtable_utf.size);
589                 hashtable_utf = newhash;
590         }
591
592 #if defined(USE_THREADS)
593         builtin_monitorexit(lock_hashtable_utf);
594 #endif
595
596         return u;
597 }
598
599
600 /* utf_new_u2 ******************************************************************
601
602    Make utf symbol from u2 array, if isclassname is true '.' is
603    replaced by '/'.
604
605 *******************************************************************************/
606
607 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
608 {
609         char *buffer;                   /* memory buffer for  unicode characters  */
610         char *pos;                      /* pointer to current position in buffer  */
611         u4 left;                        /* unicode characters left                */
612         u4 buflength;                   /* utf length in bytes of the u2 array    */
613         utf *result;                    /* resulting utf-string                   */
614         int i;          
615
616         /* determine utf length in bytes and allocate memory */
617
618         buflength = u2_utflength(unicode_pos, unicode_length); 
619         buffer    = MNEW(char, buflength);
620  
621         left = buflength;
622         pos  = buffer;
623
624         for (i = 0; i++ < unicode_length; unicode_pos++) {
625                 /* next unicode character */
626                 u2 c = *unicode_pos;
627                 
628                 if ((c != 0) && (c < 0x80)) {
629                         /* 1 character */       
630                         left--;
631                 if ((int) left < 0) break;
632                         /* convert classname */
633                         if (isclassname && c == '.')
634                                 *pos++ = '/';
635                         else
636                                 *pos++ = (char) c;
637
638                 } else if (c < 0x800) {             
639                         /* 2 characters */                              
640                 unsigned char high = c >> 6;
641                 unsigned char low  = c & 0x3F;
642                         left = left - 2;
643                 if ((int) left < 0) break;
644                 *pos++ = high | 0xC0; 
645                 *pos++ = low  | 0x80;     
646
647                 } else {         
648                 /* 3 characters */                              
649                 char low  = c & 0x3f;
650                 char mid  = (c >> 6) & 0x3F;
651                 char high = c >> 12;
652                         left = left - 3;
653                 if ((int) left < 0) break;
654                 *pos++ = high | 0xE0; 
655                 *pos++ = mid  | 0x80;  
656                 *pos++ = low  | 0x80;   
657                 }
658         }
659         
660         /* insert utf-string into symbol-table */
661         result = utf_new(buffer,buflength);
662
663         MFREE(buffer, char, buflength);
664
665         return result;
666 }
667
668
669 /* utf_new_char ****************************************************************
670
671    Creates a new utf symbol, the text for this symbol is passed as a
672    c-string ( = char* ).
673
674 *******************************************************************************/
675
676 utf *utf_new_char(const char *text)
677 {
678         return utf_new(text, strlen(text));
679 }
680
681
682 /* utf_new_char_classname ******************************************************
683
684    Creates a new utf symbol, the text for this symbol is passed as a
685    c-string ( = char* ) "." characters are going to be replaced by
686    "/". Since the above function is used often, this is a separte
687    function, instead of an if.
688
689 *******************************************************************************/
690
691 utf *utf_new_char_classname(const char *text)
692 {
693         if (strchr(text, '.')) {
694                 char *txt = strdup(text);
695                 char *end = txt + strlen(txt);
696                 char *c;
697                 utf *tmpRes;
698
699                 for (c = txt; c < end; c++)
700                         if (*c == '.') *c = '/';
701
702                 tmpRes = utf_new(txt, strlen(txt));
703                 FREE(txt, 0);
704
705                 return tmpRes;
706
707         } else
708                 return utf_new(text, strlen(text));
709 }
710
711
712 /* utf_nextu2 ******************************************************************
713
714    Read the next unicode character from the utf string and increment
715    the utf-string pointer accordingly.
716
717 *******************************************************************************/
718
719 u2 utf_nextu2(char **utf_ptr)
720 {
721     /* uncompressed unicode character */
722     u2 unicode_char = 0;
723     /* current position in utf text */  
724     unsigned char *utf = (unsigned char *) (*utf_ptr);
725     /* bytes representing the unicode character */
726     unsigned char ch1, ch2, ch3;
727     /* number of bytes used to represent the unicode character */
728     int len = 0;
729         
730     switch ((ch1 = utf[0]) >> 4) {
731         default: /* 1 byte */
732                 (*utf_ptr)++;
733                 return (u2) ch1;
734         case 0xC: 
735         case 0xD: /* 2 bytes */
736                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
737                         unsigned char high = ch1 & 0x1F;
738                         unsigned char low  = ch2 & 0x3F;
739                         unicode_char = (high << 6) + low;
740                         len = 2;
741                 }
742                 break;
743
744         case 0xE: /* 2 or 3 bytes */
745                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
746                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
747                                 unsigned char low  = ch3 & 0x3f;
748                                 unsigned char mid  = ch2 & 0x3f;
749                                 unsigned char high = ch1 & 0x0f;
750                                 unicode_char = (((high << 6) + mid) << 6) + low;
751                                 len = 3;
752                         } else
753                                 len = 2;                                           
754                 }
755                 break;
756     }
757
758     /* update position in utf-text */
759     *utf_ptr = (char *) (utf + len);
760
761     return unicode_char;
762 }
763
764
765 /* utf_strlen ******************************************************************
766
767    Determine number of unicode characters in the utf string.
768
769 *******************************************************************************/
770
771 u4 utf_strlen(utf *u)
772 {
773         char *endpos;                       /* points behind utf string           */
774         char *utf_ptr;                      /* current position in utf text       */
775         u4 len = 0;                         /* number of unicode characters       */
776
777         if (!u) {
778                 exceptions_throw_nullpointerexception();
779                 return 0;
780         }
781
782         endpos = UTF_END(u);
783         utf_ptr = u->text;
784
785         while (utf_ptr < endpos) {
786                 len++;
787                 /* next unicode character */
788                 utf_nextu2(&utf_ptr);
789         }
790
791         if (utf_ptr != endpos)
792                 /* string ended abruptly */
793                 throw_cacao_exception_exit(string_java_lang_InternalError,
794                                                                    "Illegal utf8 string");
795
796         return len;
797 }
798
799
800 /* u2_utflength ****************************************************************
801
802    Returns the utf length in bytes of a u2 array.
803
804 *******************************************************************************/
805
806 u4 u2_utflength(u2 *text, u4 u2_length)
807 {
808         u4 result_len = 0;                  /* utf length in bytes                */
809         u2 ch;                              /* current unicode character          */
810         u4 len;
811         
812         for (len = 0; len < u2_length; len++) {
813                 /* next unicode character */
814                 ch = *text++;
815           
816                 /* determine bytes required to store unicode character as utf */
817                 if (ch && (ch < 0x80)) 
818                         result_len++;
819                 else if (ch < 0x800)
820                         result_len += 2;        
821                 else 
822                         result_len += 3;        
823         }
824
825     return result_len;
826 }
827
828
829 /* utf_display *****************************************************************
830
831    Write utf symbol to stdout (for debugging purposes).
832
833 *******************************************************************************/
834
835 void utf_display(utf *u)
836 {
837         char *endpos;                       /* points behind utf string           */
838         char *utf_ptr;                      /* current position in utf text       */
839
840         if (u == NULL) {
841                 printf("NULL");
842                 fflush(stdout);
843                 return;
844         }
845
846         endpos = UTF_END(u);
847         utf_ptr = u->text;
848
849         while (utf_ptr < endpos) {
850                 /* read next unicode character */
851
852                 u2 c = utf_nextu2(&utf_ptr);
853
854                 if ((c >= 32) && (c <= 127))
855                         printf("%c", c);
856                 else
857                         printf("?");
858         }
859
860         fflush(stdout);
861 }
862
863
864 /* utf_display_classname *******************************************************
865
866    Write utf symbol to stdout with `/' converted to `.' (for debugging
867    purposes).
868
869 *******************************************************************************/
870
871 void utf_display_classname(utf *u)
872 {
873         char *endpos;                       /* points behind utf string           */
874         char *utf_ptr;                      /* current position in utf text       */
875
876         if (u == NULL) {
877                 printf("NULL");
878                 fflush(stdout);
879                 return;
880         }
881
882         endpos = UTF_END(u);
883         utf_ptr = u->text;
884
885         while (utf_ptr < endpos) {
886                 /* read next unicode character */
887
888                 u2 c = utf_nextu2(&utf_ptr);
889
890                 if (c == '/')
891                         c = '.';
892
893                 if ((c >= 32) && (c <= 127))
894                         printf("%c", c);
895                 else
896                         printf("?");
897         }
898
899         fflush(stdout);
900 }
901
902
903 /* utf_sprint ******************************************************************
904         
905    Write utf symbol into c-string (for debugging purposes).
906
907 *******************************************************************************/
908
909 void utf_sprint(char *buffer, utf *u)
910 {
911         char *endpos;                       /* points behind utf string           */
912         char *utf_ptr;                      /* current position in utf text       */
913         u2 pos = 0;                         /* position in c-string               */
914
915         if (!u) {
916                 strcpy(buffer, "NULL");
917                 return;
918         }
919
920         endpos = UTF_END(u);
921         utf_ptr = u->text;
922
923         while (utf_ptr < endpos) 
924                 /* copy next unicode character */       
925                 buffer[pos++] = utf_nextu2(&utf_ptr);
926
927         /* terminate string */
928         buffer[pos] = '\0';
929 }
930
931
932 /* utf_sprint_classname ********************************************************
933         
934    Write utf symbol into c-string with `/' converted to `.' (for debugging
935    purposes).
936
937 *******************************************************************************/
938
939 void utf_sprint_classname(char *buffer, utf *u)
940 {
941         char *endpos;                       /* points behind utf string           */
942         char *utf_ptr;                      /* current position in utf text       */
943         u2 pos = 0;                         /* position in c-string               */
944
945         if (!u) {
946                 strcpy(buffer, "NULL");
947                 return;
948         }
949
950         endpos = UTF_END(u);
951         utf_ptr = u->text;
952
953         while (utf_ptr < endpos) {
954                 /* copy next unicode character */       
955                 u2 c = utf_nextu2(&utf_ptr);
956                 if (c == '/') c = '.';
957                 buffer[pos++] = c;
958         }
959
960         /* terminate string */
961         buffer[pos] = '\0';
962 }
963
964
965 /* utf_strcat ******************************************************************
966         
967    Like libc strcat, but uses an utf8 string.
968
969 *******************************************************************************/
970
971 void utf_strcat(char *buffer, utf *u)
972 {
973         utf_sprint(buffer + strlen(buffer), u);
974 }
975
976
977 /* utf_strcat_classname ********************************************************
978         
979    Like libc strcat, but uses an utf8 string.
980
981 *******************************************************************************/
982
983 void utf_strcat_classname(char *buffer, utf *u)
984 {
985         utf_sprint_classname(buffer + strlen(buffer), u);
986 }
987
988
989 /* utf_fprint ******************************************************************
990         
991    Write utf symbol into file.
992
993 *******************************************************************************/
994
995 void utf_fprint(FILE *file, utf *u)
996 {
997         char *endpos;                       /* points behind utf string           */
998         char *utf_ptr;                      /* current position in utf text       */
999
1000         if (!u)
1001                 return;
1002
1003         endpos = UTF_END(u);
1004         utf_ptr = u->text;
1005
1006         while (utf_ptr < endpos) { 
1007                 /* read next unicode character */                
1008                 u2 c = utf_nextu2(&utf_ptr);                            
1009
1010                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1011                 else fprintf(file, "?");
1012         }
1013 }
1014
1015
1016 /* utf_fprint_classname ********************************************************
1017         
1018    Write utf symbol into file with `/' converted to `.'.
1019
1020 *******************************************************************************/
1021
1022 void utf_fprint_classname(FILE *file, utf *u)
1023 {
1024         char *endpos;                       /* points behind utf string           */
1025         char *utf_ptr;                      /* current position in utf text       */
1026
1027     if (!u)
1028                 return;
1029
1030         endpos = UTF_END(u);
1031         utf_ptr = u->text;
1032
1033         while (utf_ptr < endpos) { 
1034                 /* read next unicode character */                
1035                 u2 c = utf_nextu2(&utf_ptr);                            
1036                 if (c == '/') c = '.';
1037
1038                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1039                 else fprintf(file, "?");
1040         }
1041 }
1042
1043
/* is_valid_utf ****************************************************************

   Check whether the byte range [utf_ptr, end_pos) is an acceptable
   UTF-8 string.

   utf_ptr...points to first character
   end_pos...points after last character

   Accepted are one-byte characters other than 0x00, two-byte sequences
   (110x xxxx) and three-byte sequences (1110 xxxx), each followed by
   the required number of 10xx xxxx bytes. Everything else is rejected,
   as is a range with end_pos in front of utf_ptr.

   The value zero is only accepted in its two-byte form. Apart from
   that, overlong encodings, surrogates and the codepoints 0xfffe and
   0xffff are deliberately NOT rejected.

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int remaining;                      /* bytes not yet consumed             */

	if (end_pos < utf_ptr)
		return false;

	remaining = end_pos - utf_ptr;

	while (remaining > 0) {
		unsigned char lead = (unsigned char) *utf_ptr++;
		unsigned long value;            /* decoded character                  */
		int           follow;           /* number of continuation bytes       */
		int           left;

		remaining--;

		if (lead == 0x00)
			return false;               /* 0x00 is not allowed                */

		if (lead < 0x80)
			continue;                   /* ASCII                              */

		if ((lead & 0xe0) == 0xc0) {    /* 110x xxxx                          */
			follow = 1;
			value  = lead & 0x1f;
		}
		else if ((lead & 0xf0) == 0xe0) {   /* 1110 xxxx                      */
			follow = 2;
			value  = lead & 0x0f;
		}
		else {
			/* either an invalid leading byte or a sequence longer than
			   three bytes, which is not accepted here */

			return false;
		}

		if (remaining < follow)
			return false;               /* missing bytes                      */

		remaining -= follow;

		for (left = follow; left > 0; left--) {
			unsigned char cont = (unsigned char) *utf_ptr++;

			if ((cont & 0xc0) != 0x80)  /* 10xx xxxx                          */
				return false;

			value = (value << 6) | (cont & 0x3f);
		}

		/* zero has to be written as the two-byte sequence */

		if ((value == 0) && (follow != 1))
			return false;
	}

	return true;
}
1109
1110
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the two-byte encoding of zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

   No byte at or behind end_pos is read: a 0xc0 that is the very last
   byte of the string is not treated as an encoded zero.

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names               */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters        */

		/* disallow zero; look at the following byte only if it still
		   belongs to the string */

		if ((c == 0xc0) && (utf_ptr < end_pos) &&
			((unsigned char) *utf_ptr == 0x80))
			return false;
	}

	return true;
}
1138
1139 bool is_valid_name_utf(utf *u)
1140 {
1141         return is_valid_name(u->text, UTF_END(u));
1142 }
1143
1144
1145 /* utf_show ********************************************************************
1146
1147    Writes the utf symbols in the utfhash to stdout and displays the
1148    number of external hash chains grouped according to the chainlength
1149    (for debugging purposes).
1150
1151 *******************************************************************************/
1152
1153 #if !defined(NDEBUG)
1154 void utf_show(void)
1155 {
1156
1157 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1158
1159         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1160         u4 max_chainlength = 0;      /* maximum length of the chains */
1161         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1162         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1163         u4 i;
1164
1165         printf("UTF-HASH:\n");
1166
1167         /* show element of utf-hashtable */
1168
1169         for (i = 0; i < hashtable_utf.size; i++) {
1170                 utf *u = hashtable_utf.ptr[i];
1171
1172                 if (u) {
1173                         printf("SLOT %d: ", (int) i);
1174
1175                         while (u) {
1176                                 printf("'");
1177                                 utf_display(u);
1178                                 printf("' ");
1179                                 u = u->hashlink;
1180                         }       
1181                         printf("\n");
1182                 }
1183         }
1184
1185         printf("UTF-HASH: %d slots for %d entries\n", 
1186                    (int) hashtable_utf.size, (int) hashtable_utf.entries );
1187
1188         if (hashtable_utf.entries == 0)
1189                 return;
1190
1191         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1192
1193         for (i=0;i<CHAIN_LIMIT;i++)
1194                 chain_count[i]=0;
1195
1196         /* count numbers of hashchains according to their length */
1197         for (i=0; i<hashtable_utf.size; i++) {
1198                   
1199                 utf *u = (utf*) hashtable_utf.ptr[i];
1200                 u4 chain_length = 0;
1201
1202                 /* determine chainlength */
1203                 while (u) {
1204                         u = u->hashlink;
1205                         chain_length++;
1206                 }
1207
1208                 /* update sum of all chainlengths */
1209                 sum_chainlength+=chain_length;
1210
1211                 /* determine the maximum length of the chains */
1212                 if (chain_length>max_chainlength)
1213                         max_chainlength = chain_length;
1214
1215                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1216                 if (chain_length>=CHAIN_LIMIT) {
1217                         beyond_limit+=chain_length;
1218                         chain_length=CHAIN_LIMIT-1;
1219                 }
1220
1221                 /* update number of hashchains of current length */
1222                 chain_count[chain_length]++;
1223         }
1224
1225         /* display results */  
1226         for (i=1;i<CHAIN_LIMIT-1;i++) 
1227                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf.entries));
1228           
1229         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf.entries);
1230
1231
1232         printf("max. chainlength:%5d\n",max_chainlength);
1233
1234         /* avg. chainlength = sum of chainlengths / number of chains */
1235         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf.size-chain_count[0]));
1236 }
1237 #endif /* !defined(NDEBUG) */
1238
1239
1240 /*
1241  * These are local overrides for various environment variables in Emacs.
1242  * Please do not remove this and leave it at the end of the file, where
1243  * Emacs will automagically detect them.
1244  * ---------------------------------------------------------------------
1245  * Local variables:
1246  * mode: c
1247  * indent-tabs-mode: t
1248  * c-basic-offset: 4
1249  * tab-width: 4
1250  * End:
1251  */