d8249fc26355cb35662487ab8e1c685f802fa966
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf8.c - utf functions
2
3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
6    Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23    02111-1307, USA.
24
25    Contact: cacao@complang.tuwien.ac.at
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 4126 2006-01-10 20:55:41Z twisti $
34
35 */
36
37
38 #include <string.h>
39 #include <assert.h>
40
41 #include "config.h"
42 #include "vm/types.h"
43
44 #include "mm/memory.h"
45
46 #if defined(USE_THREADS)
47 # if defined(NATIVE_THREADS)
48 #  include "threads/native/threads.h"
49 # else
50 #  include "threads/green/threads.h"
51 # endif
52 #endif
53
54 #include "vm/builtin.h"
55 #include "vm/exceptions.h"
56 #include "vm/hashtable.h"
57 #include "vm/options.h"
58 #include "vm/statistics.h"
59 #include "vm/stringlocal.h"
60 #include "vm/utf8.h"
61
62 /* global variables ***********************************************************/
63
64 /* hashsize must be power of 2 */
65
66 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
67
68 hashtable hashtable_utf;                /* hashtable for utf8-symbols         */
69
70 #if defined(USE_THREADS)
71 static java_objectheader *lock_hashtable_utf;
72 #endif
73
74
75 /* utf-symbols for pointer comparison of frequently used strings **************/
76
77 utf *utf_java_lang_Object;              /* java/lang/Object                   */
78
79 utf *utf_java_lang_Class;
80 utf *utf_java_lang_ClassLoader;
81 utf *utf_java_lang_Cloneable;
82 utf *utf_java_lang_SecurityManager;
83 utf *utf_java_lang_String;
84 utf *utf_java_lang_System;
85 utf *utf_java_lang_ThreadGroup;
86 utf *utf_java_io_Serializable;
87
88 utf *utf_java_lang_Throwable;
89 utf *utf_java_lang_VMThrowable;
90 utf *utf_java_lang_Error;
91 utf *utf_java_lang_NoClassDefFoundError;
92 utf *utf_java_lang_LinkageError;
93 utf *utf_java_lang_NoSuchMethodError;
94 utf *utf_java_lang_OutOfMemoryError;
95
96 utf *utf_java_lang_Exception;
97 utf *utf_java_lang_ClassNotFoundException;
98 utf *utf_java_lang_IllegalArgumentException;
99 utf *utf_java_lang_IllegalMonitorStateException;
100
101 utf *utf_java_lang_NullPointerException;
102
103 utf* utf_java_lang_Void;
104 utf* utf_java_lang_Boolean;
105 utf* utf_java_lang_Byte;
106 utf* utf_java_lang_Character;
107 utf* utf_java_lang_Short;
108 utf* utf_java_lang_Integer;
109 utf* utf_java_lang_Long;
110 utf* utf_java_lang_Float;
111 utf* utf_java_lang_Double;
112
113 utf *utf_java_lang_StackTraceElement;
114 utf *utf_java_lang_reflect_Constructor;
115 utf *utf_java_lang_reflect_Field;
116 utf *utf_java_lang_reflect_Method;
117 utf *utf_java_util_Vector;
118
119 utf *utf_InnerClasses;                  /* InnerClasses                       */
120 utf *utf_ConstantValue;                 /* ConstantValue                      */
121 utf *utf_Code;                          /* Code                               */
122 utf *utf_Exceptions;                    /* Exceptions                         */
123 utf *utf_LineNumberTable;               /* LineNumberTable                    */
124 utf *utf_SourceFile;                    /* SourceFile                         */
125
126 utf *utf_init;                          /* <init>                             */
127 utf *utf_clinit;                        /* <clinit>                           */
128 utf *utf_clone;                         /* clone                              */
129 utf *utf_finalize;                      /* finalize                           */
130 utf *utf_run;                           /* run                                */
131
132 utf *utf_add;                           /* add                                */
133 utf *utf_remove;                        /* remove                             */
134 utf *utf_put;                           /* put                                */
135 utf *utf_get;                           /* get                                */
136 utf *utf_value;                         /* value                              */
137
138 utf *utf_fillInStackTrace;
139 utf *utf_getSystemClassLoader;
140 utf *utf_loadClass;
141 utf *utf_printStackTrace;
142
143 utf *utf_Z;                             /* Z                                  */
144 utf *utf_B;                             /* B                                  */
145 utf *utf_C;                             /* C                                  */
146 utf *utf_S;                             /* S                                  */
147 utf *utf_I;                             /* I                                  */
148 utf *utf_J;                             /* J                                  */
149 utf *utf_F;                             /* F                                  */
150 utf *utf_D;                             /* D                                  */
151
152 utf *utf_void__void;                    /* ()V                                */
153 utf *utf_boolean__void;                 /* (Z)V                               */
154 utf *utf_byte__void;                    /* (B)V                               */
155 utf *utf_char__void;                    /* (C)V                               */
156 utf *utf_short__void;                   /* (S)V                               */
157 utf *utf_int__void;                     /* (I)V                               */
158 utf *utf_long__void;                    /* (J)V                               */
159 utf *utf_float__void;                   /* (F)V                               */
160 utf *utf_double__void;                  /* (D)V                               */
161
162 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
163 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
164 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
165 utf *utf_java_lang_Object__java_lang_Object;
166 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
167 utf *utf_java_lang_String__java_lang_Class;
168 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
169
170 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
171
172 utf *array_packagename;
173
174
175 /* utf_init ********************************************************************
176
177    Initializes the utf8 subsystem.
178
179 *******************************************************************************/
180
181 bool utf8_init(void)
182 {
183         /* create utf8 hashtable */
184
185         hashtable_create(&hashtable_utf, HASHTABLE_UTF_SIZE);
186
187 #if defined(ENABLE_STATISTICS)
188         if (opt_stat)
189                 count_utf_len += sizeof(utf*) * hashtable_utf.size;
190 #endif
191
192 #if defined(USE_THREADS)
193         /* create utf hashtable lock object */
194
195         lock_hashtable_utf = NEW(java_objectheader);
196
197 # if defined(NATIVE_THREADS)
198         initObjectLock(lock_hashtable_utf);
199 # endif
200 #endif
201
202         /* create utf-symbols for pointer comparison of frequently used strings */
203
204         utf_java_lang_Object           = utf_new_char("java/lang/Object");
205
206         utf_java_lang_Class            = utf_new_char("java/lang/Class");
207         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
208         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
209         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
210         utf_java_lang_String           = utf_new_char("java/lang/String");
211         utf_java_lang_System           = utf_new_char("java/lang/System");
212         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
213         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
214
215         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
216         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
217         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
218
219         utf_java_lang_NoClassDefFoundError =
220                 utf_new_char(string_java_lang_NoClassDefFoundError);
221
222         utf_java_lang_LinkageError =
223                 utf_new_char(string_java_lang_LinkageError);
224
225         utf_java_lang_NoSuchMethodError =
226                 utf_new_char(string_java_lang_NoSuchMethodError);
227
228         utf_java_lang_OutOfMemoryError =
229                 utf_new_char(string_java_lang_OutOfMemoryError);
230
231         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
232
233         utf_java_lang_ClassNotFoundException =
234                 utf_new_char(string_java_lang_ClassNotFoundException);
235
236         utf_java_lang_IllegalArgumentException =
237                 utf_new_char(string_java_lang_IllegalArgumentException);
238
239         utf_java_lang_IllegalMonitorStateException =
240                 utf_new_char(string_java_lang_IllegalMonitorStateException);
241
242         utf_java_lang_NullPointerException =
243                 utf_new_char(string_java_lang_NullPointerException);
244
245         utf_java_lang_Void             = utf_new_char("java/lang/Void");
246         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
247         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
248         utf_java_lang_Character        = utf_new_char("java/lang/Character");
249         utf_java_lang_Short            = utf_new_char("java/lang/Short");
250         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
251         utf_java_lang_Long             = utf_new_char("java/lang/Long");
252         utf_java_lang_Float            = utf_new_char("java/lang/Float");
253         utf_java_lang_Double           = utf_new_char("java/lang/Double");
254
255         utf_java_lang_StackTraceElement =
256                 utf_new_char("java/lang/StackTraceElement");
257
258         utf_java_lang_reflect_Constructor =
259                 utf_new_char("java/lang/reflect/Constructor");
260
261         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
262         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
263         utf_java_util_Vector           = utf_new_char("java/util/Vector");
264
265         utf_InnerClasses               = utf_new_char("InnerClasses");
266         utf_ConstantValue              = utf_new_char("ConstantValue");
267         utf_Code                       = utf_new_char("Code");
268         utf_Exceptions                 = utf_new_char("Exceptions");
269         utf_LineNumberTable            = utf_new_char("LineNumberTable");
270         utf_SourceFile                 = utf_new_char("SourceFile");
271
272         utf_init                           = utf_new_char("<init>");
273         utf_clinit                         = utf_new_char("<clinit>");
274         utf_clone                      = utf_new_char("clone");
275         utf_finalize                   = utf_new_char("finalize");
276         utf_run                        = utf_new_char("run");
277
278         utf_add                        = utf_new_char("add");
279         utf_remove                     = utf_new_char("remove");
280         utf_put                        = utf_new_char("put");
281         utf_get                        = utf_new_char("get");
282         utf_value                      = utf_new_char("value");
283
284         utf_printStackTrace            = utf_new_char("printStackTrace");
285         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
286         utf_loadClass                  = utf_new_char("loadClass");
287         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
288
289         utf_Z                          = utf_new_char("Z");
290         utf_B                          = utf_new_char("B");
291         utf_C                          = utf_new_char("C");
292         utf_S                          = utf_new_char("S");
293         utf_I                          = utf_new_char("I");
294         utf_J                          = utf_new_char("J");
295         utf_F                          = utf_new_char("F");
296         utf_D                          = utf_new_char("D");
297
298         utf_void__void                 = utf_new_char("()V");
299         utf_boolean__void              = utf_new_char("(Z)V");
300         utf_byte__void                 = utf_new_char("(B)V");
301         utf_char__void                 = utf_new_char("(C)V");
302         utf_short__void                = utf_new_char("(S)V");
303         utf_int__void                  = utf_new_char("(I)V");
304         utf_long__void                 = utf_new_char("(J)V");
305         utf_float__void                = utf_new_char("(F)V");
306         utf_double__void               = utf_new_char("(D)V");
307         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
308         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
309
310         utf_void__java_lang_ClassLoader =
311                 utf_new_char("()Ljava/lang/ClassLoader;");
312
313         utf_java_lang_Object__java_lang_Object =
314                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
315
316         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
317
318         utf_java_lang_String__java_lang_Class =
319                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
320
321         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
322
323         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
324
325         array_packagename              = utf_new_char("\t<the array package>");
326
327         /* everything's ok */
328
329         return true;
330 }
331
332
333 /* utf_hashkey *****************************************************************
334
335    The hashkey is computed from the utf-text by using up to 8
336    characters.  For utf-symbols longer than 15 characters 3 characters
337    are taken from the beginning and the end, 2 characters are taken
338    from the middle.
339
340 *******************************************************************************/
341
342 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
343 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
344
345 u4 utf_hashkey(const char *text, u4 length)
346 {
347         const char *start_pos = text;       /* pointer to utf text                */
348         u4 a;
349
350         switch (length) {
351         case 0: /* empty string */
352                 return 0;
353
354         case 1: return fbs(0);
355         case 2: return fbs(0) ^ nbs(3);
356         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
357         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
358         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
359         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
360         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
361         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
362
363         case 9:
364                 a = fbs(0);
365                 a ^= nbs(1);
366                 a ^= nbs(2);
367                 text++;
368                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
369
370         case 10:
371                 a = fbs(0);
372                 text++;
373                 a ^= nbs(2);
374                 a ^= nbs(3);
375                 a ^= nbs(4);
376                 text++;
377                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
378
379         case 11:
380                 a = fbs(0);
381                 text++;
382                 a ^= nbs(2);
383                 a ^= nbs(3);
384                 a ^= nbs(4);
385                 text++;
386                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
387
388         case 12:
389                 a = fbs(0);
390                 text += 2;
391                 a ^= nbs(2);
392                 a ^= nbs(3);
393                 text++;
394                 a ^= nbs(5);
395                 a ^= nbs(6);
396                 a ^= nbs(7);
397                 text++;
398                 return a ^ nbs(9) ^ nbs(10);
399
400         case 13:
401                 a = fbs(0);
402                 a ^= nbs(1);
403                 text++;
404                 a ^= nbs(3);
405                 a ^= nbs(4);
406                 text += 2;      
407                 a ^= nbs(7);
408                 a ^= nbs(8);
409                 text += 2;
410                 return a ^ nbs(9) ^ nbs(10);
411
412         case 14:
413                 a = fbs(0);
414                 text += 2;      
415                 a ^= nbs(3);
416                 a ^= nbs(4);
417                 text += 2;      
418                 a ^= nbs(7);
419                 a ^= nbs(8);
420                 text += 2;
421                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
422
423         case 15:
424                 a = fbs(0);
425                 text += 2;      
426                 a ^= nbs(3);
427                 a ^= nbs(4);
428                 text += 2;      
429                 a ^= nbs(7);
430                 a ^= nbs(8);
431                 text += 2;
432                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
433
434         default:  /* 3 characters from beginning */
435                 a = fbs(0);
436                 text += 2;
437                 a ^= nbs(3);
438                 a ^= nbs(4);
439
440                 /* 2 characters from middle */
441                 text = start_pos + (length / 2);
442                 a ^= fbs(5);
443                 text += 2;
444                 a ^= nbs(6);    
445
446                 /* 3 characters from end */
447                 text = start_pos + length - 4;
448
449                 a ^= fbs(7);
450                 text++;
451
452                 return a ^ nbs(10) ^ nbs(11);
453     }
454 }
455
456
457 /* utf_hashkey *****************************************************************
458
459    Compute the hashkey of a unicode string.
460
461 *******************************************************************************/
462
463 u4 unicode_hashkey(u2 *text, u2 len)
464 {
465         return utf_hashkey((char *) text, len);
466 }
467
468
469 /* utf_new *********************************************************************
470
471    Creates a new utf-symbol, the text of the symbol is passed as a
472    u1-array. The function searches the utf-hashtable for a utf-symbol
473    with this text. On success the element returned, otherwise a new
474    hashtable element is created.
475
476    If the number of entries in the hashtable exceeds twice the size of
477    the hashtable slots a reorganization of the hashtable is done and
478    the utf symbols are copied to a new hashtable with doubled size.
479
480 *******************************************************************************/
481
482 utf *utf_new(const char *text, u2 length)
483 {
484         u4 key;                             /* hashkey computed from utf-text     */
485         u4 slot;                            /* slot in hashtable                  */
486         utf *u;                             /* hashtable element                  */
487         u2 i;
488
489 #if defined(USE_THREADS)
490         builtin_monitorenter(lock_hashtable_utf);
491 #endif
492
493 #if defined(ENABLE_STATISTICS)
494         if (opt_stat)
495                 count_utf_new++;
496 #endif
497
498         key  = utf_hashkey(text, length);
499         slot = key & (hashtable_utf.size - 1);
500         u    = hashtable_utf.ptr[slot];
501
502         /* search external hash chain for utf-symbol */
503
504         while (u) {
505                 if (u->blength == length) {
506                         /* compare text of hashtable elements */
507
508                         for (i = 0; i < length; i++)
509                                 if (text[i] != u->text[i])
510                                         goto nomatch;
511                         
512 #if defined(ENABLE_STATISTICS)
513                         if (opt_stat)
514                                 count_utf_new_found++;
515 #endif
516
517                         /* symbol found in hashtable */
518
519 #if defined(USE_THREADS)
520                         builtin_monitorexit(lock_hashtable_utf);
521 #endif
522
523                         return u;
524                 }
525
526         nomatch:
527                 u = u->hashlink; /* next element in external chain */
528         }
529
530 #if defined(ENABLE_STATISTICS)
531         if (opt_stat)
532                 count_utf_len += sizeof(utf) + length + 1;
533 #endif
534
535         /* location in hashtable found, create new utf element */
536         u = NEW(utf);
537         u->blength  = length;               /* length in bytes of utfstring       */
538         u->hashlink = hashtable_utf.ptr[slot]; /* link in external hashchain      */
539         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
540
541         memcpy(u->text, text, length);      /* copy utf-text                      */
542         u->text[length] = '\0';
543
544         hashtable_utf.ptr[slot] = u;        /* insert symbol into table           */
545         hashtable_utf.entries++;            /* update number of entries           */
546
547         if (hashtable_utf.entries > (hashtable_utf.size * 2)) {
548
549         /* reorganization of hashtable, average length of the external
550            chains is approx. 2 */
551
552                 hashtable  newhash;                              /* the new hashtable */
553                 u4         i;
554                 utf       *u;
555                 utf       *nextu;
556                 u4         slot;
557
558                 /* create new hashtable, double the size */
559
560                 hashtable_create(&newhash, hashtable_utf.size * 2);
561                 newhash.entries = hashtable_utf.entries;
562
563 #if defined(ENABLE_STATISTICS)
564                 if (opt_stat)
565                         count_utf_len += sizeof(utf*) * hashtable_utf.size;
566 #endif
567
568                 /* transfer elements to new hashtable */
569
570                 for (i = 0; i < hashtable_utf.size; i++) {
571                         u = hashtable_utf.ptr[i];
572
573                         while (u) {
574                                 nextu = u->hashlink;
575                                 slot  = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
576                                                 
577                                 u->hashlink = (utf *) newhash.ptr[slot];
578                                 newhash.ptr[slot] = u;
579
580                                 /* follow link in external hash chain */
581
582                                 u = nextu;
583                         }
584                 }
585         
586                 /* dispose old table */
587
588                 MFREE(hashtable_utf.ptr, void*, hashtable_utf.size);
589                 hashtable_utf = newhash;
590         }
591
592 #if defined(USE_THREADS)
593         builtin_monitorexit(lock_hashtable_utf);
594 #endif
595
596         return u;
597 }
598
599
600 /* utf_new_u2 ******************************************************************
601
602    Make utf symbol from u2 array, if isclassname is true '.' is
603    replaced by '/'.
604
605 *******************************************************************************/
606
607 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
608 {
609         char *buffer;                   /* memory buffer for  unicode characters  */
610         char *pos;                      /* pointer to current position in buffer  */
611         u4 left;                        /* unicode characters left                */
612         u4 buflength;                   /* utf length in bytes of the u2 array    */
613         utf *result;                    /* resulting utf-string                   */
614         int i;          
615
616         /* determine utf length in bytes and allocate memory */
617
618         buflength = u2_utflength(unicode_pos, unicode_length); 
619         buffer    = MNEW(char, buflength);
620  
621         left = buflength;
622         pos  = buffer;
623
624         for (i = 0; i++ < unicode_length; unicode_pos++) {
625                 /* next unicode character */
626                 u2 c = *unicode_pos;
627                 
628                 if ((c != 0) && (c < 0x80)) {
629                         /* 1 character */       
630                         left--;
631                 if ((int) left < 0) break;
632                         /* convert classname */
633                         if (isclassname && c == '.')
634                                 *pos++ = '/';
635                         else
636                                 *pos++ = (char) c;
637
638                 } else if (c < 0x800) {             
639                         /* 2 characters */                              
640                 unsigned char high = c >> 6;
641                 unsigned char low  = c & 0x3F;
642                         left = left - 2;
643                 if ((int) left < 0) break;
644                 *pos++ = high | 0xC0; 
645                 *pos++ = low  | 0x80;     
646
647                 } else {         
648                 /* 3 characters */                              
649                 char low  = c & 0x3f;
650                 char mid  = (c >> 6) & 0x3F;
651                 char high = c >> 12;
652                         left = left - 3;
653                 if ((int) left < 0) break;
654                 *pos++ = high | 0xE0; 
655                 *pos++ = mid  | 0x80;  
656                 *pos++ = low  | 0x80;   
657                 }
658         }
659         
660         /* insert utf-string into symbol-table */
661         result = utf_new(buffer,buflength);
662
663         MFREE(buffer, char, buflength);
664
665         return result;
666 }
667
668
669 /* utf_new_char ****************************************************************
670
671    Creates a new utf symbol, the text for this symbol is passed as a
672    c-string ( = char* ).
673
674 *******************************************************************************/
675
676 utf *utf_new_char(const char *text)
677 {
678         return utf_new(text, strlen(text));
679 }
680
681
682 /* utf_new_char_classname ******************************************************
683
684    Creates a new utf symbol, the text for this symbol is passed as a
685    c-string ( = char* ) "." characters are going to be replaced by
686    "/". Since the above function is used often, this is a separte
687    function, instead of an if.
688
689 *******************************************************************************/
690
691 utf *utf_new_char_classname(const char *text)
692 {
693         if (strchr(text, '.')) {
694                 char *txt = strdup(text);
695                 char *end = txt + strlen(txt);
696                 char *c;
697                 utf *tmpRes;
698
699                 for (c = txt; c < end; c++)
700                         if (*c == '.') *c = '/';
701
702                 tmpRes = utf_new(txt, strlen(txt));
703                 FREE(txt, 0);
704
705                 return tmpRes;
706
707         } else
708                 return utf_new(text, strlen(text));
709 }
710
711
712 /* utf_nextu2 ******************************************************************
713
714    Read the next unicode character from the utf string and increment
715    the utf-string pointer accordingly.
716
717 *******************************************************************************/
718
719 u2 utf_nextu2(char **utf_ptr)
720 {
721     /* uncompressed unicode character */
722     u2 unicode_char = 0;
723     /* current position in utf text */  
724     unsigned char *utf = (unsigned char *) (*utf_ptr);
725     /* bytes representing the unicode character */
726     unsigned char ch1, ch2, ch3;
727     /* number of bytes used to represent the unicode character */
728     int len = 0;
729         
730     switch ((ch1 = utf[0]) >> 4) {
731         default: /* 1 byte */
732                 (*utf_ptr)++;
733                 return (u2) ch1;
734         case 0xC: 
735         case 0xD: /* 2 bytes */
736                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
737                         unsigned char high = ch1 & 0x1F;
738                         unsigned char low  = ch2 & 0x3F;
739                         unicode_char = (high << 6) + low;
740                         len = 2;
741                 }
742                 break;
743
744         case 0xE: /* 2 or 3 bytes */
745                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
746                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
747                                 unsigned char low  = ch3 & 0x3f;
748                                 unsigned char mid  = ch2 & 0x3f;
749                                 unsigned char high = ch1 & 0x0f;
750                                 unicode_char = (((high << 6) + mid) << 6) + low;
751                                 len = 3;
752                         } else
753                                 len = 2;                                           
754                 }
755                 break;
756     }
757
758     /* update position in utf-text */
759     *utf_ptr = (char *) (utf + len);
760
761     return unicode_char;
762 }
763
764
765 /* utf_strlen ******************************************************************
766
767    Determine number of unicode characters in the utf string.
768
769 *******************************************************************************/
770
771 u4 utf_strlen(utf *u)
772 {
773         char *endpos;                       /* points behind utf string           */
774         char *utf_ptr;                      /* current position in utf text       */
775         u4 len = 0;                         /* number of unicode characters       */
776
777         if (!u) {
778                 exceptions_throw_nullpointerexception();
779                 return 0;
780         }
781
782         endpos = UTF_END(u);
783         utf_ptr = u->text;
784
785         while (utf_ptr < endpos) {
786                 len++;
787                 /* next unicode character */
788                 utf_nextu2(&utf_ptr);
789         }
790
791         if (utf_ptr != endpos)
792                 /* string ended abruptly */
793                 throw_cacao_exception_exit(string_java_lang_InternalError,
794                                                                    "Illegal utf8 string");
795
796         return len;
797 }
798
799
800 /* u2_utflength ****************************************************************
801
802    Returns the utf length in bytes of a u2 array.
803
804 *******************************************************************************/
805
806 u4 u2_utflength(u2 *text, u4 u2_length)
807 {
808         u4 result_len = 0;                  /* utf length in bytes                */
809         u2 ch;                              /* current unicode character          */
810         u4 len;
811         
812         for (len = 0; len < u2_length; len++) {
813                 /* next unicode character */
814                 ch = *text++;
815           
816                 /* determine bytes required to store unicode character as utf */
817                 if (ch && (ch < 0x80)) 
818                         result_len++;
819                 else if (ch < 0x800)
820                         result_len += 2;        
821                 else 
822                         result_len += 3;        
823         }
824
825     return result_len;
826 }
827
828
829 /* utf_display *****************************************************************
830
831    Write utf symbol to stdout (for debugging purposes).
832
833 *******************************************************************************/
834
835 void utf_display(utf *u)
836 {
837         char *endpos;                       /* points behind utf string           */
838         char *utf_ptr;                      /* current position in utf text       */
839
840         if (!u) {
841                 printf("NULL");
842                 fflush(stdout);
843                 return;
844         }
845
846         endpos = UTF_END(u);
847         utf_ptr = u->text;
848
849         while (utf_ptr < endpos) {
850                 /* read next unicode character */                
851                 u2 c = utf_nextu2(&utf_ptr);
852                 if (c >= 32 && c <= 127) printf("%c", c);
853                 else printf("?");
854         }
855
856         fflush(stdout);
857 }
858
859
860 /* utf_display_classname *******************************************************
861
862    Write utf symbol to stdout with `/' converted to `.' (for debugging
863    purposes).
864
865 *******************************************************************************/
866
867 void utf_display_classname(utf *u)
868 {
869         char *endpos;                       /* points behind utf string           */
870         char *utf_ptr;                      /* current position in utf text       */
871
872         if (!u) {
873                 printf("NULL");
874                 fflush(stdout);
875                 return;
876         }
877
878         endpos = UTF_END(u);
879         utf_ptr = u->text;
880
881         while (utf_ptr < endpos) {
882                 /* read next unicode character */                
883                 u2 c = utf_nextu2(&utf_ptr);
884                 if (c == '/') c = '.';
885                 if (c >= 32 && c <= 127) printf("%c", c);
886                 else printf("?");
887         }
888
889         fflush(stdout);
890 }
891
892
893 /* utf_sprint ******************************************************************
894         
895    Write utf symbol into c-string (for debugging purposes).
896
897 *******************************************************************************/
898
899 void utf_sprint(char *buffer, utf *u)
900 {
901         char *endpos;                       /* points behind utf string           */
902         char *utf_ptr;                      /* current position in utf text       */
903         u2 pos = 0;                         /* position in c-string               */
904
905         if (!u) {
906                 strcpy(buffer, "NULL");
907                 return;
908         }
909
910         endpos = UTF_END(u);
911         utf_ptr = u->text;
912
913         while (utf_ptr < endpos) 
914                 /* copy next unicode character */       
915                 buffer[pos++] = utf_nextu2(&utf_ptr);
916
917         /* terminate string */
918         buffer[pos] = '\0';
919 }
920
921
922 /* utf_sprint_classname ********************************************************
923         
924    Write utf symbol into c-string with `/' converted to `.' (for debugging
925    purposes).
926
927 *******************************************************************************/
928
929 void utf_sprint_classname(char *buffer, utf *u)
930 {
931         char *endpos;                       /* points behind utf string           */
932         char *utf_ptr;                      /* current position in utf text       */
933         u2 pos = 0;                         /* position in c-string               */
934
935         if (!u) {
936                 strcpy(buffer, "NULL");
937                 return;
938         }
939
940         endpos = UTF_END(u);
941         utf_ptr = u->text;
942
943         while (utf_ptr < endpos) {
944                 /* copy next unicode character */       
945                 u2 c = utf_nextu2(&utf_ptr);
946                 if (c == '/') c = '.';
947                 buffer[pos++] = c;
948         }
949
950         /* terminate string */
951         buffer[pos] = '\0';
952 }
953
954
955 /* utf_strcat ******************************************************************
956         
957    Like libc strcat, but uses an utf8 string.
958
959 *******************************************************************************/
960
961 void utf_strcat(char *buffer, utf *u)
962 {
963         utf_sprint(buffer + strlen(buffer), u);
964 }
965
966
967 /* utf_strcat_classname ********************************************************
968         
969    Like libc strcat, but uses an utf8 string.
970
971 *******************************************************************************/
972
973 void utf_strcat_classname(char *buffer, utf *u)
974 {
975         utf_sprint_classname(buffer + strlen(buffer), u);
976 }
977
978
979 /* utf_fprint ******************************************************************
980         
981    Write utf symbol into file.
982
983 *******************************************************************************/
984
985 void utf_fprint(FILE *file, utf *u)
986 {
987         char *endpos;                       /* points behind utf string           */
988         char *utf_ptr;                      /* current position in utf text       */
989
990         if (!u)
991                 return;
992
993         endpos = UTF_END(u);
994         utf_ptr = u->text;
995
996         while (utf_ptr < endpos) { 
997                 /* read next unicode character */                
998                 u2 c = utf_nextu2(&utf_ptr);                            
999
1000                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1001                 else fprintf(file, "?");
1002         }
1003 }
1004
1005
1006 /* utf_fprint_classname ********************************************************
1007         
1008    Write utf symbol into file with `/' converted to `.'.
1009
1010 *******************************************************************************/
1011
1012 void utf_fprint_classname(FILE *file, utf *u)
1013 {
1014         char *endpos;                       /* points behind utf string           */
1015         char *utf_ptr;                      /* current position in utf text       */
1016
1017     if (!u)
1018                 return;
1019
1020         endpos = UTF_END(u);
1021         utf_ptr = u->text;
1022
1023         while (utf_ptr < endpos) { 
1024                 /* read next unicode character */                
1025                 u2 c = utf_nextu2(&utf_ptr);                            
1026                 if (c == '/') c = '.';
1027
1028                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1029                 else fprintf(file, "?");
1030         }
1031 }
1032
1033
/* is_valid_utf ****************************************************************

   Return true if the given byte range is a valid UTF-8 string in the
   sense accepted here:

   - a 0x00 byte is rejected
   - only sequences of one, two or three bytes are accepted
   - every continuation byte must look like 10xx xxxx
   - the value zero may only be encoded with two bytes
   - overlong encodings, surrogates and 0xfffe/0xffff are accepted

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

/*  static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int           remaining;            /* bytes that are not consumed yet    */
	int           trail;                /* number of continuation bytes       */
	int           k;
	unsigned char lead;                 /* first byte of the sequence         */
	unsigned char cont;                 /* current continuation byte          */
	unsigned long v;                    /* decoded value                      */

	if (end_pos < utf_ptr)
		return false;

	remaining = end_pos - utf_ptr;

	while (remaining > 0) {
		lead = (unsigned char) *utf_ptr++;
		remaining--;

		if (lead == 0x00)                         /* 0x00 is not allowed */
			return false;

		if (lead < 0x80)                          /* ASCII */
			continue;

		/* Java only uses two- and three-byte sequences; longer forms
		   and stray continuation bytes are rejected here */

		if ((lead & 0xe0) == 0xc0) {              /* 110x xxxx */
			trail = 1;
			v = lead & 0x1f;
		}
		else if ((lead & 0xf0) == 0xe0) {         /* 1110 xxxx */
			trail = 2;
			v = lead & 0x0f;
		}
		else
			return false;

		if (remaining < trail)                    /* missing bytes */
			return false;

		remaining -= trail;

		for (k = 0; k < trail; k++) {
			cont = (unsigned char) *utf_ptr++;

			if ((cont & 0xc0) != 0x80)            /* 10xx xxxx */
				return false;

			v = (v << 6) | (cont & 0x3f);
		}

		/* zero has to use the two-byte form (Java special); other
		   overlong encodings are tolerated, as Sun Java seems to allow
		   them */

		if (v == 0 && trail != 1)
			return false;

		/* surrogates in UTF-8 seem to be allowed in Java classfiles */
		/* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */

		/* even these seem to be allowed */
		/* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
	}

	return true;
}
1099
1100
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the two-byte encoding of zero.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

   No byte at or behind end_pos is read, even if the string ends with a
   lone 0xc0 byte.

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr) return false; /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20) return false; /* disallow control characters */

		/* 0xc0 0x80 is the encoding of zero; the second byte is only
		   inspected if it still belongs to the string, otherwise the
		   byte behind the string would be read */

		if (c == 0xc0 && utf_ptr < end_pos &&
			(unsigned char) *utf_ptr == 0x80)     /* disallow zero */
			return false;
	}

	return true;
}
1128
1129 bool is_valid_name_utf(utf *u)
1130 {
1131         return is_valid_name(u->text, UTF_END(u));
1132 }
1133
1134
1135 /* utf_show ********************************************************************
1136
1137    Writes the utf symbols in the utfhash to stdout and displays the
1138    number of external hash chains grouped according to the chainlength
1139    (for debugging purposes).
1140
1141 *******************************************************************************/
1142
1143 #if !defined(NDEBUG)
1144 void utf_show(void)
1145 {
1146
1147 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1148
1149         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1150         u4 max_chainlength = 0;      /* maximum length of the chains */
1151         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1152         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1153         u4 i;
1154
1155         printf("UTF-HASH:\n");
1156
1157         /* show element of utf-hashtable */
1158
1159         for (i = 0; i < hashtable_utf.size; i++) {
1160                 utf *u = hashtable_utf.ptr[i];
1161
1162                 if (u) {
1163                         printf("SLOT %d: ", (int) i);
1164
1165                         while (u) {
1166                                 printf("'");
1167                                 utf_display(u);
1168                                 printf("' ");
1169                                 u = u->hashlink;
1170                         }       
1171                         printf("\n");
1172                 }
1173         }
1174
1175         printf("UTF-HASH: %d slots for %d entries\n", 
1176                    (int) hashtable_utf.size, (int) hashtable_utf.entries );
1177
1178         if (hashtable_utf.entries == 0)
1179                 return;
1180
1181         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1182
1183         for (i=0;i<CHAIN_LIMIT;i++)
1184                 chain_count[i]=0;
1185
1186         /* count numbers of hashchains according to their length */
1187         for (i=0; i<hashtable_utf.size; i++) {
1188                   
1189                 utf *u = (utf*) hashtable_utf.ptr[i];
1190                 u4 chain_length = 0;
1191
1192                 /* determine chainlength */
1193                 while (u) {
1194                         u = u->hashlink;
1195                         chain_length++;
1196                 }
1197
1198                 /* update sum of all chainlengths */
1199                 sum_chainlength+=chain_length;
1200
1201                 /* determine the maximum length of the chains */
1202                 if (chain_length>max_chainlength)
1203                         max_chainlength = chain_length;
1204
1205                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1206                 if (chain_length>=CHAIN_LIMIT) {
1207                         beyond_limit+=chain_length;
1208                         chain_length=CHAIN_LIMIT-1;
1209                 }
1210
1211                 /* update number of hashchains of current length */
1212                 chain_count[chain_length]++;
1213         }
1214
1215         /* display results */  
1216         for (i=1;i<CHAIN_LIMIT-1;i++) 
1217                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf.entries));
1218           
1219         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf.entries);
1220
1221
1222         printf("max. chainlength:%5d\n",max_chainlength);
1223
1224         /* avg. chainlength = sum of chainlengths / number of chains */
1225         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf.size-chain_count[0]));
1226 }
1227 #endif /* !defined(NDEBUG) */
1228
1229
1230 /*
1231  * These are local overrides for various environment variables in Emacs.
1232  * Please do not remove this and leave it at the end of the file, where
1233  * Emacs will automagically detect them.
1234  * ---------------------------------------------------------------------
1235  * Local variables:
1236  * mode: c
1237  * indent-tabs-mode: t
1238  * c-basic-offset: 4
1239  * tab-width: 4
1240  * End:
1241  */