* utf_java_lang_IllegalArgumentException: Added.
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf8.c - utf functions
2
3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
6    Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23    02111-1307, USA.
24
25    Contact: cacao@complang.tuwien.ac.at
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 3453 2005-10-19 22:03:06Z twisti $
34
35 */
36
37
38 #include <string.h>
39
40 #include "mm/memory.h"
41 #include "vm/exceptions.h"
42 #include "vm/options.h"
43 #include "vm/statistics.h"
44 #include "vm/stringlocal.h"
45 #include "vm/tables.h"
46 #include "vm/utf8.h"
47
48
49 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
50
51
52 /* utf-symbols for pointer comparison of frequently used strings **************/
53
54 utf *utf_java_lang_Object;              /* java/lang/Object                   */
55
56 utf *utf_java_lang_Class;
57 utf *utf_java_lang_ClassLoader;
58 utf *utf_java_lang_Cloneable;
59 utf *utf_java_lang_SecurityManager;
60 utf *utf_java_lang_String;
61 utf *utf_java_lang_System;
62 utf *utf_java_lang_ThreadGroup;
63 utf *utf_java_io_Serializable;
64
65 utf *utf_java_lang_Throwable;
66 utf *utf_java_lang_VMThrowable;
67 utf *utf_java_lang_Error;
68 utf *utf_java_lang_NoClassDefFoundError;
69 utf *utf_java_lang_OutOfMemoryError;
70
71 utf *utf_java_lang_Exception;
72 utf *utf_java_lang_ClassNotFoundException;
73 utf *utf_java_lang_IllegalArgumentException;
74
75 utf* utf_java_lang_Void;
76 utf* utf_java_lang_Boolean;
77 utf* utf_java_lang_Byte;
78 utf* utf_java_lang_Character;
79 utf* utf_java_lang_Short;
80 utf* utf_java_lang_Integer;
81 utf* utf_java_lang_Long;
82 utf* utf_java_lang_Float;
83 utf* utf_java_lang_Double;
84
85 utf *utf_java_lang_StackTraceElement;
86 utf *utf_java_lang_reflect_Constructor;
87 utf *utf_java_lang_reflect_Field;
88 utf *utf_java_lang_reflect_Method;
89 utf *utf_java_util_Vector;
90
91 utf *utf_InnerClasses;                  /* InnerClasses                       */
92 utf *utf_ConstantValue;                 /* ConstantValue                      */
93 utf *utf_Code;                          /* Code                               */
94 utf *utf_Exceptions;                    /* Exceptions                         */
95 utf *utf_LineNumberTable;               /* LineNumberTable                    */
96 utf *utf_SourceFile;                    /* SourceFile                         */
97
98 utf *utf_init;                          /* <init>                             */
99 utf *utf_clinit;                        /* <clinit>                           */
100 utf *utf_clone;                         /* clone                              */
101 utf *utf_finalize;                      /* finalize                           */
102 utf *utf_run;                           /* run                                */
103
104 utf *utf_add;                           /* add                                */
105 utf *utf_remove;                        /* remove                             */
106 utf *utf_put;                           /* put                                */
107 utf *utf_get;                           /* get                                */
108 utf *utf_value;                         /* value                              */
109
110 utf *utf_fillInStackTrace;
111 utf *utf_getSystemClassLoader;
112 utf *utf_loadClass;
113 utf *utf_printStackTrace;
114
115 utf *utf_Z;                             /* Z                                  */
116 utf *utf_B;                             /* B                                  */
117 utf *utf_C;                             /* C                                  */
118 utf *utf_S;                             /* S                                  */
119 utf *utf_I;                             /* I                                  */
120 utf *utf_J;                             /* J                                  */
121 utf *utf_F;                             /* F                                  */
122 utf *utf_D;                             /* D                                  */
123
124 utf *utf_void__void;                    /* ()V                                */
125 utf *utf_boolean__void;                 /* (Z)V                               */
126 utf *utf_byte__void;                    /* (B)V                               */
127 utf *utf_char__void;                    /* (C)V                               */
128 utf *utf_short__void;                   /* (S)V                               */
129 utf *utf_int__void;                     /* (I)V                               */
130 utf *utf_long__void;                    /* (J)V                               */
131 utf *utf_float__void;                   /* (F)V                               */
132 utf *utf_double__void;                  /* (D)V                               */
133
134 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
135 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
136 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
137 utf *utf_java_lang_Object__java_lang_Object;
138 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
139 utf *utf_java_lang_String__java_lang_Class;
140 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
141
142 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
143
144 utf *array_packagename;
145
146
147 /* utf_init ********************************************************************
148
149    Initializes the utf8 subsystem.
150
151 *******************************************************************************/
152
153 void utf8_init(void)
154 {
155         /* create utf-symbols for pointer comparison of frequently used strings */
156
157         utf_java_lang_Object           = utf_new_char("java/lang/Object");
158
159         utf_java_lang_Class            = utf_new_char("java/lang/Class");
160         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
161         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
162         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
163         utf_java_lang_String           = utf_new_char("java/lang/String");
164         utf_java_lang_System           = utf_new_char("java/lang/System");
165         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
166         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
167
168         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
169         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
170         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
171
172         utf_java_lang_NoClassDefFoundError =
173                 utf_new_char(string_java_lang_NoClassDefFoundError);
174
175         utf_java_lang_OutOfMemoryError =
176                 utf_new_char(string_java_lang_OutOfMemoryError);
177
178         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
179
180         utf_java_lang_ClassNotFoundException =
181                 utf_new_char(string_java_lang_ClassNotFoundException);
182
183         utf_java_lang_IllegalArgumentException =
184                 utf_new_char(string_java_lang_IllegalArgumentException);
185
186         utf_java_lang_Void             = utf_new_char("java/lang/Void");
187         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
188         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
189         utf_java_lang_Character        = utf_new_char("java/lang/Character");
190         utf_java_lang_Short            = utf_new_char("java/lang/Short");
191         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
192         utf_java_lang_Long             = utf_new_char("java/lang/Long");
193         utf_java_lang_Float            = utf_new_char("java/lang/Float");
194         utf_java_lang_Double           = utf_new_char("java/lang/Double");
195
196         utf_java_lang_StackTraceElement =
197                 utf_new_char("java/lang/StackTraceElement");
198
199         utf_java_lang_reflect_Constructor =
200                 utf_new_char("java/lang/reflect/Constructor");
201
202         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
203         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
204         utf_java_util_Vector           = utf_new_char("java/util/Vector");
205
206         utf_InnerClasses               = utf_new_char("InnerClasses");
207         utf_ConstantValue              = utf_new_char("ConstantValue");
208         utf_Code                       = utf_new_char("Code");
209         utf_Exceptions                 = utf_new_char("Exceptions");
210         utf_LineNumberTable            = utf_new_char("LineNumberTable");
211         utf_SourceFile                 = utf_new_char("SourceFile");
212
213         utf_init                           = utf_new_char("<init>");
214         utf_clinit                         = utf_new_char("<clinit>");
215         utf_clone                      = utf_new_char("clone");
216         utf_finalize                   = utf_new_char("finalize");
217         utf_run                        = utf_new_char("run");
218
219         utf_add                        = utf_new_char("add");
220         utf_remove                     = utf_new_char("remove");
221         utf_put                        = utf_new_char("put");
222         utf_get                        = utf_new_char("get");
223         utf_value                      = utf_new_char("value");
224
225         utf_printStackTrace            = utf_new_char("printStackTrace");
226         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
227         utf_loadClass                  = utf_new_char("loadClass");
228         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
229
230         utf_Z                          = utf_new_char("Z");
231         utf_B                          = utf_new_char("B");
232         utf_C                          = utf_new_char("C");
233         utf_S                          = utf_new_char("S");
234         utf_I                          = utf_new_char("I");
235         utf_J                          = utf_new_char("J");
236         utf_F                          = utf_new_char("F");
237         utf_D                          = utf_new_char("D");
238
239         utf_void__void                 = utf_new_char("()V");
240         utf_boolean__void              = utf_new_char("(Z)V");
241         utf_byte__void                 = utf_new_char("(B)V");
242         utf_char__void                 = utf_new_char("(C)V");
243         utf_short__void                = utf_new_char("(S)V");
244         utf_int__void                  = utf_new_char("(I)V");
245         utf_long__void                 = utf_new_char("(J)V");
246         utf_float__void                = utf_new_char("(F)V");
247         utf_double__void               = utf_new_char("(D)V");
248         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
249         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
250
251         utf_void__java_lang_ClassLoader =
252                 utf_new_char("()Ljava/lang/ClassLoader;");
253
254         utf_java_lang_Object__java_lang_Object =
255                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
256
257         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
258
259         utf_java_lang_String__java_lang_Class =
260                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
261
262         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
263
264         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
265
266         array_packagename              = utf_new_char("\t<the array package>");
267 }
268
269
270 /* utf_hashkey *****************************************************************
271
272    The hashkey is computed from the utf-text by using up to 8
273    characters.  For utf-symbols longer than 15 characters 3 characters
274    are taken from the beginning and the end, 2 characters are taken
275    from the middle.
276
277 *******************************************************************************/
278
279 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
280 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
281
282 u4 utf_hashkey(const char *text, u4 length)
283 {
284         const char *start_pos = text;       /* pointer to utf text                */
285         u4 a;
286
287         switch (length) {
288         case 0: /* empty string */
289                 return 0;
290
291         case 1: return fbs(0);
292         case 2: return fbs(0) ^ nbs(3);
293         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
294         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
295         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
296         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
297         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
298         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
299
300         case 9:
301                 a = fbs(0);
302                 a ^= nbs(1);
303                 a ^= nbs(2);
304                 text++;
305                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
306
307         case 10:
308                 a = fbs(0);
309                 text++;
310                 a ^= nbs(2);
311                 a ^= nbs(3);
312                 a ^= nbs(4);
313                 text++;
314                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
315
316         case 11:
317                 a = fbs(0);
318                 text++;
319                 a ^= nbs(2);
320                 a ^= nbs(3);
321                 a ^= nbs(4);
322                 text++;
323                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
324
325         case 12:
326                 a = fbs(0);
327                 text += 2;
328                 a ^= nbs(2);
329                 a ^= nbs(3);
330                 text++;
331                 a ^= nbs(5);
332                 a ^= nbs(6);
333                 a ^= nbs(7);
334                 text++;
335                 return a ^ nbs(9) ^ nbs(10);
336
337         case 13:
338                 a = fbs(0);
339                 a ^= nbs(1);
340                 text++;
341                 a ^= nbs(3);
342                 a ^= nbs(4);
343                 text += 2;      
344                 a ^= nbs(7);
345                 a ^= nbs(8);
346                 text += 2;
347                 return a ^ nbs(9) ^ nbs(10);
348
349         case 14:
350                 a = fbs(0);
351                 text += 2;      
352                 a ^= nbs(3);
353                 a ^= nbs(4);
354                 text += 2;      
355                 a ^= nbs(7);
356                 a ^= nbs(8);
357                 text += 2;
358                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
359
360         case 15:
361                 a = fbs(0);
362                 text += 2;      
363                 a ^= nbs(3);
364                 a ^= nbs(4);
365                 text += 2;      
366                 a ^= nbs(7);
367                 a ^= nbs(8);
368                 text += 2;
369                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
370
371         default:  /* 3 characters from beginning */
372                 a = fbs(0);
373                 text += 2;
374                 a ^= nbs(3);
375                 a ^= nbs(4);
376
377                 /* 2 characters from middle */
378                 text = start_pos + (length / 2);
379                 a ^= fbs(5);
380                 text += 2;
381                 a ^= nbs(6);    
382
383                 /* 3 characters from end */
384                 text = start_pos + length - 4;
385
386                 a ^= fbs(7);
387                 text++;
388
389                 return a ^ nbs(10) ^ nbs(11);
390     }
391 }
392
393
394 /* utf_hashkey *****************************************************************
395
396    Compute the hashkey of a unicode string.
397
398 *******************************************************************************/
399
400 u4 unicode_hashkey(u2 *text, u2 len)
401 {
402         return utf_hashkey((char *) text, len);
403 }
404
405
406 /* utf_new *********************************************************************
407
408    Creates a new utf-symbol, the text of the symbol is passed as a
409    u1-array. The function searches the utf-hashtable for a utf-symbol
410    with this text. On success the element returned, otherwise a new
411    hashtable element is created.
412
413    If the number of entries in the hashtable exceeds twice the size of
414    the hashtable slots a reorganization of the hashtable is done and
415    the utf symbols are copied to a new hashtable with doubled size.
416
417 *******************************************************************************/
418
419 utf *utf_new_intern(const char *text, u2 length);
420
421 utf *utf_new(const char *text, u2 length)
422 {
423     utf *r;
424
425 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
426     tables_lock();
427 #endif
428
429     r = utf_new_intern(text, length);
430
431 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
432     tables_unlock();
433 #endif
434
435     return r;
436 }
437
438
439 utf *utf_new_intern(const char *text, u2 length)
440 {
441         u4 key;                             /* hashkey computed from utf-text     */
442         u4 slot;                            /* slot in hashtable                  */
443         utf *u;                             /* hashtable element                  */
444         u2 i;
445
446 #ifdef STATISTICS
447         if (opt_stat)
448                 count_utf_new++;
449 #endif
450
451         key  = utf_hashkey(text, length);
452         slot = key & (utf_hash.size - 1);
453         u    = utf_hash.ptr[slot];
454
455         /* search external hash chain for utf-symbol */
456         while (u) {
457                 if (u->blength == length) {
458
459                         /* compare text of hashtable elements */
460                         for (i = 0; i < length; i++)
461                                 if (text[i] != u->text[i]) goto nomatch;
462                         
463 #ifdef STATISTICS
464                         if (opt_stat)
465                                 count_utf_new_found++;
466 #endif
467
468                         /* symbol found in hashtable */
469                         return u;
470                 }
471         nomatch:
472                 u = u->hashlink; /* next element in external chain */
473         }
474
475 #ifdef STATISTICS
476         if (opt_stat)
477                 count_utf_len += sizeof(utf) + length;
478 #endif
479
480         /* location in hashtable found, create new utf element */
481         u = NEW(utf);
482         u->blength  = length;               /* length in bytes of utfstring       */
483         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
484         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
485         memcpy(u->text, text, length);      /* copy utf-text                      */
486         u->text[length] = '\0';
487         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
488
489         utf_hash.entries++;                 /* update number of entries           */
490
491         if (utf_hash.entries > (utf_hash.size * 2)) {
492
493         /* reorganization of hashtable, average length of 
494            the external chains is approx. 2                */  
495
496                 u4 i;
497                 utf *u;
498                 hashtable newhash; /* the new hashtable */
499
500                 /* create new hashtable, double the size */
501                 init_hashtable(&newhash, utf_hash.size * 2);
502                 newhash.entries = utf_hash.entries;
503
504 #ifdef STATISTICS
505                 if (opt_stat)
506                         count_utf_len += sizeof(utf*) * utf_hash.size;
507 #endif
508
509                 /* transfer elements to new hashtable */
510                 for (i = 0; i < utf_hash.size; i++) {
511                         u = (utf *) utf_hash.ptr[i];
512                         while (u) {
513                                 utf *nextu = u->hashlink;
514                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
515                                                 
516                                 u->hashlink = (utf *) newhash.ptr[slot];
517                                 newhash.ptr[slot] = u;
518
519                                 /* follow link in external hash chain */
520                                 u = nextu;
521                         }
522                 }
523         
524                 /* dispose old table */
525                 MFREE(utf_hash.ptr, void*, utf_hash.size);
526                 utf_hash = newhash;
527         }
528
529         return u;
530 }
531
532
533 /* utf_new_u2 ******************************************************************
534
535    Make utf symbol from u2 array, if isclassname is true '.' is
536    replaced by '/'.
537
538 *******************************************************************************/
539
540 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
541 {
542         char *buffer;                   /* memory buffer for  unicode characters  */
543         char *pos;                      /* pointer to current position in buffer  */
544         u4 left;                        /* unicode characters left                */
545         u4 buflength;                   /* utf length in bytes of the u2 array    */
546         utf *result;                    /* resulting utf-string                   */
547         int i;          
548
549         /* determine utf length in bytes and allocate memory */
550
551         buflength = u2_utflength(unicode_pos, unicode_length); 
552         buffer    = MNEW(char, buflength);
553  
554         left = buflength;
555         pos  = buffer;
556
557         for (i = 0; i++ < unicode_length; unicode_pos++) {
558                 /* next unicode character */
559                 u2 c = *unicode_pos;
560                 
561                 if ((c != 0) && (c < 0x80)) {
562                         /* 1 character */       
563                         left--;
564                 if ((int) left < 0) break;
565                         /* convert classname */
566                         if (isclassname && c == '.')
567                                 *pos++ = '/';
568                         else
569                                 *pos++ = (char) c;
570
571                 } else if (c < 0x800) {             
572                         /* 2 characters */                              
573                 unsigned char high = c >> 6;
574                 unsigned char low  = c & 0x3F;
575                         left = left - 2;
576                 if ((int) left < 0) break;
577                 *pos++ = high | 0xC0; 
578                 *pos++ = low  | 0x80;     
579
580                 } else {         
581                 /* 3 characters */                              
582                 char low  = c & 0x3f;
583                 char mid  = (c >> 6) & 0x3F;
584                 char high = c >> 12;
585                         left = left - 3;
586                 if ((int) left < 0) break;
587                 *pos++ = high | 0xE0; 
588                 *pos++ = mid  | 0x80;  
589                 *pos++ = low  | 0x80;   
590                 }
591         }
592         
593         /* insert utf-string into symbol-table */
594         result = utf_new(buffer,buflength);
595
596         MFREE(buffer, char, buflength);
597
598         return result;
599 }
600
601
602 /* utf_new_char ****************************************************************
603
604    Creates a new utf symbol, the text for this symbol is passed as a
605    c-string ( = char* ).
606
607 *******************************************************************************/
608
609 utf *utf_new_char(const char *text)
610 {
611         return utf_new(text, strlen(text));
612 }
613
614
615 /* utf_new_char_classname ******************************************************
616
617    Creates a new utf symbol, the text for this symbol is passed as a
618    c-string ( = char* ) "." characters are going to be replaced by
619    "/". Since the above function is used often, this is a separte
620    function, instead of an if.
621
622 *******************************************************************************/
623
624 utf *utf_new_char_classname(const char *text)
625 {
626         if (strchr(text, '.')) {
627                 char *txt = strdup(text);
628                 char *end = txt + strlen(txt);
629                 char *c;
630                 utf *tmpRes;
631
632                 for (c = txt; c < end; c++)
633                         if (*c == '.') *c = '/';
634
635                 tmpRes = utf_new(txt, strlen(txt));
636                 FREE(txt, 0);
637
638                 return tmpRes;
639
640         } else
641                 return utf_new(text, strlen(text));
642 }
643
644
645 /* utf_nextu2 ******************************************************************
646
647    Read the next unicode character from the utf string and increment
648    the utf-string pointer accordingly.
649
650 *******************************************************************************/
651
652 u2 utf_nextu2(char **utf_ptr)
653 {
654     /* uncompressed unicode character */
655     u2 unicode_char = 0;
656     /* current position in utf text */  
657     unsigned char *utf = (unsigned char *) (*utf_ptr);
658     /* bytes representing the unicode character */
659     unsigned char ch1, ch2, ch3;
660     /* number of bytes used to represent the unicode character */
661     int len = 0;
662         
663     switch ((ch1 = utf[0]) >> 4) {
664         default: /* 1 byte */
665                 (*utf_ptr)++;
666                 return (u2) ch1;
667         case 0xC: 
668         case 0xD: /* 2 bytes */
669                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
670                         unsigned char high = ch1 & 0x1F;
671                         unsigned char low  = ch2 & 0x3F;
672                         unicode_char = (high << 6) + low;
673                         len = 2;
674                 }
675                 break;
676
677         case 0xE: /* 2 or 3 bytes */
678                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
679                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
680                                 unsigned char low  = ch3 & 0x3f;
681                                 unsigned char mid  = ch2 & 0x3f;
682                                 unsigned char high = ch1 & 0x0f;
683                                 unicode_char = (((high << 6) + mid) << 6) + low;
684                                 len = 3;
685                         } else
686                                 len = 2;                                           
687                 }
688                 break;
689     }
690
691     /* update position in utf-text */
692     *utf_ptr = (char *) (utf + len);
693
694     return unicode_char;
695 }
696
697
698 /* utf_strlen ******************************************************************
699
700    Determine number of unicode characters in the utf string.
701
702 *******************************************************************************/
703
704 u4 utf_strlen(utf *u)
705 {
706         char *endpos;                       /* points behind utf string           */
707         char *utf_ptr;                      /* current position in utf text       */
708         u4 len = 0;                         /* number of unicode characters       */
709
710         if (!u) {
711                 *exceptionptr = new_nullpointerexception();
712                 return 0;
713         }
714
715         endpos = UTF_END(u);
716         utf_ptr = u->text;
717
718         while (utf_ptr < endpos) {
719                 len++;
720                 /* next unicode character */
721                 utf_nextu2(&utf_ptr);
722         }
723
724         if (utf_ptr != endpos)
725                 /* string ended abruptly */
726                 throw_cacao_exception_exit(string_java_lang_InternalError,
727                                                                    "Illegal utf8 string");
728
729         return len;
730 }
731
732
733 /* u2_utflength ****************************************************************
734
735    Returns the utf length in bytes of a u2 array.
736
737 *******************************************************************************/
738
739 u4 u2_utflength(u2 *text, u4 u2_length)
740 {
741         u4 result_len = 0;                  /* utf length in bytes                */
742         u2 ch;                              /* current unicode character          */
743         u4 len;
744         
745         for (len = 0; len < u2_length; len++) {
746                 /* next unicode character */
747                 ch = *text++;
748           
749                 /* determine bytes required to store unicode character as utf */
750                 if (ch && (ch < 0x80)) 
751                         result_len++;
752                 else if (ch < 0x800)
753                         result_len += 2;        
754                 else 
755                         result_len += 3;        
756         }
757
758     return result_len;
759 }
760
761
762 /* utf_display *****************************************************************
763
764    Write utf symbol to stdout (for debugging purposes).
765
766 *******************************************************************************/
767
768 void utf_display(utf *u)
769 {
770         char *endpos;                       /* points behind utf string           */
771         char *utf_ptr;                      /* current position in utf text       */
772
773         if (!u) {
774                 printf("NULL");
775                 fflush(stdout);
776                 return;
777         }
778
779         endpos = UTF_END(u);
780         utf_ptr = u->text;
781
782         while (utf_ptr < endpos) {
783                 /* read next unicode character */                
784                 u2 c = utf_nextu2(&utf_ptr);
785                 if (c >= 32 && c <= 127) printf("%c", c);
786                 else printf("?");
787         }
788
789         fflush(stdout);
790 }
791
792
793 /* utf_display_classname *******************************************************
794
795    Write utf symbol to stdout with `/' converted to `.' (for debugging
796    purposes).
797
798 *******************************************************************************/
799
800 void utf_display_classname(utf *u)
801 {
802         char *endpos;                       /* points behind utf string           */
803         char *utf_ptr;                      /* current position in utf text       */
804
805         if (!u) {
806                 printf("NULL");
807                 fflush(stdout);
808                 return;
809         }
810
811         endpos = UTF_END(u);
812         utf_ptr = u->text;
813
814         while (utf_ptr < endpos) {
815                 /* read next unicode character */                
816                 u2 c = utf_nextu2(&utf_ptr);
817                 if (c == '/') c = '.';
818                 if (c >= 32 && c <= 127) printf("%c", c);
819                 else printf("?");
820         }
821
822         fflush(stdout);
823 }
824
825
826 /* utf_sprint ******************************************************************
827         
828    Write utf symbol into c-string (for debugging purposes).
829
830 *******************************************************************************/
831
832 void utf_sprint(char *buffer, utf *u)
833 {
834         char *endpos;                       /* points behind utf string           */
835         char *utf_ptr;                      /* current position in utf text       */
836         u2 pos = 0;                         /* position in c-string               */
837
838         if (!u) {
839                 strcpy(buffer, "NULL");
840                 return;
841         }
842
843         endpos = UTF_END(u);
844         utf_ptr = u->text;
845
846         while (utf_ptr < endpos) 
847                 /* copy next unicode character */       
848                 buffer[pos++] = utf_nextu2(&utf_ptr);
849
850         /* terminate string */
851         buffer[pos] = '\0';
852 }
853
854
855 /* utf_sprint_classname ********************************************************
856         
857    Write utf symbol into c-string with `/' converted to `.' (for debugging
858    purposes).
859
860 *******************************************************************************/
861
862 void utf_sprint_classname(char *buffer, utf *u)
863 {
864         char *endpos;                       /* points behind utf string           */
865         char *utf_ptr;                      /* current position in utf text       */
866         u2 pos = 0;                         /* position in c-string               */
867
868         if (!u) {
869                 strcpy(buffer, "NULL");
870                 return;
871         }
872
873         endpos = UTF_END(u);
874         utf_ptr = u->text;
875
876         while (utf_ptr < endpos) {
877                 /* copy next unicode character */       
878                 u2 c = utf_nextu2(&utf_ptr);
879                 if (c == '/') c = '.';
880                 buffer[pos++] = c;
881         }
882
883         /* terminate string */
884         buffer[pos] = '\0';
885 }
886
887
888 /* utf_strcat ******************************************************************
889         
890    Like libc strcat, but uses an utf8 string.
891
892 *******************************************************************************/
893
894 void utf_strcat(char *buffer, utf *u)
895 {
896         utf_sprint(buffer + strlen(buffer), u);
897 }
898
899
900 /* utf_strcat_classname ********************************************************
901         
902    Like libc strcat, but uses an utf8 string.
903
904 *******************************************************************************/
905
906 void utf_strcat_classname(char *buffer, utf *u)
907 {
908         utf_sprint_classname(buffer + strlen(buffer), u);
909 }
910
911
912 /* utf_fprint ******************************************************************
913         
914    Write utf symbol into file.
915
916 *******************************************************************************/
917
918 void utf_fprint(FILE *file, utf *u)
919 {
920         char *endpos;                       /* points behind utf string           */
921         char *utf_ptr;                      /* current position in utf text       */
922
923         if (!u)
924                 return;
925
926         endpos = UTF_END(u);
927         utf_ptr = u->text;
928
929         while (utf_ptr < endpos) { 
930                 /* read next unicode character */                
931                 u2 c = utf_nextu2(&utf_ptr);                            
932
933                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
934                 else fprintf(file, "?");
935         }
936 }
937
938
939 /* utf_fprint_classname ********************************************************
940         
941    Write utf symbol into file with `/' converted to `.'.
942
943 *******************************************************************************/
944
945 void utf_fprint_classname(FILE *file, utf *u)
946 {
947         char *endpos;                       /* points behind utf string           */
948         char *utf_ptr;                      /* current position in utf text       */
949
950     if (!u)
951                 return;
952
953         endpos = UTF_END(u);
954         utf_ptr = u->text;
955
956         while (utf_ptr < endpos) { 
957                 /* read next unicode character */                
958                 u2 c = utf_nextu2(&utf_ptr);                            
959                 if (c == '/') c = '.';
960
961                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
962                 else fprintf(file, "?");
963         }
964 }
965
966
/* is_valid_utf ****************************************************************

   Checks whether the byte range [utf_ptr, end_pos) is an UTF-8 string
   in the form accepted here for Java.

   Accepted are non-zero ASCII bytes, two-byte sequences
   (110x xxxx 10xx xxxx) and three-byte sequences
   (1110 xxxx 10xx xxxx 10xx xxxx). A raw 0x00 byte is rejected; the
   value zero is only accepted in its two-byte form. Longer sequences,
   stray continuation bytes and truncated sequences are rejected, and
   so is a range whose end lies before its start. An empty range is
   valid.

   Overlong encodings of non-zero values, surrogates (0xd800 - 0xdfff)
   and the codepoints 0xfffe/0xffff are deliberately NOT rejected: Sun
   Java seems to allow all of them.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
        int           remaining;            /* bytes not yet consumed             */
        int           trail;                /* continuation bytes of the sequence */
        int           k;
        char          b;                    /* byte under examination             */
        unsigned long value;                /* decoded value of the sequence      */

        if (end_pos < utf_ptr)
                return false;

        remaining = end_pos - utf_ptr;

        while (remaining > 0) {
                remaining--;
                b = *utf_ptr++;

                /* 0x00 is not allowed */
                if (b == 0)
                        return false;

                /* plain ASCII, nothing more to check */
                if (!(b & 0x80))
                        continue;

                /* Classify the leading byte by its high nibble. Leading
                   bytes of sequences longer than three bytes are a Java
                   limitation and fall into the default case, just like
                   stray continuation bytes. */

                switch (b & 0xf0) {
                case 0xc0:
                case 0xd0:                  /* 110x xxxx */
                        trail = 1;
                        break;

                case 0xe0:                  /* 1110 xxxx */
                        trail = 2;
                        break;

                default:
                        return false;
                }

                /* payload bits of the leading byte */
                value = (unsigned long) b & (0x3f >> trail);

                /* all continuation bytes have to be inside the range */
                remaining -= trail;

                if (remaining < 0)
                        return false;

                for (k = 0; k < trail; k++) {
                        b = *utf_ptr++;

                        /* continuation bytes look like 10xx xxxx */
                        if ((b & 0xc0) != 0x80)
                                return false;

                        value = (value << 6) | (b & 0x3f);
                }

                /* Java special: zero may only be encoded with two bytes */
                if (value == 0 && trail != 1)
                        return false;
        }

        return true;
}
1032
1033
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters, i.e. bytes below 0x20, and the two-byte encoding of
   zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Only bytes inside [utf_ptr, end_pos) are examined; a 0xc0 that is
   the very last byte of the range is not looked behind.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
        if (end_pos <= utf_ptr)
                return false;               /* disallow empty names */

        while (utf_ptr < end_pos) {
                unsigned char c = *utf_ptr++;

                if (c < 0x20)
                        return false;       /* disallow control characters */

                /* disallow zero; the bounds check keeps us from reading the
                   byte at end_pos when 0xc0 is the last byte of the range */

                if (c == 0xc0 && utf_ptr < end_pos &&
                        (unsigned char) *utf_ptr == 0x80)
                        return false;
        }

        return true;
}
1061
1062 bool is_valid_name_utf(utf *u)
1063 {
1064         return is_valid_name(u->text, UTF_END(u));
1065 }
1066
1067
1068 /* utf_show ********************************************************************
1069
1070    Writes the utf symbols in the utfhash to stdout and displays the
1071    number of external hash chains grouped according to the chainlength
1072    (for debugging purposes).
1073
1074 *******************************************************************************/
1075
1076 void utf_show(void)
1077 {
1078
1079 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1080
1081         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1082         u4 max_chainlength = 0;      /* maximum length of the chains */
1083         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1084         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1085         u4 i;
1086
1087         printf ("UTF-HASH:\n");
1088
1089         /* show element of utf-hashtable */
1090         for (i=0; i<utf_hash.size; i++) {
1091                 utf *u = utf_hash.ptr[i];
1092                 if (u) {
1093                         printf ("SLOT %d: ", (int) i);
1094                         while (u) {
1095                                 printf ("'");
1096                                 utf_display (u);
1097                                 printf ("' ");
1098                                 u = u->hashlink;
1099                         }       
1100                         printf ("\n");
1101                 }
1102                 
1103         }
1104
1105         printf ("UTF-HASH: %d slots for %d entries\n", 
1106                         (int) utf_hash.size, (int) utf_hash.entries );
1107
1108
1109         if (utf_hash.entries == 0)
1110                 return;
1111
1112         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1113
1114         for (i=0;i<CHAIN_LIMIT;i++)
1115                 chain_count[i]=0;
1116
1117         /* count numbers of hashchains according to their length */
1118         for (i=0; i<utf_hash.size; i++) {
1119                   
1120                 utf *u = (utf*) utf_hash.ptr[i];
1121                 u4 chain_length = 0;
1122
1123                 /* determine chainlength */
1124                 while (u) {
1125                         u = u->hashlink;
1126                         chain_length++;
1127                 }
1128
1129                 /* update sum of all chainlengths */
1130                 sum_chainlength+=chain_length;
1131
1132                 /* determine the maximum length of the chains */
1133                 if (chain_length>max_chainlength)
1134                         max_chainlength = chain_length;
1135
1136                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1137                 if (chain_length>=CHAIN_LIMIT) {
1138                         beyond_limit+=chain_length;
1139                         chain_length=CHAIN_LIMIT-1;
1140                 }
1141
1142                 /* update number of hashchains of current length */
1143                 chain_count[chain_length]++;
1144         }
1145
1146         /* display results */  
1147         for (i=1;i<CHAIN_LIMIT-1;i++) 
1148                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1149           
1150         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1151
1152
1153         printf("max. chainlength:%5d\n",max_chainlength);
1154
1155         /* avg. chainlength = sum of chainlengths / number of chains */
1156         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1157 }
1158
1159
1160 /*
1161  * These are local overrides for various environment variables in Emacs.
1162  * Please do not remove this and leave it at the end of the file, where
1163  * Emacs will automagically detect them.
1164  * ---------------------------------------------------------------------
1165  * Local variables:
1166  * mode: c
1167  * indent-tabs-mode: t
1168  * c-basic-offset: 4
1169  * tab-width: 4
1170  * End:
1171  */