* utf_new_intern: Calculate count_utf_len correctly (off by 1).
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf.c - utf functions
2
3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
6    Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23    02111-1307, USA.
24
25    Contact: cacao@complang.tuwien.ac.at
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 3512 2005-10-28 11:29:52Z twisti $
34
35 */
36
37
38 #include <string.h>
39
40 #include "mm/memory.h"
41 #include "vm/exceptions.h"
42 #include "vm/options.h"
43 #include "vm/statistics.h"
44 #include "vm/stringlocal.h"
45 #include "vm/tables.h"
46 #include "vm/utf8.h"
47
48
49 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
50
51
52 /* utf-symbols for pointer comparison of frequently used strings **************/
53
54 utf *utf_java_lang_Object;              /* java/lang/Object                   */
55
56 utf *utf_java_lang_Class;               /* java/lang/Class                    */
57 utf *utf_java_lang_ClassLoader;         /* java/lang/ClassLoader              */
58 utf *utf_java_lang_Cloneable;           /* java/lang/Cloneable                */
59 utf *utf_java_lang_SecurityManager;     /* java/lang/SecurityManager          */
60 utf *utf_java_lang_String;              /* java/lang/String                   */
61 utf *utf_java_lang_System;              /* java/lang/System                   */
62 utf *utf_java_lang_ThreadGroup;         /* java/lang/ThreadGroup              */
63 utf *utf_java_io_Serializable;          /* java/io/Serializable               */
64
65 utf *utf_java_lang_Throwable;           /* from string_java_lang_Throwable    */
66 utf *utf_java_lang_VMThrowable;         /* from string_java_lang_VMThrowable  */
67 utf *utf_java_lang_Error;               /* from string_java_lang_Error        */
68 utf *utf_java_lang_NoClassDefFoundError; /* from string_java_lang_NoClassDefFoundError */
69 utf *utf_java_lang_OutOfMemoryError;    /* from string_java_lang_OutOfMemoryError */
70
71 utf *utf_java_lang_Exception;           /* from string_java_lang_Exception    */
72 utf *utf_java_lang_ClassNotFoundException; /* from string_java_lang_ClassNotFoundException */
73 utf *utf_java_lang_IllegalArgumentException; /* from string_java_lang_IllegalArgumentException */
74
75 utf *utf_java_lang_NullPointerException; /* from string_java_lang_NullPointerException */
76
77 utf* utf_java_lang_Void;                /* java/lang/Void                     */
78 utf* utf_java_lang_Boolean;             /* java/lang/Boolean                  */
79 utf* utf_java_lang_Byte;                /* java/lang/Byte                     */
80 utf* utf_java_lang_Character;           /* java/lang/Character                */
81 utf* utf_java_lang_Short;               /* java/lang/Short                    */
82 utf* utf_java_lang_Integer;             /* java/lang/Integer                  */
83 utf* utf_java_lang_Long;                /* java/lang/Long                     */
84 utf* utf_java_lang_Float;               /* java/lang/Float                    */
85 utf* utf_java_lang_Double;              /* java/lang/Double                   */
86
87 utf *utf_java_lang_StackTraceElement;   /* java/lang/StackTraceElement        */
88 utf *utf_java_lang_reflect_Constructor; /* java/lang/reflect/Constructor      */
89 utf *utf_java_lang_reflect_Field;       /* java/lang/reflect/Field            */
90 utf *utf_java_lang_reflect_Method;      /* java/lang/reflect/Method           */
91 utf *utf_java_util_Vector;              /* java/util/Vector                   */
92
93 utf *utf_InnerClasses;                  /* InnerClasses                       */
94 utf *utf_ConstantValue;                 /* ConstantValue                      */
95 utf *utf_Code;                          /* Code                               */
96 utf *utf_Exceptions;                    /* Exceptions                         */
97 utf *utf_LineNumberTable;               /* LineNumberTable                    */
98 utf *utf_SourceFile;                    /* SourceFile                         */
99
100 utf *utf_init;                          /* <init>                             */
101 utf *utf_clinit;                        /* <clinit>                           */
102 utf *utf_clone;                         /* clone                              */
103 utf *utf_finalize;                      /* finalize                           */
104 utf *utf_run;                           /* run                                */
105
106 utf *utf_add;                           /* add                                */
107 utf *utf_remove;                        /* remove                             */
108 utf *utf_put;                           /* put                                */
109 utf *utf_get;                           /* get                                */
110 utf *utf_value;                         /* value                              */
111
112 utf *utf_fillInStackTrace;              /* fillInStackTrace                   */
113 utf *utf_getSystemClassLoader;          /* getSystemClassLoader               */
114 utf *utf_loadClass;                     /* loadClass                          */
115 utf *utf_printStackTrace;               /* printStackTrace                    */
116
117 utf *utf_Z;                             /* Z                                  */
118 utf *utf_B;                             /* B                                  */
119 utf *utf_C;                             /* C                                  */
120 utf *utf_S;                             /* S                                  */
121 utf *utf_I;                             /* I                                  */
122 utf *utf_J;                             /* J                                  */
123 utf *utf_F;                             /* F                                  */
124 utf *utf_D;                             /* D                                  */
125
126 utf *utf_void__void;                    /* ()V                                */
127 utf *utf_boolean__void;                 /* (Z)V                               */
128 utf *utf_byte__void;                    /* (B)V                               */
129 utf *utf_char__void;                    /* (C)V                               */
130 utf *utf_short__void;                   /* (S)V                               */
131 utf *utf_int__void;                     /* (I)V                               */
132 utf *utf_long__void;                    /* (J)V                               */
133 utf *utf_float__void;                   /* (F)V                               */
134 utf *utf_double__void;                  /* (D)V                               */
135
136 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
137 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
138 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
139 utf *utf_java_lang_Object__java_lang_Object; /* (Ljava/lang/Object;)Ljava/lang/Object; */
140 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
141 utf *utf_java_lang_String__java_lang_Class; /* (Ljava/lang/String;)Ljava/lang/Class; */
142 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
143
144 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
145
146 utf *array_packagename;                 /* "\t<the array package>"            */
147
148
149 /* utf_init ********************************************************************
150
151    Initializes the utf8 subsystem.
152
153 *******************************************************************************/
154
155 void utf8_init(void)
156 {
157         /* create utf-symbols for pointer comparison of frequently used strings */
158
159         utf_java_lang_Object           = utf_new_char("java/lang/Object");
160
161         utf_java_lang_Class            = utf_new_char("java/lang/Class");
162         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
163         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
164         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
165         utf_java_lang_String           = utf_new_char("java/lang/String");
166         utf_java_lang_System           = utf_new_char("java/lang/System");
167         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
168         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
169
170         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
171         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
172         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
173
174         utf_java_lang_NoClassDefFoundError =
175                 utf_new_char(string_java_lang_NoClassDefFoundError);
176
177         utf_java_lang_OutOfMemoryError =
178                 utf_new_char(string_java_lang_OutOfMemoryError);
179
180         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
181
182         utf_java_lang_ClassNotFoundException =
183                 utf_new_char(string_java_lang_ClassNotFoundException);
184
185         utf_java_lang_IllegalArgumentException =
186                 utf_new_char(string_java_lang_IllegalArgumentException);
187
188         utf_java_lang_NullPointerException =
189                 utf_new_char(string_java_lang_NullPointerException);
190
191         utf_java_lang_Void             = utf_new_char("java/lang/Void");
192         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
193         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
194         utf_java_lang_Character        = utf_new_char("java/lang/Character");
195         utf_java_lang_Short            = utf_new_char("java/lang/Short");
196         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
197         utf_java_lang_Long             = utf_new_char("java/lang/Long");
198         utf_java_lang_Float            = utf_new_char("java/lang/Float");
199         utf_java_lang_Double           = utf_new_char("java/lang/Double");
200
201         utf_java_lang_StackTraceElement =
202                 utf_new_char("java/lang/StackTraceElement");
203
204         utf_java_lang_reflect_Constructor =
205                 utf_new_char("java/lang/reflect/Constructor");
206
207         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
208         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
209         utf_java_util_Vector           = utf_new_char("java/util/Vector");
210
211         utf_InnerClasses               = utf_new_char("InnerClasses");
212         utf_ConstantValue              = utf_new_char("ConstantValue");
213         utf_Code                       = utf_new_char("Code");
214         utf_Exceptions                 = utf_new_char("Exceptions");
215         utf_LineNumberTable            = utf_new_char("LineNumberTable");
216         utf_SourceFile                 = utf_new_char("SourceFile");
217
218         utf_init                           = utf_new_char("<init>");
219         utf_clinit                         = utf_new_char("<clinit>");
220         utf_clone                      = utf_new_char("clone");
221         utf_finalize                   = utf_new_char("finalize");
222         utf_run                        = utf_new_char("run");
223
224         utf_add                        = utf_new_char("add");
225         utf_remove                     = utf_new_char("remove");
226         utf_put                        = utf_new_char("put");
227         utf_get                        = utf_new_char("get");
228         utf_value                      = utf_new_char("value");
229
230         utf_printStackTrace            = utf_new_char("printStackTrace");
231         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
232         utf_loadClass                  = utf_new_char("loadClass");
233         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
234
235         utf_Z                          = utf_new_char("Z");
236         utf_B                          = utf_new_char("B");
237         utf_C                          = utf_new_char("C");
238         utf_S                          = utf_new_char("S");
239         utf_I                          = utf_new_char("I");
240         utf_J                          = utf_new_char("J");
241         utf_F                          = utf_new_char("F");
242         utf_D                          = utf_new_char("D");
243
244         utf_void__void                 = utf_new_char("()V");
245         utf_boolean__void              = utf_new_char("(Z)V");
246         utf_byte__void                 = utf_new_char("(B)V");
247         utf_char__void                 = utf_new_char("(C)V");
248         utf_short__void                = utf_new_char("(S)V");
249         utf_int__void                  = utf_new_char("(I)V");
250         utf_long__void                 = utf_new_char("(J)V");
251         utf_float__void                = utf_new_char("(F)V");
252         utf_double__void               = utf_new_char("(D)V");
253         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
254         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
255
256         utf_void__java_lang_ClassLoader =
257                 utf_new_char("()Ljava/lang/ClassLoader;");
258
259         utf_java_lang_Object__java_lang_Object =
260                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
261
262         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
263
264         utf_java_lang_String__java_lang_Class =
265                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
266
267         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
268
269         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
270
271         array_packagename              = utf_new_char("\t<the array package>");
272 }
273
274
/* utf_hashkey *****************************************************************

   Computes the hashkey of a utf text of `length' bytes by XOR-ing
   selected bytes, each shifted left by a fixed amount.

   - length 0 yields 0;
   - for lengths 1..8 every byte contributes;
   - for lengths 9..15 a fixed subset of the bytes contributes (see the
     individual cases below);
   - for longer texts eight bytes are used: offsets 0, 3 and 4 from the
     beginning, length/2 and length/2 + 3 from the middle, and
     length - 4, length - 2 and length - 1 from the end.

   Each byte is converted from plain `char' to u4 before shifting, so on
   platforms where char is signed, bytes >= 0x80 are sign-extended.

   The bytes are addressed by fixed offsets.  The former implementation
   combined several nbs() expansions (each of which increments `text')
   in a single expression, which modifies `text' more than once without
   an intervening sequence point and is therefore undefined behaviour.
   The offsets and shifts used here are the ones that a left-to-right
   evaluation of the old expressions produced.

*******************************************************************************/

/* Kept only in case other code in this file still expands them.  nbs()
   increments `text', so it must not appear more than once (or together
   with fbs()) within a single expression. */

#define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
#define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */

/* byte at offset idx of `text', shifted left by shift; no side effects */

#define UTF_HB(idx, shift) ((u4) text[(idx)] << (shift))

u4 utf_hashkey(const char *text, u4 length)
{
    u4 mid;                             /* offset of first middle byte        */
    u4 tail;                            /* offset of first byte from the end  */

    switch (length) {
    case 0: /* empty string */
        return 0;

    case 1:
        return UTF_HB(0, 0);

    case 2:
        return UTF_HB(0, 0) ^ UTF_HB(1, 3);

    case 3:
        return UTF_HB(0, 0) ^ UTF_HB(1, 3) ^ UTF_HB(2, 5);

    case 4:
        return UTF_HB(0, 0) ^ UTF_HB(1, 2) ^ UTF_HB(2, 4) ^ UTF_HB(3, 6);

    case 5:
        return UTF_HB(0, 0) ^ UTF_HB(1, 2) ^ UTF_HB(2, 3) ^ UTF_HB(3, 4)
             ^ UTF_HB(4, 6);

    case 6:
        return UTF_HB(0, 0) ^ UTF_HB(1, 1) ^ UTF_HB(2, 2) ^ UTF_HB(3, 3)
             ^ UTF_HB(4, 5) ^ UTF_HB(5, 6);

    case 7:
        return UTF_HB(0, 0) ^ UTF_HB(1, 1) ^ UTF_HB(2, 2) ^ UTF_HB(3, 3)
             ^ UTF_HB(4, 4) ^ UTF_HB(5, 5) ^ UTF_HB(6, 6);

    case 8:
        return UTF_HB(0, 0) ^ UTF_HB(1, 1) ^ UTF_HB(2, 2) ^ UTF_HB(3, 3)
             ^ UTF_HB(4, 4) ^ UTF_HB(5, 5) ^ UTF_HB(6, 6) ^ UTF_HB(7, 7);

    case 9: /* byte 3 is skipped */
        return UTF_HB(0, 0) ^ UTF_HB(1, 1) ^ UTF_HB(2, 2)
             ^ UTF_HB(4, 4) ^ UTF_HB(5, 5) ^ UTF_HB(6, 6) ^ UTF_HB(7, 7)
             ^ UTF_HB(8, 8);

    case 10: /* bytes 1 and 5 are skipped */
        return UTF_HB(0, 0)
             ^ UTF_HB(2, 2) ^ UTF_HB(3, 3) ^ UTF_HB(4, 4)
             ^ UTF_HB(6, 6) ^ UTF_HB(7, 7) ^ UTF_HB(8, 8) ^ UTF_HB(9, 9);

    case 11: /* bytes 1 and 5 are skipped */
        return UTF_HB(0, 0)
             ^ UTF_HB(2, 2) ^ UTF_HB(3, 3) ^ UTF_HB(4, 4)
             ^ UTF_HB(6, 6) ^ UTF_HB(7, 7) ^ UTF_HB(8, 8) ^ UTF_HB(9, 9)
             ^ UTF_HB(10, 10);

    case 12: /* bytes 1, 2, 5 and 9 are skipped */
        return UTF_HB(0, 0)
             ^ UTF_HB(3, 2) ^ UTF_HB(4, 3)
             ^ UTF_HB(6, 5) ^ UTF_HB(7, 6) ^ UTF_HB(8, 7)
             ^ UTF_HB(10, 9) ^ UTF_HB(11, 10);

    case 13: /* bytes 2, 5, 6, 9 and 10 are skipped */
        return UTF_HB(0, 0) ^ UTF_HB(1, 1)
             ^ UTF_HB(3, 3) ^ UTF_HB(4, 4)
             ^ UTF_HB(7, 7) ^ UTF_HB(8, 8)
             ^ UTF_HB(11, 9) ^ UTF_HB(12, 10);

    case 14: /* bytes 1, 2, 5, 6, 9 and 10 are skipped */
    case 15: /* same selection as 14; the last byte is not used */
        return UTF_HB(0, 0)
             ^ UTF_HB(3, 3) ^ UTF_HB(4, 4)
             ^ UTF_HB(7, 7) ^ UTF_HB(8, 8)
             ^ UTF_HB(11, 9) ^ UTF_HB(12, 10) ^ UTF_HB(13, 11);

    default:
        mid  = length / 2;
        tail = length - 4;

        return UTF_HB(0, 0) ^ UTF_HB(3, 3) ^ UTF_HB(4, 4)       /* beginning */
             ^ UTF_HB(mid, 5) ^ UTF_HB(mid + 3, 6)              /* middle    */
             ^ UTF_HB(tail, 7) ^ UTF_HB(tail + 2, 10)           /* end       */
             ^ UTF_HB(tail + 3, 11);
    }
}

#undef UTF_HB
397
398
399 /* utf_hashkey *****************************************************************
400
401    Compute the hashkey of a unicode string.
402
403 *******************************************************************************/
404
405 u4 unicode_hashkey(u2 *text, u2 len)
406 {
407         return utf_hashkey((char *) text, len);
408 }
409
410
411 /* utf_new *********************************************************************
412
413    Creates a new utf-symbol, the text of the symbol is passed as a
414    u1-array. The function searches the utf-hashtable for a utf-symbol
415    with this text. On success the element returned, otherwise a new
416    hashtable element is created.
417
418    If the number of entries in the hashtable exceeds twice the size of
419    the hashtable slots a reorganization of the hashtable is done and
420    the utf symbols are copied to a new hashtable with doubled size.
421
422 *******************************************************************************/
423
424 utf *utf_new_intern(const char *text, u2 length);
425
426 utf *utf_new(const char *text, u2 length)
427 {
428     utf *r;
429
430 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
431     tables_lock();
432 #endif
433
434     r = utf_new_intern(text, length);
435
436 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
437     tables_unlock();
438 #endif
439
440     return r;
441 }
442
443
444 utf *utf_new_intern(const char *text, u2 length)
445 {
446         u4 key;                             /* hashkey computed from utf-text     */
447         u4 slot;                            /* slot in hashtable                  */
448         utf *u;                             /* hashtable element                  */
449         u2 i;
450
451 #ifdef STATISTICS
452         if (opt_stat)
453                 count_utf_new++;
454 #endif
455
456         key  = utf_hashkey(text, length);
457         slot = key & (utf_hash.size - 1);
458         u    = utf_hash.ptr[slot];
459
460         /* search external hash chain for utf-symbol */
461         while (u) {
462                 if (u->blength == length) {
463
464                         /* compare text of hashtable elements */
465                         for (i = 0; i < length; i++)
466                                 if (text[i] != u->text[i]) goto nomatch;
467                         
468 #ifdef STATISTICS
469                         if (opt_stat)
470                                 count_utf_new_found++;
471 #endif
472
473                         /* symbol found in hashtable */
474                         return u;
475                 }
476         nomatch:
477                 u = u->hashlink; /* next element in external chain */
478         }
479
480 #ifdef STATISTICS
481         if (opt_stat)
482                 count_utf_len += sizeof(utf) + length + 1;
483 #endif
484
485         /* location in hashtable found, create new utf element */
486         u = NEW(utf);
487         u->blength  = length;               /* length in bytes of utfstring       */
488         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
489         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
490         memcpy(u->text, text, length);      /* copy utf-text                      */
491         u->text[length] = '\0';
492         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
493
494         utf_hash.entries++;                 /* update number of entries           */
495
496         if (utf_hash.entries > (utf_hash.size * 2)) {
497
498         /* reorganization of hashtable, average length of 
499            the external chains is approx. 2                */  
500
501                 u4 i;
502                 utf *u;
503                 hashtable newhash; /* the new hashtable */
504
505                 /* create new hashtable, double the size */
506                 init_hashtable(&newhash, utf_hash.size * 2);
507                 newhash.entries = utf_hash.entries;
508
509 #ifdef STATISTICS
510                 if (opt_stat)
511                         count_utf_len += sizeof(utf*) * utf_hash.size;
512 #endif
513
514                 /* transfer elements to new hashtable */
515                 for (i = 0; i < utf_hash.size; i++) {
516                         u = (utf *) utf_hash.ptr[i];
517                         while (u) {
518                                 utf *nextu = u->hashlink;
519                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
520                                                 
521                                 u->hashlink = (utf *) newhash.ptr[slot];
522                                 newhash.ptr[slot] = u;
523
524                                 /* follow link in external hash chain */
525                                 u = nextu;
526                         }
527                 }
528         
529                 /* dispose old table */
530                 MFREE(utf_hash.ptr, void*, utf_hash.size);
531                 utf_hash = newhash;
532         }
533
534         return u;
535 }
536
537
538 /* utf_new_u2 ******************************************************************
539
540    Make utf symbol from u2 array, if isclassname is true '.' is
541    replaced by '/'.
542
543 *******************************************************************************/
544
545 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
546 {
547         char *buffer;                   /* memory buffer for  unicode characters  */
548         char *pos;                      /* pointer to current position in buffer  */
549         u4 left;                        /* unicode characters left                */
550         u4 buflength;                   /* utf length in bytes of the u2 array    */
551         utf *result;                    /* resulting utf-string                   */
552         int i;          
553
554         /* determine utf length in bytes and allocate memory */
555
556         buflength = u2_utflength(unicode_pos, unicode_length); 
557         buffer    = MNEW(char, buflength);
558  
559         left = buflength;
560         pos  = buffer;
561
562         for (i = 0; i++ < unicode_length; unicode_pos++) {
563                 /* next unicode character */
564                 u2 c = *unicode_pos;
565                 
566                 if ((c != 0) && (c < 0x80)) {
567                         /* 1 character */       
568                         left--;
569                 if ((int) left < 0) break;
570                         /* convert classname */
571                         if (isclassname && c == '.')
572                                 *pos++ = '/';
573                         else
574                                 *pos++ = (char) c;
575
576                 } else if (c < 0x800) {             
577                         /* 2 characters */                              
578                 unsigned char high = c >> 6;
579                 unsigned char low  = c & 0x3F;
580                         left = left - 2;
581                 if ((int) left < 0) break;
582                 *pos++ = high | 0xC0; 
583                 *pos++ = low  | 0x80;     
584
585                 } else {         
586                 /* 3 characters */                              
587                 char low  = c & 0x3f;
588                 char mid  = (c >> 6) & 0x3F;
589                 char high = c >> 12;
590                         left = left - 3;
591                 if ((int) left < 0) break;
592                 *pos++ = high | 0xE0; 
593                 *pos++ = mid  | 0x80;  
594                 *pos++ = low  | 0x80;   
595                 }
596         }
597         
598         /* insert utf-string into symbol-table */
599         result = utf_new(buffer,buflength);
600
601         MFREE(buffer, char, buflength);
602
603         return result;
604 }
605
606
607 /* utf_new_char ****************************************************************
608
609    Creates a new utf symbol, the text for this symbol is passed as a
610    c-string ( = char* ).
611
612 *******************************************************************************/
613
614 utf *utf_new_char(const char *text)
615 {
616         return utf_new(text, strlen(text));
617 }
618
619
620 /* utf_new_char_classname ******************************************************
621
622    Creates a new utf symbol, the text for this symbol is passed as a
623    c-string ( = char* ) "." characters are going to be replaced by
624    "/". Since the above function is used often, this is a separte
625    function, instead of an if.
626
627 *******************************************************************************/
628
629 utf *utf_new_char_classname(const char *text)
630 {
631         if (strchr(text, '.')) {
632                 char *txt = strdup(text);
633                 char *end = txt + strlen(txt);
634                 char *c;
635                 utf *tmpRes;
636
637                 for (c = txt; c < end; c++)
638                         if (*c == '.') *c = '/';
639
640                 tmpRes = utf_new(txt, strlen(txt));
641                 FREE(txt, 0);
642
643                 return tmpRes;
644
645         } else
646                 return utf_new(text, strlen(text));
647 }
648
649
/* utf_nextu2 ******************************************************************

   Reads the next unicode character from the utf text at *utf_ptr,
   advances *utf_ptr past it and returns the character.

   The high nibble of the first byte selects the encoding:

   - 0xC or 0xD: two bytes, if the second byte has the form 10xxxxxx;
   - 0xE:        three bytes, if the second and third byte have the
                 form 10xxxxxx;
   - otherwise:  the byte itself is returned and one byte is consumed.

   For a malformed sequence 0 is returned.  If only the third byte of a
   three byte sequence is malformed, two bytes are consumed; if the
   second byte is malformed, one byte (the lead byte) is consumed.
   Previously nothing was consumed in that case, so callers that loop
   until the pointer reaches the end of the text never terminated.

*******************************************************************************/

u2 utf_nextu2(char **utf_ptr)
{
    /* current position in utf text */
    unsigned char *bytes = (unsigned char *) (*utf_ptr);
    /* bytes representing the unicode character */
    unsigned char ch1, ch2, ch3;
    /* uncompressed unicode character, 0 for a malformed sequence */
    u2 unicode_char = 0;
    /* number of bytes consumed; a malformed lead byte is skipped */
    int len = 1;

    ch1 = bytes[0];

    switch (ch1 >> 4) {
    default: /* 1 byte */
        (*utf_ptr)++;
        return (u2) ch1;

    case 0xC:
    case 0xD: /* 2 bytes */
        ch2 = bytes[1];

        if ((ch2 & 0xC0) == 0x80) {
            unsigned char high = ch1 & 0x1F;
            unsigned char low  = ch2 & 0x3F;

            unicode_char = (high << 6) + low;
            len = 2;
        }
        break;

    case 0xE: /* 2 or 3 bytes */
        ch2 = bytes[1];

        if ((ch2 & 0xC0) == 0x80) {
            ch3 = bytes[2];

            if ((ch3 & 0xC0) == 0x80) {
                unsigned char low  = ch3 & 0x3f;
                unsigned char mid  = ch2 & 0x3f;
                unsigned char high = ch1 & 0x0f;

                unicode_char = (((high << 6) + mid) << 6) + low;
                len = 3;
            }
            else {
                /* third byte malformed: skip the two bytes read so far */
                len = 2;
            }
        }
        break;
    }

    /* update position in utf-text */
    *utf_ptr = (char *) (bytes + len);

    return unicode_char;
}
701
702
703 /* utf_strlen ******************************************************************
704
705    Determine number of unicode characters in the utf string.
706
707 *******************************************************************************/
708
709 u4 utf_strlen(utf *u)
710 {
711         char *endpos;                       /* points behind utf string           */
712         char *utf_ptr;                      /* current position in utf text       */
713         u4 len = 0;                         /* number of unicode characters       */
714
715         if (!u) {
716                 *exceptionptr = new_nullpointerexception();
717                 return 0;
718         }
719
720         endpos = UTF_END(u);
721         utf_ptr = u->text;
722
723         while (utf_ptr < endpos) {
724                 len++;
725                 /* next unicode character */
726                 utf_nextu2(&utf_ptr);
727         }
728
729         if (utf_ptr != endpos)
730                 /* string ended abruptly */
731                 throw_cacao_exception_exit(string_java_lang_InternalError,
732                                                                    "Illegal utf8 string");
733
734         return len;
735 }
736
737
/* u2_utflength ****************************************************************

   Returns the number of bytes needed to store the first `u2_length'
   characters of a u2 array as utf text: one byte for 0x0001..0x007F,
   two bytes for 0x0000 and 0x0080..0x07FF, three bytes otherwise.

*******************************************************************************/

u4 u2_utflength(u2 *text, u4 u2_length)
{
    u4 nbytes = 0;                      /* utf length in bytes                */
    u4 idx;

    for (idx = 0; idx < u2_length; idx++) {
        u2 ch = text[idx];              /* current unicode character          */

        if (ch == 0 || ch >= 0x80)
            nbytes += (ch < 0x800) ? 2 : 3;
        else
            nbytes += 1;
    }

    return nbytes;
}
765
766
767 /* utf_display *****************************************************************
768
769    Write utf symbol to stdout (for debugging purposes).
770
771 *******************************************************************************/
772
773 void utf_display(utf *u)
774 {
775         char *endpos;                       /* points behind utf string           */
776         char *utf_ptr;                      /* current position in utf text       */
777
778         if (!u) {
779                 printf("NULL");
780                 fflush(stdout);
781                 return;
782         }
783
784         endpos = UTF_END(u);
785         utf_ptr = u->text;
786
787         while (utf_ptr < endpos) {
788                 /* read next unicode character */                
789                 u2 c = utf_nextu2(&utf_ptr);
790                 if (c >= 32 && c <= 127) printf("%c", c);
791                 else printf("?");
792         }
793
794         fflush(stdout);
795 }
796
797
798 /* utf_display_classname *******************************************************
799
800    Write utf symbol to stdout with `/' converted to `.' (for debugging
801    purposes).
802
803 *******************************************************************************/
804
805 void utf_display_classname(utf *u)
806 {
807         char *endpos;                       /* points behind utf string           */
808         char *utf_ptr;                      /* current position in utf text       */
809
810         if (!u) {
811                 printf("NULL");
812                 fflush(stdout);
813                 return;
814         }
815
816         endpos = UTF_END(u);
817         utf_ptr = u->text;
818
819         while (utf_ptr < endpos) {
820                 /* read next unicode character */                
821                 u2 c = utf_nextu2(&utf_ptr);
822                 if (c == '/') c = '.';
823                 if (c >= 32 && c <= 127) printf("%c", c);
824                 else printf("?");
825         }
826
827         fflush(stdout);
828 }
829
830
831 /* utf_sprint ******************************************************************
832         
833    Write utf symbol into c-string (for debugging purposes).
834
835 *******************************************************************************/
836
837 void utf_sprint(char *buffer, utf *u)
838 {
839         char *endpos;                       /* points behind utf string           */
840         char *utf_ptr;                      /* current position in utf text       */
841         u2 pos = 0;                         /* position in c-string               */
842
843         if (!u) {
844                 strcpy(buffer, "NULL");
845                 return;
846         }
847
848         endpos = UTF_END(u);
849         utf_ptr = u->text;
850
851         while (utf_ptr < endpos) 
852                 /* copy next unicode character */       
853                 buffer[pos++] = utf_nextu2(&utf_ptr);
854
855         /* terminate string */
856         buffer[pos] = '\0';
857 }
858
859
860 /* utf_sprint_classname ********************************************************
861         
862    Write utf symbol into c-string with `/' converted to `.' (for debugging
863    purposes).
864
865 *******************************************************************************/
866
867 void utf_sprint_classname(char *buffer, utf *u)
868 {
869         char *endpos;                       /* points behind utf string           */
870         char *utf_ptr;                      /* current position in utf text       */
871         u2 pos = 0;                         /* position in c-string               */
872
873         if (!u) {
874                 strcpy(buffer, "NULL");
875                 return;
876         }
877
878         endpos = UTF_END(u);
879         utf_ptr = u->text;
880
881         while (utf_ptr < endpos) {
882                 /* copy next unicode character */       
883                 u2 c = utf_nextu2(&utf_ptr);
884                 if (c == '/') c = '.';
885                 buffer[pos++] = c;
886         }
887
888         /* terminate string */
889         buffer[pos] = '\0';
890 }
891
892
893 /* utf_strcat ******************************************************************
894         
895    Like libc strcat, but uses an utf8 string.
896
897 *******************************************************************************/
898
899 void utf_strcat(char *buffer, utf *u)
900 {
901         utf_sprint(buffer + strlen(buffer), u);
902 }
903
904
905 /* utf_strcat_classname ********************************************************
906         
907    Like libc strcat, but uses an utf8 string.
908
909 *******************************************************************************/
910
911 void utf_strcat_classname(char *buffer, utf *u)
912 {
913         utf_sprint_classname(buffer + strlen(buffer), u);
914 }
915
916
917 /* utf_fprint ******************************************************************
918         
919    Write utf symbol into file.
920
921 *******************************************************************************/
922
923 void utf_fprint(FILE *file, utf *u)
924 {
925         char *endpos;                       /* points behind utf string           */
926         char *utf_ptr;                      /* current position in utf text       */
927
928         if (!u)
929                 return;
930
931         endpos = UTF_END(u);
932         utf_ptr = u->text;
933
934         while (utf_ptr < endpos) { 
935                 /* read next unicode character */                
936                 u2 c = utf_nextu2(&utf_ptr);                            
937
938                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
939                 else fprintf(file, "?");
940         }
941 }
942
943
944 /* utf_fprint_classname ********************************************************
945         
946    Write utf symbol into file with `/' converted to `.'.
947
948 *******************************************************************************/
949
950 void utf_fprint_classname(FILE *file, utf *u)
951 {
952         char *endpos;                       /* points behind utf string           */
953         char *utf_ptr;                      /* current position in utf text       */
954
955     if (!u)
956                 return;
957
958         endpos = UTF_END(u);
959         utf_ptr = u->text;
960
961         while (utf_ptr < endpos) { 
962                 /* read next unicode character */                
963                 u2 c = utf_nextu2(&utf_ptr);                            
964                 if (c == '/') c = '.';
965
966                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
967                 else fprintf(file, "?");
968         }
969 }
970
971
/* is_valid_utf ****************************************************************

   Checks whether the byte range [utf_ptr, end_pos) is a valid UTF-8
   string in the sense accepted here.

   utf_ptr...points to first character
   end_pos...points after last character

   Accepted are non-zero ASCII bytes and two- or three-byte sequences
   whose trailing bytes all have the form 10xx xxxx. Longer sequences
   are rejected (Java limitation). The value zero is only accepted in
   its two-byte form (Java special).

   Deliberately NOT rejected, because Sun Java seems to allow them:
   overlong encodings, surrogates (0xd800..0xdfff) and the code points
   0xfffe and 0xffff.

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	unsigned char lead;                 /* first byte of the current sequence */
	unsigned char cont;                 /* current continuation byte          */
	unsigned long value;                /* decoded value of the sequence      */
	int           remaining;            /* bytes not yet consumed             */
	int           trailing;             /* continuation bytes expected        */
	int           k;

	if (end_pos < utf_ptr)
		return false;

	remaining = end_pos - utf_ptr;

	while (remaining > 0) {
		lead = (unsigned char) *utf_ptr++;
		remaining--;

		/* 0x00 is not allowed */
		if (lead == 0x00)
			return false;

		/* plain ASCII needs no further checks */
		if (lead < 0x80)
			continue;

		if ((lead & 0xe0) == 0xc0)          /* 110x xxxx */
			trailing = 1;
		else if ((lead & 0xf0) == 0xe0)     /* 1110 xxxx */
			trailing = 2;
		else
			/* stray continuation byte, invalid leading byte, or a     */
			/* sequence longer than three bytes (Java limitation)      */
			return false;

		/* payload bits of the leading byte */
		value = lead & (0x3f >> trailing);

		/* the sequence must fit into what is left of the string */
		remaining -= trailing;
		if (remaining < 0)
			return false;

		for (k = 0; k < trailing; k++) {
			cont = (unsigned char) *utf_ptr++;

			if ((cont & 0xc0) != 0x80)      /* 10xx xxxx */
				return false;

			value = (value << 6) | (cont & 0x3f);
		}

		/* Java special: zero must be written as a two-byte sequence */
		if (value == 0 && trailing != 1)
			return false;
	}

	return true;
}
1037
1038
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the two-byte encoding of zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

   No byte at or behind end_pos is ever read, even if the string ends
   with a 0xc0 byte.

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters */

		/* disallow zero; only peek at the following byte if it still */
		/* belongs to the string                                      */
		if (c == 0xc0 && utf_ptr < end_pos && (unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
1066
1067 bool is_valid_name_utf(utf *u)
1068 {
1069         return is_valid_name(u->text, UTF_END(u));
1070 }
1071
1072
1073 /* utf_show ********************************************************************
1074
1075    Writes the utf symbols in the utfhash to stdout and displays the
1076    number of external hash chains grouped according to the chainlength
1077    (for debugging purposes).
1078
1079 *******************************************************************************/
1080
1081 void utf_show(void)
1082 {
1083
1084 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1085
1086         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1087         u4 max_chainlength = 0;      /* maximum length of the chains */
1088         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1089         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1090         u4 i;
1091
1092         printf ("UTF-HASH:\n");
1093
1094         /* show element of utf-hashtable */
1095         for (i=0; i<utf_hash.size; i++) {
1096                 utf *u = utf_hash.ptr[i];
1097                 if (u) {
1098                         printf ("SLOT %d: ", (int) i);
1099                         while (u) {
1100                                 printf ("'");
1101                                 utf_display (u);
1102                                 printf ("' ");
1103                                 u = u->hashlink;
1104                         }       
1105                         printf ("\n");
1106                 }
1107                 
1108         }
1109
1110         printf ("UTF-HASH: %d slots for %d entries\n", 
1111                         (int) utf_hash.size, (int) utf_hash.entries );
1112
1113
1114         if (utf_hash.entries == 0)
1115                 return;
1116
1117         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1118
1119         for (i=0;i<CHAIN_LIMIT;i++)
1120                 chain_count[i]=0;
1121
1122         /* count numbers of hashchains according to their length */
1123         for (i=0; i<utf_hash.size; i++) {
1124                   
1125                 utf *u = (utf*) utf_hash.ptr[i];
1126                 u4 chain_length = 0;
1127
1128                 /* determine chainlength */
1129                 while (u) {
1130                         u = u->hashlink;
1131                         chain_length++;
1132                 }
1133
1134                 /* update sum of all chainlengths */
1135                 sum_chainlength+=chain_length;
1136
1137                 /* determine the maximum length of the chains */
1138                 if (chain_length>max_chainlength)
1139                         max_chainlength = chain_length;
1140
1141                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1142                 if (chain_length>=CHAIN_LIMIT) {
1143                         beyond_limit+=chain_length;
1144                         chain_length=CHAIN_LIMIT-1;
1145                 }
1146
1147                 /* update number of hashchains of current length */
1148                 chain_count[chain_length]++;
1149         }
1150
1151         /* display results */  
1152         for (i=1;i<CHAIN_LIMIT-1;i++) 
1153                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1154           
1155         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1156
1157
1158         printf("max. chainlength:%5d\n",max_chainlength);
1159
1160         /* avg. chainlength = sum of chainlengths / number of chains */
1161         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1162 }
1163
1164
1165 /*
1166  * These are local overrides for various environment variables in Emacs.
1167  * Please do not remove this and leave it at the end of the file, where
1168  * Emacs will automagically detect them.
1169  * ---------------------------------------------------------------------
1170  * Local variables:
1171  * mode: c
1172  * indent-tabs-mode: t
1173  * c-basic-offset: 4
1174  * tab-width: 4
1175  * End:
1176  */