* utf_java_lang_IllegalMonitorStateException: Added.
[cacao.git] / src / vm / utf8.c
/* src/vm/utf8.c - utf functions
2
3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
6    Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23    02111-1307, USA.
24
25    Contact: cacao@complang.tuwien.ac.at
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 3541 2005-11-03 20:33:51Z twisti $
34
35 */
36
37
38 #include <string.h>
39
40 #include "mm/memory.h"
41 #include "vm/exceptions.h"
42 #include "vm/options.h"
43 #include "vm/statistics.h"
44 #include "vm/stringlocal.h"
45 #include "vm/tables.h"
46 #include "vm/utf8.h"
47
48
49 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
50
51
52 /* utf-symbols for pointer comparison of frequently used strings **************/
53
54 utf *utf_java_lang_Object;              /* java/lang/Object                   */
55
56 utf *utf_java_lang_Class;
57 utf *utf_java_lang_ClassLoader;
58 utf *utf_java_lang_Cloneable;
59 utf *utf_java_lang_SecurityManager;
60 utf *utf_java_lang_String;
61 utf *utf_java_lang_System;
62 utf *utf_java_lang_ThreadGroup;
63 utf *utf_java_io_Serializable;
64
65 utf *utf_java_lang_Throwable;
66 utf *utf_java_lang_VMThrowable;
67 utf *utf_java_lang_Error;
68 utf *utf_java_lang_NoClassDefFoundError;
69 utf *utf_java_lang_OutOfMemoryError;
70
71 utf *utf_java_lang_Exception;
72 utf *utf_java_lang_ClassNotFoundException;
73 utf *utf_java_lang_IllegalArgumentException;
74 utf *utf_java_lang_IllegalMonitorStateException;
75
76 utf *utf_java_lang_NullPointerException;
77
78 utf* utf_java_lang_Void;
79 utf* utf_java_lang_Boolean;
80 utf* utf_java_lang_Byte;
81 utf* utf_java_lang_Character;
82 utf* utf_java_lang_Short;
83 utf* utf_java_lang_Integer;
84 utf* utf_java_lang_Long;
85 utf* utf_java_lang_Float;
86 utf* utf_java_lang_Double;
87
88 utf *utf_java_lang_StackTraceElement;
89 utf *utf_java_lang_reflect_Constructor;
90 utf *utf_java_lang_reflect_Field;
91 utf *utf_java_lang_reflect_Method;
92 utf *utf_java_util_Vector;
93
94 utf *utf_InnerClasses;                  /* InnerClasses                       */
95 utf *utf_ConstantValue;                 /* ConstantValue                      */
96 utf *utf_Code;                          /* Code                               */
97 utf *utf_Exceptions;                    /* Exceptions                         */
98 utf *utf_LineNumberTable;               /* LineNumberTable                    */
99 utf *utf_SourceFile;                    /* SourceFile                         */
100
101 utf *utf_init;                          /* <init>                             */
102 utf *utf_clinit;                        /* <clinit>                           */
103 utf *utf_clone;                         /* clone                              */
104 utf *utf_finalize;                      /* finalize                           */
105 utf *utf_run;                           /* run                                */
106
107 utf *utf_add;                           /* add                                */
108 utf *utf_remove;                        /* remove                             */
109 utf *utf_put;                           /* put                                */
110 utf *utf_get;                           /* get                                */
111 utf *utf_value;                         /* value                              */
112
113 utf *utf_fillInStackTrace;
114 utf *utf_getSystemClassLoader;
115 utf *utf_loadClass;
116 utf *utf_printStackTrace;
117
118 utf *utf_Z;                             /* Z                                  */
119 utf *utf_B;                             /* B                                  */
120 utf *utf_C;                             /* C                                  */
121 utf *utf_S;                             /* S                                  */
122 utf *utf_I;                             /* I                                  */
123 utf *utf_J;                             /* J                                  */
124 utf *utf_F;                             /* F                                  */
125 utf *utf_D;                             /* D                                  */
126
127 utf *utf_void__void;                    /* ()V                                */
128 utf *utf_boolean__void;                 /* (Z)V                               */
129 utf *utf_byte__void;                    /* (B)V                               */
130 utf *utf_char__void;                    /* (C)V                               */
131 utf *utf_short__void;                   /* (S)V                               */
132 utf *utf_int__void;                     /* (I)V                               */
133 utf *utf_long__void;                    /* (J)V                               */
134 utf *utf_float__void;                   /* (F)V                               */
135 utf *utf_double__void;                  /* (D)V                               */
136
137 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
138 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
139 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
140 utf *utf_java_lang_Object__java_lang_Object;
141 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
142 utf *utf_java_lang_String__java_lang_Class;
143 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
144
145 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
146
147 utf *array_packagename;
148
149
150 /* utf_init ********************************************************************
151
152    Initializes the utf8 subsystem.
153
154 *******************************************************************************/
155
156 void utf8_init(void)
157 {
158         /* create utf-symbols for pointer comparison of frequently used strings */
159
160         utf_java_lang_Object           = utf_new_char("java/lang/Object");
161
162         utf_java_lang_Class            = utf_new_char("java/lang/Class");
163         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
164         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
165         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
166         utf_java_lang_String           = utf_new_char("java/lang/String");
167         utf_java_lang_System           = utf_new_char("java/lang/System");
168         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
169         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
170
171         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
172         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
173         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
174
175         utf_java_lang_NoClassDefFoundError =
176                 utf_new_char(string_java_lang_NoClassDefFoundError);
177
178         utf_java_lang_OutOfMemoryError =
179                 utf_new_char(string_java_lang_OutOfMemoryError);
180
181         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
182
183         utf_java_lang_ClassNotFoundException =
184                 utf_new_char(string_java_lang_ClassNotFoundException);
185
186         utf_java_lang_IllegalArgumentException =
187                 utf_new_char(string_java_lang_IllegalArgumentException);
188
189         utf_java_lang_IllegalMonitorStateException =
190                 utf_new_char(string_java_lang_IllegalMonitorStateException);
191
192         utf_java_lang_NullPointerException =
193                 utf_new_char(string_java_lang_NullPointerException);
194
195         utf_java_lang_Void             = utf_new_char("java/lang/Void");
196         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
197         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
198         utf_java_lang_Character        = utf_new_char("java/lang/Character");
199         utf_java_lang_Short            = utf_new_char("java/lang/Short");
200         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
201         utf_java_lang_Long             = utf_new_char("java/lang/Long");
202         utf_java_lang_Float            = utf_new_char("java/lang/Float");
203         utf_java_lang_Double           = utf_new_char("java/lang/Double");
204
205         utf_java_lang_StackTraceElement =
206                 utf_new_char("java/lang/StackTraceElement");
207
208         utf_java_lang_reflect_Constructor =
209                 utf_new_char("java/lang/reflect/Constructor");
210
211         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
212         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
213         utf_java_util_Vector           = utf_new_char("java/util/Vector");
214
215         utf_InnerClasses               = utf_new_char("InnerClasses");
216         utf_ConstantValue              = utf_new_char("ConstantValue");
217         utf_Code                       = utf_new_char("Code");
218         utf_Exceptions                 = utf_new_char("Exceptions");
219         utf_LineNumberTable            = utf_new_char("LineNumberTable");
220         utf_SourceFile                 = utf_new_char("SourceFile");
221
222         utf_init                           = utf_new_char("<init>");
223         utf_clinit                         = utf_new_char("<clinit>");
224         utf_clone                      = utf_new_char("clone");
225         utf_finalize                   = utf_new_char("finalize");
226         utf_run                        = utf_new_char("run");
227
228         utf_add                        = utf_new_char("add");
229         utf_remove                     = utf_new_char("remove");
230         utf_put                        = utf_new_char("put");
231         utf_get                        = utf_new_char("get");
232         utf_value                      = utf_new_char("value");
233
234         utf_printStackTrace            = utf_new_char("printStackTrace");
235         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
236         utf_loadClass                  = utf_new_char("loadClass");
237         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
238
239         utf_Z                          = utf_new_char("Z");
240         utf_B                          = utf_new_char("B");
241         utf_C                          = utf_new_char("C");
242         utf_S                          = utf_new_char("S");
243         utf_I                          = utf_new_char("I");
244         utf_J                          = utf_new_char("J");
245         utf_F                          = utf_new_char("F");
246         utf_D                          = utf_new_char("D");
247
248         utf_void__void                 = utf_new_char("()V");
249         utf_boolean__void              = utf_new_char("(Z)V");
250         utf_byte__void                 = utf_new_char("(B)V");
251         utf_char__void                 = utf_new_char("(C)V");
252         utf_short__void                = utf_new_char("(S)V");
253         utf_int__void                  = utf_new_char("(I)V");
254         utf_long__void                 = utf_new_char("(J)V");
255         utf_float__void                = utf_new_char("(F)V");
256         utf_double__void               = utf_new_char("(D)V");
257         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
258         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
259
260         utf_void__java_lang_ClassLoader =
261                 utf_new_char("()Ljava/lang/ClassLoader;");
262
263         utf_java_lang_Object__java_lang_Object =
264                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
265
266         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
267
268         utf_java_lang_String__java_lang_Class =
269                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
270
271         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
272
273         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
274
275         array_packagename              = utf_new_char("\t<the array package>");
276 }
277
278
279 /* utf_hashkey *****************************************************************
280
281    The hashkey is computed from the utf-text by using up to 8
282    characters.  For utf-symbols longer than 15 characters 3 characters
283    are taken from the beginning and the end, 2 characters are taken
284    from the middle.
285
286 *******************************************************************************/
287
288 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
289 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
290
291 u4 utf_hashkey(const char *text, u4 length)
292 {
293         const char *start_pos = text;       /* pointer to utf text                */
294         u4 a;
295
296         switch (length) {
297         case 0: /* empty string */
298                 return 0;
299
300         case 1: return fbs(0);
301         case 2: return fbs(0) ^ nbs(3);
302         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
303         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
304         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
305         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
306         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
307         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
308
309         case 9:
310                 a = fbs(0);
311                 a ^= nbs(1);
312                 a ^= nbs(2);
313                 text++;
314                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
315
316         case 10:
317                 a = fbs(0);
318                 text++;
319                 a ^= nbs(2);
320                 a ^= nbs(3);
321                 a ^= nbs(4);
322                 text++;
323                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
324
325         case 11:
326                 a = fbs(0);
327                 text++;
328                 a ^= nbs(2);
329                 a ^= nbs(3);
330                 a ^= nbs(4);
331                 text++;
332                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
333
334         case 12:
335                 a = fbs(0);
336                 text += 2;
337                 a ^= nbs(2);
338                 a ^= nbs(3);
339                 text++;
340                 a ^= nbs(5);
341                 a ^= nbs(6);
342                 a ^= nbs(7);
343                 text++;
344                 return a ^ nbs(9) ^ nbs(10);
345
346         case 13:
347                 a = fbs(0);
348                 a ^= nbs(1);
349                 text++;
350                 a ^= nbs(3);
351                 a ^= nbs(4);
352                 text += 2;      
353                 a ^= nbs(7);
354                 a ^= nbs(8);
355                 text += 2;
356                 return a ^ nbs(9) ^ nbs(10);
357
358         case 14:
359                 a = fbs(0);
360                 text += 2;      
361                 a ^= nbs(3);
362                 a ^= nbs(4);
363                 text += 2;      
364                 a ^= nbs(7);
365                 a ^= nbs(8);
366                 text += 2;
367                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
368
369         case 15:
370                 a = fbs(0);
371                 text += 2;      
372                 a ^= nbs(3);
373                 a ^= nbs(4);
374                 text += 2;      
375                 a ^= nbs(7);
376                 a ^= nbs(8);
377                 text += 2;
378                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
379
380         default:  /* 3 characters from beginning */
381                 a = fbs(0);
382                 text += 2;
383                 a ^= nbs(3);
384                 a ^= nbs(4);
385
386                 /* 2 characters from middle */
387                 text = start_pos + (length / 2);
388                 a ^= fbs(5);
389                 text += 2;
390                 a ^= nbs(6);    
391
392                 /* 3 characters from end */
393                 text = start_pos + length - 4;
394
395                 a ^= fbs(7);
396                 text++;
397
398                 return a ^ nbs(10) ^ nbs(11);
399     }
400 }
401
402
403 /* utf_hashkey *****************************************************************
404
405    Compute the hashkey of a unicode string.
406
407 *******************************************************************************/
408
409 u4 unicode_hashkey(u2 *text, u2 len)
410 {
411         return utf_hashkey((char *) text, len);
412 }
413
414
415 /* utf_new *********************************************************************
416
417    Creates a new utf-symbol, the text of the symbol is passed as a
418    u1-array. The function searches the utf-hashtable for a utf-symbol
419    with this text. On success the element returned, otherwise a new
420    hashtable element is created.
421
422    If the number of entries in the hashtable exceeds twice the size of
423    the hashtable slots a reorganization of the hashtable is done and
424    the utf symbols are copied to a new hashtable with doubled size.
425
426 *******************************************************************************/
427
428 utf *utf_new_intern(const char *text, u2 length);
429
430 utf *utf_new(const char *text, u2 length)
431 {
432     utf *r;
433
434 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
435     tables_lock();
436 #endif
437
438     r = utf_new_intern(text, length);
439
440 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
441     tables_unlock();
442 #endif
443
444     return r;
445 }
446
447
448 utf *utf_new_intern(const char *text, u2 length)
449 {
450         u4 key;                             /* hashkey computed from utf-text     */
451         u4 slot;                            /* slot in hashtable                  */
452         utf *u;                             /* hashtable element                  */
453         u2 i;
454
455 #ifdef STATISTICS
456         if (opt_stat)
457                 count_utf_new++;
458 #endif
459
460         key  = utf_hashkey(text, length);
461         slot = key & (utf_hash.size - 1);
462         u    = utf_hash.ptr[slot];
463
464         /* search external hash chain for utf-symbol */
465         while (u) {
466                 if (u->blength == length) {
467
468                         /* compare text of hashtable elements */
469                         for (i = 0; i < length; i++)
470                                 if (text[i] != u->text[i]) goto nomatch;
471                         
472 #ifdef STATISTICS
473                         if (opt_stat)
474                                 count_utf_new_found++;
475 #endif
476
477                         /* symbol found in hashtable */
478                         return u;
479                 }
480         nomatch:
481                 u = u->hashlink; /* next element in external chain */
482         }
483
484 #ifdef STATISTICS
485         if (opt_stat)
486                 count_utf_len += sizeof(utf) + length + 1;
487 #endif
488
489         /* location in hashtable found, create new utf element */
490         u = NEW(utf);
491         u->blength  = length;               /* length in bytes of utfstring       */
492         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
493         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
494         memcpy(u->text, text, length);      /* copy utf-text                      */
495         u->text[length] = '\0';
496         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
497
498         utf_hash.entries++;                 /* update number of entries           */
499
500         if (utf_hash.entries > (utf_hash.size * 2)) {
501
502         /* reorganization of hashtable, average length of 
503            the external chains is approx. 2                */  
504
505                 u4 i;
506                 utf *u;
507                 hashtable newhash; /* the new hashtable */
508
509                 /* create new hashtable, double the size */
510                 init_hashtable(&newhash, utf_hash.size * 2);
511                 newhash.entries = utf_hash.entries;
512
513 #ifdef STATISTICS
514                 if (opt_stat)
515                         count_utf_len += sizeof(utf*) * utf_hash.size;
516 #endif
517
518                 /* transfer elements to new hashtable */
519                 for (i = 0; i < utf_hash.size; i++) {
520                         u = (utf *) utf_hash.ptr[i];
521                         while (u) {
522                                 utf *nextu = u->hashlink;
523                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
524                                                 
525                                 u->hashlink = (utf *) newhash.ptr[slot];
526                                 newhash.ptr[slot] = u;
527
528                                 /* follow link in external hash chain */
529                                 u = nextu;
530                         }
531                 }
532         
533                 /* dispose old table */
534                 MFREE(utf_hash.ptr, void*, utf_hash.size);
535                 utf_hash = newhash;
536         }
537
538         return u;
539 }
540
541
542 /* utf_new_u2 ******************************************************************
543
544    Make utf symbol from u2 array, if isclassname is true '.' is
545    replaced by '/'.
546
547 *******************************************************************************/
548
549 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
550 {
551         char *buffer;                   /* memory buffer for  unicode characters  */
552         char *pos;                      /* pointer to current position in buffer  */
553         u4 left;                        /* unicode characters left                */
554         u4 buflength;                   /* utf length in bytes of the u2 array    */
555         utf *result;                    /* resulting utf-string                   */
556         int i;          
557
558         /* determine utf length in bytes and allocate memory */
559
560         buflength = u2_utflength(unicode_pos, unicode_length); 
561         buffer    = MNEW(char, buflength);
562  
563         left = buflength;
564         pos  = buffer;
565
566         for (i = 0; i++ < unicode_length; unicode_pos++) {
567                 /* next unicode character */
568                 u2 c = *unicode_pos;
569                 
570                 if ((c != 0) && (c < 0x80)) {
571                         /* 1 character */       
572                         left--;
573                 if ((int) left < 0) break;
574                         /* convert classname */
575                         if (isclassname && c == '.')
576                                 *pos++ = '/';
577                         else
578                                 *pos++ = (char) c;
579
580                 } else if (c < 0x800) {             
581                         /* 2 characters */                              
582                 unsigned char high = c >> 6;
583                 unsigned char low  = c & 0x3F;
584                         left = left - 2;
585                 if ((int) left < 0) break;
586                 *pos++ = high | 0xC0; 
587                 *pos++ = low  | 0x80;     
588
589                 } else {         
590                 /* 3 characters */                              
591                 char low  = c & 0x3f;
592                 char mid  = (c >> 6) & 0x3F;
593                 char high = c >> 12;
594                         left = left - 3;
595                 if ((int) left < 0) break;
596                 *pos++ = high | 0xE0; 
597                 *pos++ = mid  | 0x80;  
598                 *pos++ = low  | 0x80;   
599                 }
600         }
601         
602         /* insert utf-string into symbol-table */
603         result = utf_new(buffer,buflength);
604
605         MFREE(buffer, char, buflength);
606
607         return result;
608 }
609
610
611 /* utf_new_char ****************************************************************
612
613    Creates a new utf symbol, the text for this symbol is passed as a
614    c-string ( = char* ).
615
616 *******************************************************************************/
617
618 utf *utf_new_char(const char *text)
619 {
620         return utf_new(text, strlen(text));
621 }
622
623
624 /* utf_new_char_classname ******************************************************
625
626    Creates a new utf symbol, the text for this symbol is passed as a
627    c-string ( = char* ) "." characters are going to be replaced by
628    "/". Since the above function is used often, this is a separte
629    function, instead of an if.
630
631 *******************************************************************************/
632
633 utf *utf_new_char_classname(const char *text)
634 {
635         if (strchr(text, '.')) {
636                 char *txt = strdup(text);
637                 char *end = txt + strlen(txt);
638                 char *c;
639                 utf *tmpRes;
640
641                 for (c = txt; c < end; c++)
642                         if (*c == '.') *c = '/';
643
644                 tmpRes = utf_new(txt, strlen(txt));
645                 FREE(txt, 0);
646
647                 return tmpRes;
648
649         } else
650                 return utf_new(text, strlen(text));
651 }
652
653
654 /* utf_nextu2 ******************************************************************
655
656    Read the next unicode character from the utf string and increment
657    the utf-string pointer accordingly.
658
659 *******************************************************************************/
660
661 u2 utf_nextu2(char **utf_ptr)
662 {
663     /* uncompressed unicode character */
664     u2 unicode_char = 0;
665     /* current position in utf text */  
666     unsigned char *utf = (unsigned char *) (*utf_ptr);
667     /* bytes representing the unicode character */
668     unsigned char ch1, ch2, ch3;
669     /* number of bytes used to represent the unicode character */
670     int len = 0;
671         
672     switch ((ch1 = utf[0]) >> 4) {
673         default: /* 1 byte */
674                 (*utf_ptr)++;
675                 return (u2) ch1;
676         case 0xC: 
677         case 0xD: /* 2 bytes */
678                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
679                         unsigned char high = ch1 & 0x1F;
680                         unsigned char low  = ch2 & 0x3F;
681                         unicode_char = (high << 6) + low;
682                         len = 2;
683                 }
684                 break;
685
686         case 0xE: /* 2 or 3 bytes */
687                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
688                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
689                                 unsigned char low  = ch3 & 0x3f;
690                                 unsigned char mid  = ch2 & 0x3f;
691                                 unsigned char high = ch1 & 0x0f;
692                                 unicode_char = (((high << 6) + mid) << 6) + low;
693                                 len = 3;
694                         } else
695                                 len = 2;                                           
696                 }
697                 break;
698     }
699
700     /* update position in utf-text */
701     *utf_ptr = (char *) (utf + len);
702
703     return unicode_char;
704 }
705
706
707 /* utf_strlen ******************************************************************
708
709    Determine number of unicode characters in the utf string.
710
711 *******************************************************************************/
712
713 u4 utf_strlen(utf *u)
714 {
715         char *endpos;                       /* points behind utf string           */
716         char *utf_ptr;                      /* current position in utf text       */
717         u4 len = 0;                         /* number of unicode characters       */
718
719         if (!u) {
720                 *exceptionptr = new_nullpointerexception();
721                 return 0;
722         }
723
724         endpos = UTF_END(u);
725         utf_ptr = u->text;
726
727         while (utf_ptr < endpos) {
728                 len++;
729                 /* next unicode character */
730                 utf_nextu2(&utf_ptr);
731         }
732
733         if (utf_ptr != endpos)
734                 /* string ended abruptly */
735                 throw_cacao_exception_exit(string_java_lang_InternalError,
736                                                                    "Illegal utf8 string");
737
738         return len;
739 }
740
741
742 /* u2_utflength ****************************************************************
743
744    Returns the utf length in bytes of a u2 array.
745
746 *******************************************************************************/
747
748 u4 u2_utflength(u2 *text, u4 u2_length)
749 {
750         u4 result_len = 0;                  /* utf length in bytes                */
751         u2 ch;                              /* current unicode character          */
752         u4 len;
753         
754         for (len = 0; len < u2_length; len++) {
755                 /* next unicode character */
756                 ch = *text++;
757           
758                 /* determine bytes required to store unicode character as utf */
759                 if (ch && (ch < 0x80)) 
760                         result_len++;
761                 else if (ch < 0x800)
762                         result_len += 2;        
763                 else 
764                         result_len += 3;        
765         }
766
767     return result_len;
768 }
769
770
771 /* utf_display *****************************************************************
772
773    Write utf symbol to stdout (for debugging purposes).
774
775 *******************************************************************************/
776
777 void utf_display(utf *u)
778 {
779         char *endpos;                       /* points behind utf string           */
780         char *utf_ptr;                      /* current position in utf text       */
781
782         if (!u) {
783                 printf("NULL");
784                 fflush(stdout);
785                 return;
786         }
787
788         endpos = UTF_END(u);
789         utf_ptr = u->text;
790
791         while (utf_ptr < endpos) {
792                 /* read next unicode character */                
793                 u2 c = utf_nextu2(&utf_ptr);
794                 if (c >= 32 && c <= 127) printf("%c", c);
795                 else printf("?");
796         }
797
798         fflush(stdout);
799 }
800
801
802 /* utf_display_classname *******************************************************
803
804    Write utf symbol to stdout with `/' converted to `.' (for debugging
805    purposes).
806
807 *******************************************************************************/
808
809 void utf_display_classname(utf *u)
810 {
811         char *endpos;                       /* points behind utf string           */
812         char *utf_ptr;                      /* current position in utf text       */
813
814         if (!u) {
815                 printf("NULL");
816                 fflush(stdout);
817                 return;
818         }
819
820         endpos = UTF_END(u);
821         utf_ptr = u->text;
822
823         while (utf_ptr < endpos) {
824                 /* read next unicode character */                
825                 u2 c = utf_nextu2(&utf_ptr);
826                 if (c == '/') c = '.';
827                 if (c >= 32 && c <= 127) printf("%c", c);
828                 else printf("?");
829         }
830
831         fflush(stdout);
832 }
833
834
835 /* utf_sprint ******************************************************************
836         
837    Write utf symbol into c-string (for debugging purposes).
838
839 *******************************************************************************/
840
841 void utf_sprint(char *buffer, utf *u)
842 {
843         char *endpos;                       /* points behind utf string           */
844         char *utf_ptr;                      /* current position in utf text       */
845         u2 pos = 0;                         /* position in c-string               */
846
847         if (!u) {
848                 strcpy(buffer, "NULL");
849                 return;
850         }
851
852         endpos = UTF_END(u);
853         utf_ptr = u->text;
854
855         while (utf_ptr < endpos) 
856                 /* copy next unicode character */       
857                 buffer[pos++] = utf_nextu2(&utf_ptr);
858
859         /* terminate string */
860         buffer[pos] = '\0';
861 }
862
863
864 /* utf_sprint_classname ********************************************************
865         
866    Write utf symbol into c-string with `/' converted to `.' (for debugging
867    purposes).
868
869 *******************************************************************************/
870
871 void utf_sprint_classname(char *buffer, utf *u)
872 {
873         char *endpos;                       /* points behind utf string           */
874         char *utf_ptr;                      /* current position in utf text       */
875         u2 pos = 0;                         /* position in c-string               */
876
877         if (!u) {
878                 strcpy(buffer, "NULL");
879                 return;
880         }
881
882         endpos = UTF_END(u);
883         utf_ptr = u->text;
884
885         while (utf_ptr < endpos) {
886                 /* copy next unicode character */       
887                 u2 c = utf_nextu2(&utf_ptr);
888                 if (c == '/') c = '.';
889                 buffer[pos++] = c;
890         }
891
892         /* terminate string */
893         buffer[pos] = '\0';
894 }
895
896
897 /* utf_strcat ******************************************************************
898         
899    Like libc strcat, but uses an utf8 string.
900
901 *******************************************************************************/
902
903 void utf_strcat(char *buffer, utf *u)
904 {
905         utf_sprint(buffer + strlen(buffer), u);
906 }
907
908
909 /* utf_strcat_classname ********************************************************
910         
911    Like libc strcat, but uses an utf8 string.
912
913 *******************************************************************************/
914
915 void utf_strcat_classname(char *buffer, utf *u)
916 {
917         utf_sprint_classname(buffer + strlen(buffer), u);
918 }
919
920
921 /* utf_fprint ******************************************************************
922         
923    Write utf symbol into file.
924
925 *******************************************************************************/
926
927 void utf_fprint(FILE *file, utf *u)
928 {
929         char *endpos;                       /* points behind utf string           */
930         char *utf_ptr;                      /* current position in utf text       */
931
932         if (!u)
933                 return;
934
935         endpos = UTF_END(u);
936         utf_ptr = u->text;
937
938         while (utf_ptr < endpos) { 
939                 /* read next unicode character */                
940                 u2 c = utf_nextu2(&utf_ptr);                            
941
942                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
943                 else fprintf(file, "?");
944         }
945 }
946
947
948 /* utf_fprint_classname ********************************************************
949         
950    Write utf symbol into file with `/' converted to `.'.
951
952 *******************************************************************************/
953
954 void utf_fprint_classname(FILE *file, utf *u)
955 {
956         char *endpos;                       /* points behind utf string           */
957         char *utf_ptr;                      /* current position in utf text       */
958
959     if (!u)
960                 return;
961
962         endpos = UTF_END(u);
963         utf_ptr = u->text;
964
965         while (utf_ptr < endpos) { 
966                 /* read next unicode character */                
967                 u2 c = utf_nextu2(&utf_ptr);                            
968                 if (c == '/') c = '.';
969
970                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
971                 else fprintf(file, "?");
972         }
973 }
974
975
/* is_valid_utf ****************************************************************

   Checks whether the bytes in [utf_ptr, end_pos) form a string that
   this VM accepts as UTF-8. Returns true if they do, false otherwise.

   utf_ptr...points to first character
   end_pos...points after last character

   Accepted are:
     - single bytes 0x01..0x7f,
     - two byte sequences  110x xxxx  10xx xxxx,
     - three byte sequences  1110 xxxx  10xx xxxx  10xx xxxx.

   Rejected are:
     - end_pos lying before utf_ptr,
     - a 0x00 byte,
     - leading bytes that announce more than three bytes or that are
       no leading bytes at all,
     - sequences cut off by the end of the string,
     - following bytes that are not of the form 10xx xxxx,
     - the value zero encoded with three bytes (only the two byte form
       is permitted for zero).

   Deliberately NOT rejected, because Java classfiles appear to contain
   them: overlong encodings of non-zero values, surrogates
   (0xd800..0xdfff) and the code points 0xfffe and 0xffff.

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int remaining;                      /* bytes not yet consumed             */

	if (end_pos < utf_ptr)
		return false;

	remaining = end_pos - utf_ptr;

	while (remaining > 0) {
		unsigned char lead = (unsigned char) *utf_ptr++;
		unsigned long value;            /* decoded character                  */
		int           trail;            /* number of following bytes          */
		int           k;

		remaining--;

		if (lead == 0x00)
			return false;               /* 0x00 is not allowed */

		if (lead < 0x80)
			continue;                   /* ASCII */

		if ((lead & 0xe0) == 0xc0) {    /* 110x xxxx */
			trail = 1;
			value = lead & 0x1f;
		}
		else if ((lead & 0xf0) == 0xe0) {   /* 1110 xxxx */
			trail = 2;
			value = lead & 0x0f;
		}
		else {
			/* either no leading byte at all, or a sequence longer than
			   the three bytes Java permits */
			return false;
		}

		if (remaining < trail)
			return false;               /* missing bytes */

		remaining -= trail;

		for (k = 0; k < trail; k++) {
			unsigned char next = (unsigned char) *utf_ptr++;

			if ((next & 0xc0) != 0x80)  /* 10xx xxxx */
				return false;

			value = (value << 6) | (next & 0x3f);
		}

		/* Java special: zero must use the two byte form */
		if (value == 0 && trail != 1)
			return false;
	}

	return true;
}
1041
1042
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the two byte encoding of zero.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

   Only bytes inside [utf_ptr, end_pos) are examined; a 0xc0 byte at the
   very end of the string is not followed up beyond end_pos.

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters */

		/* disallow zero (encoded as 0xc0 0x80); the look-ahead is only
		   done while it stays inside the string */
		if (c == 0xc0 && utf_ptr < end_pos && (unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
1070
1071 bool is_valid_name_utf(utf *u)
1072 {
1073         return is_valid_name(u->text, UTF_END(u));
1074 }
1075
1076
1077 /* utf_show ********************************************************************
1078
1079    Writes the utf symbols in the utfhash to stdout and displays the
1080    number of external hash chains grouped according to the chainlength
1081    (for debugging purposes).
1082
1083 *******************************************************************************/
1084
1085 void utf_show(void)
1086 {
1087
1088 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1089
1090         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1091         u4 max_chainlength = 0;      /* maximum length of the chains */
1092         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1093         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1094         u4 i;
1095
1096         printf ("UTF-HASH:\n");
1097
1098         /* show element of utf-hashtable */
1099         for (i=0; i<utf_hash.size; i++) {
1100                 utf *u = utf_hash.ptr[i];
1101                 if (u) {
1102                         printf ("SLOT %d: ", (int) i);
1103                         while (u) {
1104                                 printf ("'");
1105                                 utf_display (u);
1106                                 printf ("' ");
1107                                 u = u->hashlink;
1108                         }       
1109                         printf ("\n");
1110                 }
1111                 
1112         }
1113
1114         printf ("UTF-HASH: %d slots for %d entries\n", 
1115                         (int) utf_hash.size, (int) utf_hash.entries );
1116
1117
1118         if (utf_hash.entries == 0)
1119                 return;
1120
1121         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1122
1123         for (i=0;i<CHAIN_LIMIT;i++)
1124                 chain_count[i]=0;
1125
1126         /* count numbers of hashchains according to their length */
1127         for (i=0; i<utf_hash.size; i++) {
1128                   
1129                 utf *u = (utf*) utf_hash.ptr[i];
1130                 u4 chain_length = 0;
1131
1132                 /* determine chainlength */
1133                 while (u) {
1134                         u = u->hashlink;
1135                         chain_length++;
1136                 }
1137
1138                 /* update sum of all chainlengths */
1139                 sum_chainlength+=chain_length;
1140
1141                 /* determine the maximum length of the chains */
1142                 if (chain_length>max_chainlength)
1143                         max_chainlength = chain_length;
1144
1145                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1146                 if (chain_length>=CHAIN_LIMIT) {
1147                         beyond_limit+=chain_length;
1148                         chain_length=CHAIN_LIMIT-1;
1149                 }
1150
1151                 /* update number of hashchains of current length */
1152                 chain_count[chain_length]++;
1153         }
1154
1155         /* display results */  
1156         for (i=1;i<CHAIN_LIMIT-1;i++) 
1157                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1158           
1159         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1160
1161
1162         printf("max. chainlength:%5d\n",max_chainlength);
1163
1164         /* avg. chainlength = sum of chainlengths / number of chains */
1165         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1166 }
1167
1168
1169 /*
1170  * These are local overrides for various environment variables in Emacs.
1171  * Please do not remove this and leave it at the end of the file, where
1172  * Emacs will automagically detect them.
1173  * ---------------------------------------------------------------------
1174  * Local variables:
1175  * mode: c
1176  * indent-tabs-mode: t
1177  * c-basic-offset: 4
1178  * tab-width: 4
1179  * End:
1180  */