* Added utf_run
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf.c - utf functions
2
3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
6    Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23    02111-1307, USA.
24
25    Contact: cacao@complang.tuwien.ac.at
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 2741 2005-06-20 09:57:14Z twisti $
34
35 */
36
37
38 #include <string.h>
39
40 #include "mm/memory.h"
41 #include "vm/exceptions.h"
42 #include "vm/options.h"
43 #include "vm/statistics.h"
44 #include "vm/stringlocal.h"
45 #include "vm/tables.h"
46 #include "vm/utf8.h"
47
48
49 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
50
51
52 /* utf-symbols for pointer comparison of frequently used strings **************/
53
54 utf *utf_java_lang_Object;              /* java/lang/Object                   */
55
56 utf *utf_java_lang_Class;
57 utf *utf_java_lang_ClassLoader;
58 utf *utf_java_lang_Cloneable;
59 utf *utf_java_lang_SecurityManager;
60 utf *utf_java_lang_String;
61 utf *utf_java_lang_System;
62 utf *utf_java_lang_ThreadGroup;
63 utf *utf_java_io_Serializable;
64
65 utf *utf_java_lang_Throwable;
66 utf *utf_java_lang_VMThrowable;
67 utf *utf_java_lang_Error;
68 utf *utf_java_lang_Exception;
69 utf *utf_java_lang_NoClassDefFoundError;
70 utf *utf_java_lang_OutOfMemoryError;
71 utf *utf_java_lang_ClassNotFoundException;
72
73 utf* utf_java_lang_Void;
74 utf* utf_java_lang_Boolean;
75 utf* utf_java_lang_Byte;
76 utf* utf_java_lang_Character;
77 utf* utf_java_lang_Short;
78 utf* utf_java_lang_Integer;
79 utf* utf_java_lang_Long;
80 utf* utf_java_lang_Float;
81 utf* utf_java_lang_Double;
82
83 utf *utf_java_lang_StackTraceElement;
84 utf *utf_java_lang_reflect_Constructor;
85 utf *utf_java_lang_reflect_Field;
86 utf *utf_java_lang_reflect_Method;
87 utf *utf_java_util_Vector;
88
89 utf *utf_InnerClasses;                  /* InnerClasses                       */
90 utf *utf_ConstantValue;                 /* ConstantValue                      */
91 utf *utf_Code;                          /* Code                               */
92 utf *utf_Exceptions;                    /* Exceptions                         */
93 utf *utf_LineNumberTable;               /* LineNumberTable                    */
94 utf *utf_SourceFile;                    /* SourceFile                         */
95
96 utf *utf_init;                          /* <init>                             */
97 utf *utf_clinit;                        /* <clinit>                           */
98 utf *utf_clone;                         /* clone                              */
99 utf *utf_finalize;                      /* finalize                           */
100 utf *utf_run;                           /* run                                */
101
102 utf *utf_fillInStackTrace;
103 utf *utf_getSystemClassLoader;
104 utf *utf_loadClass;
105 utf *utf_printStackTrace;
106
107 utf *utf_void__void;                    /* ()V                                */
108 utf *utf_boolean__void;                 /* (Z)V                               */
109 utf *utf_byte__void;                    /* (B)V                               */
110 utf *utf_char__void;                    /* (C)V                               */
111 utf *utf_short__void;                   /* (S)V                               */
112 utf *utf_int__void;                     /* (I)V                               */
113 utf *utf_long__void;                    /* (J)V                               */
114 utf *utf_float__void;                   /* (F)V                               */
115 utf *utf_double__void;                  /* (D)V                               */
116
117 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
118 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
119 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
120 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
121 utf *utf_java_lang_String__java_lang_Class;
122 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
123
124 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
125
126 utf *array_packagename;
127
128
129 /* utf_init ********************************************************************
130
131    Initializes the utf8 subsystem.
132
133 *******************************************************************************/
134
135 void utf8_init(void)
136 {
137         /* create utf-symbols for pointer comparison of frequently used strings */
138
139         utf_java_lang_Object           = utf_new_char("java/lang/Object");
140
141         utf_java_lang_Class            = utf_new_char("java/lang/Class");
142         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
143         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
144         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
145         utf_java_lang_String           = utf_new_char("java/lang/String");
146         utf_java_lang_System           = utf_new_char("java/lang/System");
147         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
148         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
149
150         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
151         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
152         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
153         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
154
155         utf_java_lang_NoClassDefFoundError =
156                 utf_new_char(string_java_lang_NoClassDefFoundError);
157
158         utf_java_lang_OutOfMemoryError =
159                 utf_new_char(string_java_lang_OutOfMemoryError);
160
161         utf_java_lang_ClassNotFoundException =
162                 utf_new_char(string_java_lang_ClassNotFoundException);
163
164         utf_java_lang_Void             = utf_new_char("java/lang/Void");
165         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
166         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
167         utf_java_lang_Character        = utf_new_char("java/lang/Character");
168         utf_java_lang_Short            = utf_new_char("java/lang/Short");
169         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
170         utf_java_lang_Long             = utf_new_char("java/lang/Long");
171         utf_java_lang_Float            = utf_new_char("java/lang/Float");
172         utf_java_lang_Double           = utf_new_char("java/lang/Double");
173
174         utf_java_lang_StackTraceElement =
175                 utf_new_char("java/lang/StackTraceElement");
176
177         utf_java_lang_reflect_Constructor =
178                 utf_new_char("java/lang/reflect/Constructor");
179
180         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
181         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
182         utf_java_util_Vector           = utf_new_char("java/util/Vector");
183
184         utf_InnerClasses               = utf_new_char("InnerClasses");
185         utf_ConstantValue              = utf_new_char("ConstantValue");
186         utf_Code                       = utf_new_char("Code");
187         utf_Exceptions                 = utf_new_char("Exceptions");
188         utf_LineNumberTable            = utf_new_char("LineNumberTable");
189         utf_SourceFile                 = utf_new_char("SourceFile");
190
191         utf_init                           = utf_new_char("<init>");
192         utf_clinit                         = utf_new_char("<clinit>");
193         utf_clone                      = utf_new_char("clone");
194         utf_finalize                   = utf_new_char("finalize");
195         utf_run                        = utf_new_char("run");
196
197         utf_printStackTrace            = utf_new_char("printStackTrace");
198         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
199         utf_loadClass                  = utf_new_char("loadClass");
200         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
201
202         utf_void__void                 = utf_new_char("()V");
203         utf_boolean__void              = utf_new_char("(Z)V");
204         utf_byte__void                 = utf_new_char("(B)V");
205         utf_char__void                 = utf_new_char("(C)V");
206         utf_short__void                = utf_new_char("(S)V");
207         utf_int__void                  = utf_new_char("(I)V");
208         utf_long__void                 = utf_new_char("(J)V");
209         utf_float__void                = utf_new_char("(F)V");
210         utf_double__void               = utf_new_char("(D)V");
211         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
212         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
213
214         utf_void__java_lang_ClassLoader =
215                 utf_new_char("()Ljava/lang/ClassLoader;");
216
217         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
218
219         utf_java_lang_String__java_lang_Class =
220                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
221
222         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
223
224         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
225
226         array_packagename              = utf_new_char("\t<the array package>");
227 }
228
229
230 /* utf_hashkey *****************************************************************
231
232    The hashkey is computed from the utf-text by using up to 8
233    characters.  For utf-symbols longer than 15 characters 3 characters
234    are taken from the beginning and the end, 2 characters are taken
235    from the middle.
236
237 *******************************************************************************/
238
239 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
240 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
241
242 u4 utf_hashkey(const char *text, u4 length)
243 {
244         const char *start_pos = text;       /* pointer to utf text                */
245         u4 a;
246
247         switch (length) {
248         case 0: /* empty string */
249                 return 0;
250
251         case 1: return fbs(0);
252         case 2: return fbs(0) ^ nbs(3);
253         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
254         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
255         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
256         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
257         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
258         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
259
260         case 9:
261                 a = fbs(0);
262                 a ^= nbs(1);
263                 a ^= nbs(2);
264                 text++;
265                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
266
267         case 10:
268                 a = fbs(0);
269                 text++;
270                 a ^= nbs(2);
271                 a ^= nbs(3);
272                 a ^= nbs(4);
273                 text++;
274                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
275
276         case 11:
277                 a = fbs(0);
278                 text++;
279                 a ^= nbs(2);
280                 a ^= nbs(3);
281                 a ^= nbs(4);
282                 text++;
283                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
284
285         case 12:
286                 a = fbs(0);
287                 text += 2;
288                 a ^= nbs(2);
289                 a ^= nbs(3);
290                 text++;
291                 a ^= nbs(5);
292                 a ^= nbs(6);
293                 a ^= nbs(7);
294                 text++;
295                 return a ^ nbs(9) ^ nbs(10);
296
297         case 13:
298                 a = fbs(0);
299                 a ^= nbs(1);
300                 text++;
301                 a ^= nbs(3);
302                 a ^= nbs(4);
303                 text += 2;      
304                 a ^= nbs(7);
305                 a ^= nbs(8);
306                 text += 2;
307                 return a ^ nbs(9) ^ nbs(10);
308
309         case 14:
310                 a = fbs(0);
311                 text += 2;      
312                 a ^= nbs(3);
313                 a ^= nbs(4);
314                 text += 2;      
315                 a ^= nbs(7);
316                 a ^= nbs(8);
317                 text += 2;
318                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
319
320         case 15:
321                 a = fbs(0);
322                 text += 2;      
323                 a ^= nbs(3);
324                 a ^= nbs(4);
325                 text += 2;      
326                 a ^= nbs(7);
327                 a ^= nbs(8);
328                 text += 2;
329                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
330
331         default:  /* 3 characters from beginning */
332                 a = fbs(0);
333                 text += 2;
334                 a ^= nbs(3);
335                 a ^= nbs(4);
336
337                 /* 2 characters from middle */
338                 text = start_pos + (length / 2);
339                 a ^= fbs(5);
340                 text += 2;
341                 a ^= nbs(6);    
342
343                 /* 3 characters from end */
344                 text = start_pos + length - 4;
345
346                 a ^= fbs(7);
347                 text++;
348
349                 return a ^ nbs(10) ^ nbs(11);
350     }
351 }
352
353
354 /* utf_hashkey *****************************************************************
355
356    Compute the hashkey of a unicode string.
357
358 *******************************************************************************/
359
360 u4 unicode_hashkey(u2 *text, u2 len)
361 {
362         return utf_hashkey((char *) text, len);
363 }
364
365
366 /* utf_new *********************************************************************
367
368    Creates a new utf-symbol, the text of the symbol is passed as a
369    u1-array. The function searches the utf-hashtable for a utf-symbol
370    with this text. On success the element returned, otherwise a new
371    hashtable element is created.
372
373    If the number of entries in the hashtable exceeds twice the size of
374    the hashtable slots a reorganization of the hashtable is done and
375    the utf symbols are copied to a new hashtable with doubled size.
376
377 *******************************************************************************/
378
379 utf *utf_new_intern(const char *text, u2 length);
380
381 utf *utf_new(const char *text, u2 length)
382 {
383     utf *r;
384
385 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
386     tables_lock();
387 #endif
388
389     r = utf_new_intern(text, length);
390
391 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
392     tables_unlock();
393 #endif
394
395     return r;
396 }
397
398
399 utf *utf_new_intern(const char *text, u2 length)
400 {
401         u4 key;                             /* hashkey computed from utf-text     */
402         u4 slot;                            /* slot in hashtable                  */
403         utf *u;                             /* hashtable element                  */
404         u2 i;
405
406 #ifdef STATISTICS
407         if (opt_stat)
408                 count_utf_new++;
409 #endif
410
411         key  = utf_hashkey(text, length);
412         slot = key & (utf_hash.size - 1);
413         u    = utf_hash.ptr[slot];
414
415         /* search external hash chain for utf-symbol */
416         while (u) {
417                 if (u->blength == length) {
418
419                         /* compare text of hashtable elements */
420                         for (i = 0; i < length; i++)
421                                 if (text[i] != u->text[i]) goto nomatch;
422                         
423 #ifdef STATISTICS
424                         if (opt_stat)
425                                 count_utf_new_found++;
426 #endif
427
428                         /* symbol found in hashtable */
429                         return u;
430                 }
431         nomatch:
432                 u = u->hashlink; /* next element in external chain */
433         }
434
435 #ifdef STATISTICS
436         if (opt_stat)
437                 count_utf_len += sizeof(utf) + length;
438 #endif
439
440         /* location in hashtable found, create new utf element */
441         u = NEW(utf);
442         u->blength  = length;               /* length in bytes of utfstring       */
443         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
444         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
445         memcpy(u->text, text, length);      /* copy utf-text                      */
446         u->text[length] = '\0';
447         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
448
449         utf_hash.entries++;                 /* update number of entries           */
450
451         if (utf_hash.entries > (utf_hash.size * 2)) {
452
453         /* reorganization of hashtable, average length of 
454            the external chains is approx. 2                */  
455
456                 u4 i;
457                 utf *u;
458                 hashtable newhash; /* the new hashtable */
459
460                 /* create new hashtable, double the size */
461                 init_hashtable(&newhash, utf_hash.size * 2);
462                 newhash.entries = utf_hash.entries;
463
464 #ifdef STATISTICS
465                 if (opt_stat)
466                         count_utf_len += sizeof(utf*) * utf_hash.size;
467 #endif
468
469                 /* transfer elements to new hashtable */
470                 for (i = 0; i < utf_hash.size; i++) {
471                         u = (utf *) utf_hash.ptr[i];
472                         while (u) {
473                                 utf *nextu = u->hashlink;
474                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
475                                                 
476                                 u->hashlink = (utf *) newhash.ptr[slot];
477                                 newhash.ptr[slot] = u;
478
479                                 /* follow link in external hash chain */
480                                 u = nextu;
481                         }
482                 }
483         
484                 /* dispose old table */
485                 MFREE(utf_hash.ptr, void*, utf_hash.size);
486                 utf_hash = newhash;
487         }
488
489         return u;
490 }
491
492
493 /* utf_new_u2 ******************************************************************
494
495    Make utf symbol from u2 array, if isclassname is true '.' is
496    replaced by '/'.
497
498 *******************************************************************************/
499
500 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
501 {
502         char *buffer;                   /* memory buffer for  unicode characters  */
503         char *pos;                      /* pointer to current position in buffer  */
504         u4 left;                        /* unicode characters left                */
505         u4 buflength;                   /* utf length in bytes of the u2 array    */
506         utf *result;                    /* resulting utf-string                   */
507         int i;          
508
509         /* determine utf length in bytes and allocate memory */
510
511         buflength = u2_utflength(unicode_pos, unicode_length); 
512         buffer    = MNEW(char, buflength);
513  
514         left = buflength;
515         pos  = buffer;
516
517         for (i = 0; i++ < unicode_length; unicode_pos++) {
518                 /* next unicode character */
519                 u2 c = *unicode_pos;
520                 
521                 if ((c != 0) && (c < 0x80)) {
522                         /* 1 character */       
523                         left--;
524                 if ((int) left < 0) break;
525                         /* convert classname */
526                         if (isclassname && c == '.')
527                                 *pos++ = '/';
528                         else
529                                 *pos++ = (char) c;
530
531                 } else if (c < 0x800) {             
532                         /* 2 characters */                              
533                 unsigned char high = c >> 6;
534                 unsigned char low  = c & 0x3F;
535                         left = left - 2;
536                 if ((int) left < 0) break;
537                 *pos++ = high | 0xC0; 
538                 *pos++ = low  | 0x80;     
539
540                 } else {         
541                 /* 3 characters */                              
542                 char low  = c & 0x3f;
543                 char mid  = (c >> 6) & 0x3F;
544                 char high = c >> 12;
545                         left = left - 3;
546                 if ((int) left < 0) break;
547                 *pos++ = high | 0xE0; 
548                 *pos++ = mid  | 0x80;  
549                 *pos++ = low  | 0x80;   
550                 }
551         }
552         
553         /* insert utf-string into symbol-table */
554         result = utf_new(buffer,buflength);
555
556         MFREE(buffer, char, buflength);
557
558         return result;
559 }
560
561
562 /* utf_new_char ****************************************************************
563
564    Creates a new utf symbol, the text for this symbol is passed as a
565    c-string ( = char* ).
566
567 *******************************************************************************/
568
569 utf *utf_new_char(const char *text)
570 {
571         return utf_new(text, strlen(text));
572 }
573
574
575 /* utf_new_char_classname ******************************************************
576
577    Creates a new utf symbol, the text for this symbol is passed as a
578    c-string ( = char* ) "." characters are going to be replaced by
579    "/". Since the above function is used often, this is a separte
580    function, instead of an if.
581
582 *******************************************************************************/
583
584 utf *utf_new_char_classname(const char *text)
585 {
586         if (strchr(text, '.')) {
587                 char *txt = strdup(text);
588                 char *end = txt + strlen(txt);
589                 char *c;
590                 utf *tmpRes;
591
592                 for (c = txt; c < end; c++)
593                         if (*c == '.') *c = '/';
594
595                 tmpRes = utf_new(txt, strlen(txt));
596                 FREE(txt, 0);
597
598                 return tmpRes;
599
600         } else
601                 return utf_new(text, strlen(text));
602 }
603
604
605 /* utf_nextu2 ******************************************************************
606
607    Read the next unicode character from the utf string and increment
608    the utf-string pointer accordingly.
609
610 *******************************************************************************/
611
612 u2 utf_nextu2(char **utf_ptr)
613 {
614     /* uncompressed unicode character */
615     u2 unicode_char = 0;
616     /* current position in utf text */  
617     unsigned char *utf = (unsigned char *) (*utf_ptr);
618     /* bytes representing the unicode character */
619     unsigned char ch1, ch2, ch3;
620     /* number of bytes used to represent the unicode character */
621     int len = 0;
622         
623     switch ((ch1 = utf[0]) >> 4) {
624         default: /* 1 byte */
625                 (*utf_ptr)++;
626                 return (u2) ch1;
627         case 0xC: 
628         case 0xD: /* 2 bytes */
629                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
630                         unsigned char high = ch1 & 0x1F;
631                         unsigned char low  = ch2 & 0x3F;
632                         unicode_char = (high << 6) + low;
633                         len = 2;
634                 }
635                 break;
636
637         case 0xE: /* 2 or 3 bytes */
638                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
639                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
640                                 unsigned char low  = ch3 & 0x3f;
641                                 unsigned char mid  = ch2 & 0x3f;
642                                 unsigned char high = ch1 & 0x0f;
643                                 unicode_char = (((high << 6) + mid) << 6) + low;
644                                 len = 3;
645                         } else
646                                 len = 2;                                           
647                 }
648                 break;
649     }
650
651     /* update position in utf-text */
652     *utf_ptr = (char *) (utf + len);
653
654     return unicode_char;
655 }
656
657
658 /* utf_strlen ******************************************************************
659
660    Determine number of unicode characters in the utf string.
661
662 *******************************************************************************/
663
664 u4 utf_strlen(utf *u)
665 {
666         char *endpos;                       /* points behind utf string           */
667         char *utf_ptr;                      /* current position in utf text       */
668         u4 len = 0;                         /* number of unicode characters       */
669
670         if (!u) {
671                 *exceptionptr = new_nullpointerexception();
672                 return 0;
673         }
674
675         endpos = UTF_END(u);
676         utf_ptr = u->text;
677
678         while (utf_ptr < endpos) {
679                 len++;
680                 /* next unicode character */
681                 utf_nextu2(&utf_ptr);
682         }
683
684         if (utf_ptr != endpos)
685                 /* string ended abruptly */
686                 throw_cacao_exception_exit(string_java_lang_InternalError,
687                                                                    "Illegal utf8 string");
688
689         return len;
690 }
691
692
693 /* u2_utflength ****************************************************************
694
695    Returns the utf length in bytes of a u2 array.
696
697 *******************************************************************************/
698
699 u4 u2_utflength(u2 *text, u4 u2_length)
700 {
701         u4 result_len = 0;                  /* utf length in bytes                */
702         u2 ch;                              /* current unicode character          */
703         u4 len;
704         
705         for (len = 0; len < u2_length; len++) {
706                 /* next unicode character */
707                 ch = *text++;
708           
709                 /* determine bytes required to store unicode character as utf */
710                 if (ch && (ch < 0x80)) 
711                         result_len++;
712                 else if (ch < 0x800)
713                         result_len += 2;        
714                 else 
715                         result_len += 3;        
716         }
717
718     return result_len;
719 }
720
721
722 /* utf_display *****************************************************************
723
724    Write utf symbol to stdout (for debugging purposes).
725
726 *******************************************************************************/
727
728 void utf_display(utf *u)
729 {
730         char *endpos;                       /* points behind utf string           */
731         char *utf_ptr;                      /* current position in utf text       */
732
733         if (!u) {
734                 printf("NULL");
735                 fflush(stdout);
736                 return;
737         }
738
739         endpos = UTF_END(u);
740         utf_ptr = u->text;
741
742         while (utf_ptr < endpos) {
743                 /* read next unicode character */                
744                 u2 c = utf_nextu2(&utf_ptr);
745                 if (c >= 32 && c <= 127) printf("%c", c);
746                 else printf("?");
747         }
748
749         fflush(stdout);
750 }
751
752
753 /* utf_display_classname *******************************************************
754
755    Write utf symbol to stdout with `/' converted to `.' (for debugging
756    purposes).
757
758 *******************************************************************************/
759
760 void utf_display_classname(utf *u)
761 {
762         char *endpos;                       /* points behind utf string           */
763         char *utf_ptr;                      /* current position in utf text       */
764
765         if (!u) {
766                 printf("NULL");
767                 fflush(stdout);
768                 return;
769         }
770
771         endpos = UTF_END(u);
772         utf_ptr = u->text;
773
774         while (utf_ptr < endpos) {
775                 /* read next unicode character */                
776                 u2 c = utf_nextu2(&utf_ptr);
777                 if (c == '/') c = '.';
778                 if (c >= 32 && c <= 127) printf("%c", c);
779                 else printf("?");
780         }
781
782         fflush(stdout);
783 }
784
785
786 /* utf_sprint ******************************************************************
787         
788    Write utf symbol into c-string (for debugging purposes).
789
790 *******************************************************************************/
791
792 void utf_sprint(char *buffer, utf *u)
793 {
794         char *endpos;                       /* points behind utf string           */
795         char *utf_ptr;                      /* current position in utf text       */
796         u2 pos = 0;                         /* position in c-string               */
797
798         if (!u) {
799                 strcpy(buffer, "NULL");
800                 return;
801         }
802
803         endpos = UTF_END(u);
804         utf_ptr = u->text;
805
806         while (utf_ptr < endpos) 
807                 /* copy next unicode character */       
808                 buffer[pos++] = utf_nextu2(&utf_ptr);
809
810         /* terminate string */
811         buffer[pos] = '\0';
812 }
813
814
815 /* utf_sprint_classname ********************************************************
816         
817    Write utf symbol into c-string with `/' converted to `.' (for debugging
818    purposes).
819
820 *******************************************************************************/
821
822 void utf_sprint_classname(char *buffer, utf *u)
823 {
824         char *endpos;                       /* points behind utf string           */
825         char *utf_ptr;                      /* current position in utf text       */
826         u2 pos = 0;                         /* position in c-string               */
827
828         if (!u) {
829                 strcpy(buffer, "NULL");
830                 return;
831         }
832
833         endpos = UTF_END(u);
834         utf_ptr = u->text;
835
836         while (utf_ptr < endpos) {
837                 /* copy next unicode character */       
838                 u2 c = utf_nextu2(&utf_ptr);
839                 if (c == '/') c = '.';
840                 buffer[pos++] = c;
841         }
842
843         /* terminate string */
844         buffer[pos] = '\0';
845 }
846
847
848 /* utf_strcat ******************************************************************
849         
850    Like libc strcat, but uses an utf8 string.
851
852 *******************************************************************************/
853
854 void utf_strcat(char *buffer, utf *u)
855 {
856         utf_sprint(buffer + strlen(buffer), u);
857 }
858
859
860 /* utf_strcat_classname ********************************************************
861         
862    Like libc strcat, but uses an utf8 string.
863
864 *******************************************************************************/
865
866 void utf_strcat_classname(char *buffer, utf *u)
867 {
868         utf_sprint_classname(buffer + strlen(buffer), u);
869 }
870
871
872 /* utf_fprint ******************************************************************
873         
874    Write utf symbol into file.
875
876 *******************************************************************************/
877
878 void utf_fprint(FILE *file, utf *u)
879 {
880         char *endpos;                       /* points behind utf string           */
881         char *utf_ptr;                      /* current position in utf text       */
882
883         if (!u)
884                 return;
885
886         endpos = UTF_END(u);
887         utf_ptr = u->text;
888
889         while (utf_ptr < endpos) { 
890                 /* read next unicode character */                
891                 u2 c = utf_nextu2(&utf_ptr);                            
892
893                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
894                 else fprintf(file, "?");
895         }
896 }
897
898
899 /* utf_fprint_classname ********************************************************
900         
901    Write utf symbol into file with `/' converted to `.'.
902
903 *******************************************************************************/
904
905 void utf_fprint_classname(FILE *file, utf *u)
906 {
907         char *endpos;                       /* points behind utf string           */
908         char *utf_ptr;                      /* current position in utf text       */
909
910     if (!u)
911                 return;
912
913         endpos = UTF_END(u);
914         utf_ptr = u->text;
915
916         while (utf_ptr < endpos) { 
917                 /* read next unicode character */                
918                 u2 c = utf_nextu2(&utf_ptr);                            
919                 if (c == '/') c = '.';
920
921                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
922                 else fprintf(file, "?");
923         }
924 }
925
926
927 /* is_valid_utf ****************************************************************
928
929    Return true if the given string is a valid UTF-8 string.
930
931    utf_ptr...points to first character
932    end_pos...points after last character
933
934 *******************************************************************************/
935
936 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
937
938 bool is_valid_utf(char *utf_ptr, char *end_pos)
939 {
940         int bytes;
941         int len,i;
942         char c;
943         unsigned long v;
944
945         if (end_pos < utf_ptr) return false;
946         bytes = end_pos - utf_ptr;
947         while (bytes--) {
948                 c = *utf_ptr++;
949
950                 if (!c) return false;                     /* 0x00 is not allowed */
951                 if ((c & 0x80) == 0) continue;            /* ASCII */
952
953                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
954                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
955                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
956                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
957                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
958                 else return false;                        /* invalid leading byte */
959
960                 if (len > 2) return false;                /* Java limitation */
961
962                 v = (unsigned long)c & (0x3f >> len);
963                 
964                 if ((bytes -= len) < 0) return false;     /* missing bytes */
965
966                 for (i = len; i--; ) {
967                         c = *utf_ptr++;
968                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
969                                 return false;
970                         v = (v << 6) | (c & 0x3f);
971                 }
972
973                 if (v == 0) {
974                         if (len != 1) return false;           /* Java special */
975
976                 } else {
977                         /* Sun Java seems to allow overlong UTF-8 encodings */
978                         
979                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
980                                 if (!opt_liberalutf)
981                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
982                                 /* XXX change this to exception? */
983                         }
984                 }
985
986                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
987                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
988
989                 /* even these seem to be allowed */
990                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
991         }
992
993         return true;
994 }
995
996
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. Currently this disallows:
     - empty strings
     - bytes below 0x20 (control characters)
     - the byte pair 0xc0 0x80 (an encoded zero character)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

   No byte at or behind end_pos is read, even if the last byte of the
   string is 0xc0.

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters */

		/* disallow zero; look at the following byte only if it still
		   belongs to the string */

		if (c == 0xc0 && utf_ptr < end_pos && (unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
1024
1025 bool is_valid_name_utf(utf *u)
1026 {
1027         return is_valid_name(u->text, UTF_END(u));
1028 }
1029
1030
1031 /* utf_show ********************************************************************
1032
1033    Writes the utf symbols in the utfhash to stdout and displays the
1034    number of external hash chains grouped according to the chainlength
1035    (for debugging purposes).
1036
1037 *******************************************************************************/
1038
1039 void utf_show(void)
1040 {
1041
1042 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1043
1044         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1045         u4 max_chainlength = 0;      /* maximum length of the chains */
1046         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1047         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1048         u4 i;
1049
1050         printf ("UTF-HASH:\n");
1051
1052         /* show element of utf-hashtable */
1053         for (i=0; i<utf_hash.size; i++) {
1054                 utf *u = utf_hash.ptr[i];
1055                 if (u) {
1056                         printf ("SLOT %d: ", (int) i);
1057                         while (u) {
1058                                 printf ("'");
1059                                 utf_display (u);
1060                                 printf ("' ");
1061                                 u = u->hashlink;
1062                         }       
1063                         printf ("\n");
1064                 }
1065                 
1066         }
1067
1068         printf ("UTF-HASH: %d slots for %d entries\n", 
1069                         (int) utf_hash.size, (int) utf_hash.entries );
1070
1071
1072         if (utf_hash.entries == 0)
1073                 return;
1074
1075         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1076
1077         for (i=0;i<CHAIN_LIMIT;i++)
1078                 chain_count[i]=0;
1079
1080         /* count numbers of hashchains according to their length */
1081         for (i=0; i<utf_hash.size; i++) {
1082                   
1083                 utf *u = (utf*) utf_hash.ptr[i];
1084                 u4 chain_length = 0;
1085
1086                 /* determine chainlength */
1087                 while (u) {
1088                         u = u->hashlink;
1089                         chain_length++;
1090                 }
1091
1092                 /* update sum of all chainlengths */
1093                 sum_chainlength+=chain_length;
1094
1095                 /* determine the maximum length of the chains */
1096                 if (chain_length>max_chainlength)
1097                         max_chainlength = chain_length;
1098
1099                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1100                 if (chain_length>=CHAIN_LIMIT) {
1101                         beyond_limit+=chain_length;
1102                         chain_length=CHAIN_LIMIT-1;
1103                 }
1104
1105                 /* update number of hashchains of current length */
1106                 chain_count[chain_length]++;
1107         }
1108
1109         /* display results */  
1110         for (i=1;i<CHAIN_LIMIT-1;i++) 
1111                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1112           
1113         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1114
1115
1116         printf("max. chainlength:%5d\n",max_chainlength);
1117
1118         /* avg. chainlength = sum of chainlengths / number of chains */
1119         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1120 }
1121
1122
1123 /*
1124  * These are local overrides for various environment variables in Emacs.
1125  * Please do not remove this and leave it at the end of the file, where
1126  * Emacs will automagically detect them.
1127  * ---------------------------------------------------------------------
1128  * Local variables:
1129  * mode: c
1130  * indent-tabs-mode: t
1131  * c-basic-offset: 4
1132  * tab-width: 4
1133  * End:
1134  */