* utf_add: Added.
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf8.c - utf functions
2
3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
6    Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23    02111-1307, USA.
24
25    Contact: cacao@complang.tuwien.ac.at
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 3262 2005-09-21 20:02:49Z twisti $
34
35 */
36
37
38 #include <string.h>
39
40 #include "mm/memory.h"
41 #include "vm/exceptions.h"
42 #include "vm/options.h"
43 #include "vm/statistics.h"
44 #include "vm/stringlocal.h"
45 #include "vm/tables.h"
46 #include "vm/utf8.h"
47
48
49 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
50
51
52 /* utf-symbols for pointer comparison of frequently used strings **************/
53
54 utf *utf_java_lang_Object;              /* java/lang/Object                   */
55
56 utf *utf_java_lang_Class;
57 utf *utf_java_lang_ClassLoader;
58 utf *utf_java_lang_Cloneable;
59 utf *utf_java_lang_SecurityManager;
60 utf *utf_java_lang_String;
61 utf *utf_java_lang_System;
62 utf *utf_java_lang_ThreadGroup;
63 utf *utf_java_io_Serializable;
64
65 utf *utf_java_lang_Throwable;
66 utf *utf_java_lang_VMThrowable;
67 utf *utf_java_lang_Error;
68 utf *utf_java_lang_Exception;
69 utf *utf_java_lang_NoClassDefFoundError;
70 utf *utf_java_lang_OutOfMemoryError;
71 utf *utf_java_lang_ClassNotFoundException;
72
73 utf* utf_java_lang_Void;
74 utf* utf_java_lang_Boolean;
75 utf* utf_java_lang_Byte;
76 utf* utf_java_lang_Character;
77 utf* utf_java_lang_Short;
78 utf* utf_java_lang_Integer;
79 utf* utf_java_lang_Long;
80 utf* utf_java_lang_Float;
81 utf* utf_java_lang_Double;
82
83 utf *utf_java_lang_StackTraceElement;
84 utf *utf_java_lang_reflect_Constructor;
85 utf *utf_java_lang_reflect_Field;
86 utf *utf_java_lang_reflect_Method;
87 utf *utf_java_util_Vector;
88
89 utf *utf_InnerClasses;                  /* InnerClasses                       */
90 utf *utf_ConstantValue;                 /* ConstantValue                      */
91 utf *utf_Code;                          /* Code                               */
92 utf *utf_Exceptions;                    /* Exceptions                         */
93 utf *utf_LineNumberTable;               /* LineNumberTable                    */
94 utf *utf_SourceFile;                    /* SourceFile                         */
95
96 utf *utf_init;                          /* <init>                             */
97 utf *utf_clinit;                        /* <clinit>                           */
98 utf *utf_clone;                         /* clone                              */
99 utf *utf_finalize;                      /* finalize                           */
100 utf *utf_run;                           /* run                                */
101
102 utf *utf_add;                           /* add                                */
103
104 utf *utf_fillInStackTrace;
105 utf *utf_getSystemClassLoader;
106 utf *utf_loadClass;
107 utf *utf_printStackTrace;
108
109 utf *utf_void__void;                    /* ()V                                */
110 utf *utf_boolean__void;                 /* (Z)V                               */
111 utf *utf_byte__void;                    /* (B)V                               */
112 utf *utf_char__void;                    /* (C)V                               */
113 utf *utf_short__void;                   /* (S)V                               */
114 utf *utf_int__void;                     /* (I)V                               */
115 utf *utf_long__void;                    /* (J)V                               */
116 utf *utf_float__void;                   /* (F)V                               */
117 utf *utf_double__void;                  /* (D)V                               */
118
119 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
120 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
121 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
122 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
123 utf *utf_java_lang_String__java_lang_Class;
124 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
125
126 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
127
128 utf *array_packagename;
129
130
131 /* utf_init ********************************************************************
132
133    Initializes the utf8 subsystem.
134
135 *******************************************************************************/
136
137 void utf8_init(void)
138 {
139         /* create utf-symbols for pointer comparison of frequently used strings */
140
141         utf_java_lang_Object           = utf_new_char("java/lang/Object");
142
143         utf_java_lang_Class            = utf_new_char("java/lang/Class");
144         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
145         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
146         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
147         utf_java_lang_String           = utf_new_char("java/lang/String");
148         utf_java_lang_System           = utf_new_char("java/lang/System");
149         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
150         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
151
152         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
153         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
154         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
155         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
156
157         utf_java_lang_NoClassDefFoundError =
158                 utf_new_char(string_java_lang_NoClassDefFoundError);
159
160         utf_java_lang_OutOfMemoryError =
161                 utf_new_char(string_java_lang_OutOfMemoryError);
162
163         utf_java_lang_ClassNotFoundException =
164                 utf_new_char(string_java_lang_ClassNotFoundException);
165
166         utf_java_lang_Void             = utf_new_char("java/lang/Void");
167         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
168         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
169         utf_java_lang_Character        = utf_new_char("java/lang/Character");
170         utf_java_lang_Short            = utf_new_char("java/lang/Short");
171         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
172         utf_java_lang_Long             = utf_new_char("java/lang/Long");
173         utf_java_lang_Float            = utf_new_char("java/lang/Float");
174         utf_java_lang_Double           = utf_new_char("java/lang/Double");
175
176         utf_java_lang_StackTraceElement =
177                 utf_new_char("java/lang/StackTraceElement");
178
179         utf_java_lang_reflect_Constructor =
180                 utf_new_char("java/lang/reflect/Constructor");
181
182         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
183         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
184         utf_java_util_Vector           = utf_new_char("java/util/Vector");
185
186         utf_InnerClasses               = utf_new_char("InnerClasses");
187         utf_ConstantValue              = utf_new_char("ConstantValue");
188         utf_Code                       = utf_new_char("Code");
189         utf_Exceptions                 = utf_new_char("Exceptions");
190         utf_LineNumberTable            = utf_new_char("LineNumberTable");
191         utf_SourceFile                 = utf_new_char("SourceFile");
192
193         utf_init                           = utf_new_char("<init>");
194         utf_clinit                         = utf_new_char("<clinit>");
195         utf_clone                      = utf_new_char("clone");
196         utf_finalize                   = utf_new_char("finalize");
197         utf_run                        = utf_new_char("run");
198
199         utf_add                        = utf_new_char("add");
200
201         utf_printStackTrace            = utf_new_char("printStackTrace");
202         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
203         utf_loadClass                  = utf_new_char("loadClass");
204         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
205
206         utf_void__void                 = utf_new_char("()V");
207         utf_boolean__void              = utf_new_char("(Z)V");
208         utf_byte__void                 = utf_new_char("(B)V");
209         utf_char__void                 = utf_new_char("(C)V");
210         utf_short__void                = utf_new_char("(S)V");
211         utf_int__void                  = utf_new_char("(I)V");
212         utf_long__void                 = utf_new_char("(J)V");
213         utf_float__void                = utf_new_char("(F)V");
214         utf_double__void               = utf_new_char("(D)V");
215         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
216         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
217
218         utf_void__java_lang_ClassLoader =
219                 utf_new_char("()Ljava/lang/ClassLoader;");
220
221         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
222
223         utf_java_lang_String__java_lang_Class =
224                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
225
226         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
227
228         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
229
230         array_packagename              = utf_new_char("\t<the array package>");
231 }
232
233
/* utf_hashkey *****************************************************************

   The hashkey is computed from the utf-text by using up to 8
   characters.  Texts of up to 8 bytes use every byte, texts of 9 to 15
   bytes use a fixed selection of 8 bytes, and for utf-symbols longer
   than 15 characters 3 characters are taken from the beginning and
   the end, 2 characters are taken from the middle.

   Every selected byte is converted to u4, shifted left by an amount
   that depends on its position, and XORed into the key.  The
   conversion starts from a plain char, so on platforms where char is
   signed a byte >= 0x80 is sign-extended before the shift.

   The bytes are addressed by explicit index.  An earlier formulation
   chained several `++text' side effects inside one expression (for
   example `fbs(0) ^ nbs(3)'); the operands of `^' are unsequenced, so
   that was undefined behaviour.  The indices below are the ones a
   left-to-right evaluation of that formulation selects.

   text.....the utf text; only bytes with index < length are read
   length...number of bytes in text

   Returns the hashkey, which is 0 for an empty text.

*******************************************************************************/

/* Retained in case other code in this file still uses them.  Never apply
   more than one of them to `text' inside a single expression.            */

#define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
#define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */

/* byte at index idx of `text', converted to u4 and shifted left by shift */

#define UTF_HB(idx, shift) ((u4) text[(idx)] << (shift))

u4 utf_hashkey(const char *text, u4 length)
{
    u4 mid;                             /* index of the first middle byte     */
    u4 key;

    switch (length) {
    case 0: /* empty string */
        return 0;

    case 1:
        return UTF_HB(0, 0);

    case 2:
        return UTF_HB(0, 0) ^ UTF_HB(1, 3);

    case 3:
        return UTF_HB(0, 0) ^ UTF_HB(1, 3) ^ UTF_HB(2, 5);

    case 4:
        return UTF_HB(0, 0) ^ UTF_HB(1, 2) ^ UTF_HB(2, 4) ^ UTF_HB(3, 6);

    case 5:
        return UTF_HB(0, 0) ^ UTF_HB(1, 2) ^ UTF_HB(2, 3) ^ UTF_HB(3, 4)
             ^ UTF_HB(4, 6);

    case 6:
        return UTF_HB(0, 0) ^ UTF_HB(1, 1) ^ UTF_HB(2, 2) ^ UTF_HB(3, 3)
             ^ UTF_HB(4, 5) ^ UTF_HB(5, 6);

    case 7:
        return UTF_HB(0, 0) ^ UTF_HB(1, 1) ^ UTF_HB(2, 2) ^ UTF_HB(3, 3)
             ^ UTF_HB(4, 4) ^ UTF_HB(5, 5) ^ UTF_HB(6, 6);

    case 8:
        return UTF_HB(0, 0) ^ UTF_HB(1, 1) ^ UTF_HB(2, 2) ^ UTF_HB(3, 3)
             ^ UTF_HB(4, 4) ^ UTF_HB(5, 5) ^ UTF_HB(6, 6) ^ UTF_HB(7, 7);

    case 9: /* byte 3 is skipped */
        return UTF_HB(0, 0) ^ UTF_HB(1, 1) ^ UTF_HB(2, 2) ^ UTF_HB(4, 4)
             ^ UTF_HB(5, 5) ^ UTF_HB(6, 6) ^ UTF_HB(7, 7) ^ UTF_HB(8, 8);

    case 10: /* bytes 1 and 5 are skipped */
        return UTF_HB(0, 0) ^ UTF_HB(2, 2) ^ UTF_HB(3, 3) ^ UTF_HB(4, 4)
             ^ UTF_HB(6, 6) ^ UTF_HB(7, 7) ^ UTF_HB(8, 8) ^ UTF_HB(9, 9);

    case 11: /* bytes 1 and 5 are skipped; this case uses 9 bytes */
        return UTF_HB(0, 0) ^ UTF_HB(2, 2) ^ UTF_HB(3, 3) ^ UTF_HB(4, 4)
             ^ UTF_HB(6, 6) ^ UTF_HB(7, 7) ^ UTF_HB(8, 8) ^ UTF_HB(9, 9)
             ^ UTF_HB(10, 10);

    case 12:
        return UTF_HB(0, 0) ^ UTF_HB(3, 2) ^ UTF_HB(4, 3) ^ UTF_HB(6, 5)
             ^ UTF_HB(7, 6) ^ UTF_HB(8, 7) ^ UTF_HB(10, 9) ^ UTF_HB(11, 10);

    case 13:
        return UTF_HB(0, 0) ^ UTF_HB(1, 1) ^ UTF_HB(3, 3) ^ UTF_HB(4, 4)
             ^ UTF_HB(7, 7) ^ UTF_HB(8, 8) ^ UTF_HB(11, 9) ^ UTF_HB(12, 10);

    case 14:
    case 15: /* both lengths use the same selection */
        return UTF_HB(0, 0) ^ UTF_HB(3, 3) ^ UTF_HB(4, 4) ^ UTF_HB(7, 7)
             ^ UTF_HB(8, 8) ^ UTF_HB(11, 9) ^ UTF_HB(12, 10) ^ UTF_HB(13, 11);

    default:
        /* 3 characters from beginning */
        key = UTF_HB(0, 0) ^ UTF_HB(3, 3) ^ UTF_HB(4, 4);

        /* 2 characters from middle */
        mid = length / 2;
        key ^= UTF_HB(mid, 5) ^ UTF_HB(mid + 3, 6);

        /* 3 characters from end */
        key ^= UTF_HB(length - 4, 7);

        return key ^ UTF_HB(length - 2, 10) ^ UTF_HB(length - 1, 11);
    }
}

#undef UTF_HB
356
357
358 /* utf_hashkey *****************************************************************
359
360    Compute the hashkey of a unicode string.
361
362 *******************************************************************************/
363
364 u4 unicode_hashkey(u2 *text, u2 len)
365 {
366         return utf_hashkey((char *) text, len);
367 }
368
369
370 /* utf_new *********************************************************************
371
372    Creates a new utf-symbol, the text of the symbol is passed as a
373    u1-array. The function searches the utf-hashtable for a utf-symbol
374    with this text. On success the element returned, otherwise a new
375    hashtable element is created.
376
377    If the number of entries in the hashtable exceeds twice the size of
378    the hashtable slots a reorganization of the hashtable is done and
379    the utf symbols are copied to a new hashtable with doubled size.
380
381 *******************************************************************************/
382
383 utf *utf_new_intern(const char *text, u2 length);
384
385 utf *utf_new(const char *text, u2 length)
386 {
387     utf *r;
388
389 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
390     tables_lock();
391 #endif
392
393     r = utf_new_intern(text, length);
394
395 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
396     tables_unlock();
397 #endif
398
399     return r;
400 }
401
402
403 utf *utf_new_intern(const char *text, u2 length)
404 {
405         u4 key;                             /* hashkey computed from utf-text     */
406         u4 slot;                            /* slot in hashtable                  */
407         utf *u;                             /* hashtable element                  */
408         u2 i;
409
410 #ifdef STATISTICS
411         if (opt_stat)
412                 count_utf_new++;
413 #endif
414
415         key  = utf_hashkey(text, length);
416         slot = key & (utf_hash.size - 1);
417         u    = utf_hash.ptr[slot];
418
419         /* search external hash chain for utf-symbol */
420         while (u) {
421                 if (u->blength == length) {
422
423                         /* compare text of hashtable elements */
424                         for (i = 0; i < length; i++)
425                                 if (text[i] != u->text[i]) goto nomatch;
426                         
427 #ifdef STATISTICS
428                         if (opt_stat)
429                                 count_utf_new_found++;
430 #endif
431
432                         /* symbol found in hashtable */
433                         return u;
434                 }
435         nomatch:
436                 u = u->hashlink; /* next element in external chain */
437         }
438
439 #ifdef STATISTICS
440         if (opt_stat)
441                 count_utf_len += sizeof(utf) + length;
442 #endif
443
444         /* location in hashtable found, create new utf element */
445         u = NEW(utf);
446         u->blength  = length;               /* length in bytes of utfstring       */
447         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
448         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
449         memcpy(u->text, text, length);      /* copy utf-text                      */
450         u->text[length] = '\0';
451         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
452
453         utf_hash.entries++;                 /* update number of entries           */
454
455         if (utf_hash.entries > (utf_hash.size * 2)) {
456
457         /* reorganization of hashtable, average length of 
458            the external chains is approx. 2                */  
459
460                 u4 i;
461                 utf *u;
462                 hashtable newhash; /* the new hashtable */
463
464                 /* create new hashtable, double the size */
465                 init_hashtable(&newhash, utf_hash.size * 2);
466                 newhash.entries = utf_hash.entries;
467
468 #ifdef STATISTICS
469                 if (opt_stat)
470                         count_utf_len += sizeof(utf*) * utf_hash.size;
471 #endif
472
473                 /* transfer elements to new hashtable */
474                 for (i = 0; i < utf_hash.size; i++) {
475                         u = (utf *) utf_hash.ptr[i];
476                         while (u) {
477                                 utf *nextu = u->hashlink;
478                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
479                                                 
480                                 u->hashlink = (utf *) newhash.ptr[slot];
481                                 newhash.ptr[slot] = u;
482
483                                 /* follow link in external hash chain */
484                                 u = nextu;
485                         }
486                 }
487         
488                 /* dispose old table */
489                 MFREE(utf_hash.ptr, void*, utf_hash.size);
490                 utf_hash = newhash;
491         }
492
493         return u;
494 }
495
496
497 /* utf_new_u2 ******************************************************************
498
499    Make utf symbol from u2 array, if isclassname is true '.' is
500    replaced by '/'.
501
502 *******************************************************************************/
503
504 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
505 {
506         char *buffer;                   /* memory buffer for  unicode characters  */
507         char *pos;                      /* pointer to current position in buffer  */
508         u4 left;                        /* unicode characters left                */
509         u4 buflength;                   /* utf length in bytes of the u2 array    */
510         utf *result;                    /* resulting utf-string                   */
511         int i;          
512
513         /* determine utf length in bytes and allocate memory */
514
515         buflength = u2_utflength(unicode_pos, unicode_length); 
516         buffer    = MNEW(char, buflength);
517  
518         left = buflength;
519         pos  = buffer;
520
521         for (i = 0; i++ < unicode_length; unicode_pos++) {
522                 /* next unicode character */
523                 u2 c = *unicode_pos;
524                 
525                 if ((c != 0) && (c < 0x80)) {
526                         /* 1 character */       
527                         left--;
528                 if ((int) left < 0) break;
529                         /* convert classname */
530                         if (isclassname && c == '.')
531                                 *pos++ = '/';
532                         else
533                                 *pos++ = (char) c;
534
535                 } else if (c < 0x800) {             
536                         /* 2 characters */                              
537                 unsigned char high = c >> 6;
538                 unsigned char low  = c & 0x3F;
539                         left = left - 2;
540                 if ((int) left < 0) break;
541                 *pos++ = high | 0xC0; 
542                 *pos++ = low  | 0x80;     
543
544                 } else {         
545                 /* 3 characters */                              
546                 char low  = c & 0x3f;
547                 char mid  = (c >> 6) & 0x3F;
548                 char high = c >> 12;
549                         left = left - 3;
550                 if ((int) left < 0) break;
551                 *pos++ = high | 0xE0; 
552                 *pos++ = mid  | 0x80;  
553                 *pos++ = low  | 0x80;   
554                 }
555         }
556         
557         /* insert utf-string into symbol-table */
558         result = utf_new(buffer,buflength);
559
560         MFREE(buffer, char, buflength);
561
562         return result;
563 }
564
565
566 /* utf_new_char ****************************************************************
567
568    Creates a new utf symbol, the text for this symbol is passed as a
569    c-string ( = char* ).
570
571 *******************************************************************************/
572
573 utf *utf_new_char(const char *text)
574 {
575         return utf_new(text, strlen(text));
576 }
577
578
579 /* utf_new_char_classname ******************************************************
580
581    Creates a new utf symbol, the text for this symbol is passed as a
582    c-string ( = char* ) "." characters are going to be replaced by
583    "/". Since the above function is used often, this is a separte
584    function, instead of an if.
585
586 *******************************************************************************/
587
588 utf *utf_new_char_classname(const char *text)
589 {
590         if (strchr(text, '.')) {
591                 char *txt = strdup(text);
592                 char *end = txt + strlen(txt);
593                 char *c;
594                 utf *tmpRes;
595
596                 for (c = txt; c < end; c++)
597                         if (*c == '.') *c = '/';
598
599                 tmpRes = utf_new(txt, strlen(txt));
600                 FREE(txt, 0);
601
602                 return tmpRes;
603
604         } else
605                 return utf_new(text, strlen(text));
606 }
607
608
/* utf_nextu2 ******************************************************************

   Read the next unicode character from the utf string and increment
   the utf-string pointer accordingly.

   The upper four bits of the first byte select the form:

     0xC, 0xD...2 bytes, if the second byte is a continuation byte
                (10xxxxxx)
     0xE........3 bytes, if the second and third byte are continuation
                bytes; if only the second one is, 2 bytes are consumed
                and 0 is returned
     others.....1 byte, returned unchanged

   A 2- or 3-byte lead byte that is not followed by a continuation byte
   is malformed: 0 is returned and exactly the lead byte is consumed.
   The pointer therefore always advances by at least one byte, so a
   loop of the form `while (utf_ptr < endpos) utf_nextu2(&utf_ptr);'
   terminates on malformed input as well.

   utf_ptr...in: current position in the utf text
             out: position behind the bytes that were consumed

   Up to three bytes starting at *utf_ptr may be read.

*******************************************************************************/

u2 utf_nextu2(char **utf_ptr)
{
    /* uncompressed unicode character */
    u2 unicode_char = 0;
    /* current position in utf text */
    unsigned char *utf = (unsigned char *) (*utf_ptr);
    /* bytes representing the unicode character */
    unsigned char ch1, ch2, ch3;
    /* number of bytes consumed; a malformed sequence consumes its lead byte */
    int len = 1;

    switch ((ch1 = utf[0]) >> 4) {
    default: /* 1 byte */
        (*utf_ptr)++;
        return (u2) ch1;

    case 0xC:
    case 0xD: /* 2 bytes */
        if (((ch2 = utf[1]) & 0xC0) == 0x80) {
            unsigned char high = ch1 & 0x1F;
            unsigned char low  = ch2 & 0x3F;

            unicode_char = (high << 6) + low;
            len = 2;
        }
        break;

    case 0xE: /* 2 or 3 bytes */
        if (((ch2 = utf[1]) & 0xC0) == 0x80) {
            if (((ch3 = utf[2]) & 0xC0) == 0x80) {
                unsigned char low  = ch3 & 0x3f;
                unsigned char mid  = ch2 & 0x3f;
                unsigned char high = ch1 & 0x0f;

                unicode_char = (((high << 6) + mid) << 6) + low;
                len = 3;
            } else
                len = 2;
        }
        break;
    }

    /* update position in utf-text */
    *utf_ptr = (char *) (utf + len);

    return unicode_char;
}
660
661
662 /* utf_strlen ******************************************************************
663
664    Determine number of unicode characters in the utf string.
665
666 *******************************************************************************/
667
668 u4 utf_strlen(utf *u)
669 {
670         char *endpos;                       /* points behind utf string           */
671         char *utf_ptr;                      /* current position in utf text       */
672         u4 len = 0;                         /* number of unicode characters       */
673
674         if (!u) {
675                 *exceptionptr = new_nullpointerexception();
676                 return 0;
677         }
678
679         endpos = UTF_END(u);
680         utf_ptr = u->text;
681
682         while (utf_ptr < endpos) {
683                 len++;
684                 /* next unicode character */
685                 utf_nextu2(&utf_ptr);
686         }
687
688         if (utf_ptr != endpos)
689                 /* string ended abruptly */
690                 throw_cacao_exception_exit(string_java_lang_InternalError,
691                                                                    "Illegal utf8 string");
692
693         return len;
694 }
695
696
/* u2_utflength ****************************************************************

   Returns the utf length in bytes of a u2 array.

   A character in 0x0001..0x007F counts one byte, 0x0000 and
   0x0080..0x07FF count two bytes, every other character counts three.

   text........first u2 character
   u2_length...number of u2 characters

*******************************************************************************/

u4 u2_utflength(u2 *text, u4 u2_length)
{
    u4 nbytes = 0;                      /* utf length in bytes                */
    u4 i;

    for (i = 0; i < u2_length; i++) {
        u2 c = text[i];

        if (c == 0 || c >= 0x80)
            nbytes += (c < 0x800) ? 2 : 3;
        else
            nbytes += 1;
    }

    return nbytes;
}
724
725
726 /* utf_display *****************************************************************
727
728    Write utf symbol to stdout (for debugging purposes).
729
730 *******************************************************************************/
731
732 void utf_display(utf *u)
733 {
734         char *endpos;                       /* points behind utf string           */
735         char *utf_ptr;                      /* current position in utf text       */
736
737         if (!u) {
738                 printf("NULL");
739                 fflush(stdout);
740                 return;
741         }
742
743         endpos = UTF_END(u);
744         utf_ptr = u->text;
745
746         while (utf_ptr < endpos) {
747                 /* read next unicode character */                
748                 u2 c = utf_nextu2(&utf_ptr);
749                 if (c >= 32 && c <= 127) printf("%c", c);
750                 else printf("?");
751         }
752
753         fflush(stdout);
754 }
755
756
757 /* utf_display_classname *******************************************************
758
759    Write utf symbol to stdout with `/' converted to `.' (for debugging
760    purposes).
761
762 *******************************************************************************/
763
764 void utf_display_classname(utf *u)
765 {
766         char *endpos;                       /* points behind utf string           */
767         char *utf_ptr;                      /* current position in utf text       */
768
769         if (!u) {
770                 printf("NULL");
771                 fflush(stdout);
772                 return;
773         }
774
775         endpos = UTF_END(u);
776         utf_ptr = u->text;
777
778         while (utf_ptr < endpos) {
779                 /* read next unicode character */                
780                 u2 c = utf_nextu2(&utf_ptr);
781                 if (c == '/') c = '.';
782                 if (c >= 32 && c <= 127) printf("%c", c);
783                 else printf("?");
784         }
785
786         fflush(stdout);
787 }
788
789
790 /* utf_sprint ******************************************************************
791         
792    Write utf symbol into c-string (for debugging purposes).
793
794 *******************************************************************************/
795
796 void utf_sprint(char *buffer, utf *u)
797 {
798         char *endpos;                       /* points behind utf string           */
799         char *utf_ptr;                      /* current position in utf text       */
800         u2 pos = 0;                         /* position in c-string               */
801
802         if (!u) {
803                 strcpy(buffer, "NULL");
804                 return;
805         }
806
807         endpos = UTF_END(u);
808         utf_ptr = u->text;
809
810         while (utf_ptr < endpos) 
811                 /* copy next unicode character */       
812                 buffer[pos++] = utf_nextu2(&utf_ptr);
813
814         /* terminate string */
815         buffer[pos] = '\0';
816 }
817
818
819 /* utf_sprint_classname ********************************************************
820         
821    Write utf symbol into c-string with `/' converted to `.' (for debugging
822    purposes).
823
824 *******************************************************************************/
825
826 void utf_sprint_classname(char *buffer, utf *u)
827 {
828         char *endpos;                       /* points behind utf string           */
829         char *utf_ptr;                      /* current position in utf text       */
830         u2 pos = 0;                         /* position in c-string               */
831
832         if (!u) {
833                 strcpy(buffer, "NULL");
834                 return;
835         }
836
837         endpos = UTF_END(u);
838         utf_ptr = u->text;
839
840         while (utf_ptr < endpos) {
841                 /* copy next unicode character */       
842                 u2 c = utf_nextu2(&utf_ptr);
843                 if (c == '/') c = '.';
844                 buffer[pos++] = c;
845         }
846
847         /* terminate string */
848         buffer[pos] = '\0';
849 }
850
851
852 /* utf_strcat ******************************************************************
853         
854    Like libc strcat, but uses an utf8 string.
855
856 *******************************************************************************/
857
858 void utf_strcat(char *buffer, utf *u)
859 {
860         utf_sprint(buffer + strlen(buffer), u);
861 }
862
863
864 /* utf_strcat_classname ********************************************************
865         
866    Like libc strcat, but uses an utf8 string.
867
868 *******************************************************************************/
869
870 void utf_strcat_classname(char *buffer, utf *u)
871 {
872         utf_sprint_classname(buffer + strlen(buffer), u);
873 }
874
875
876 /* utf_fprint ******************************************************************
877         
878    Write utf symbol into file.
879
880 *******************************************************************************/
881
882 void utf_fprint(FILE *file, utf *u)
883 {
884         char *endpos;                       /* points behind utf string           */
885         char *utf_ptr;                      /* current position in utf text       */
886
887         if (!u)
888                 return;
889
890         endpos = UTF_END(u);
891         utf_ptr = u->text;
892
893         while (utf_ptr < endpos) { 
894                 /* read next unicode character */                
895                 u2 c = utf_nextu2(&utf_ptr);                            
896
897                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
898                 else fprintf(file, "?");
899         }
900 }
901
902
903 /* utf_fprint_classname ********************************************************
904         
905    Write utf symbol into file with `/' converted to `.'.
906
907 *******************************************************************************/
908
909 void utf_fprint_classname(FILE *file, utf *u)
910 {
911         char *endpos;                       /* points behind utf string           */
912         char *utf_ptr;                      /* current position in utf text       */
913
914     if (!u)
915                 return;
916
917         endpos = UTF_END(u);
918         utf_ptr = u->text;
919
920         while (utf_ptr < endpos) { 
921                 /* read next unicode character */                
922                 u2 c = utf_nextu2(&utf_ptr);                            
923                 if (c == '/') c = '.';
924
925                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
926                 else fprintf(file, "?");
927         }
928 }
929
930
931 /* is_valid_utf ****************************************************************
932
933    Return true if the given string is a valid UTF-8 string.
934
935    utf_ptr...points to first character
936    end_pos...points after last character
937
938 *******************************************************************************/
939
940 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
941
942 bool is_valid_utf(char *utf_ptr, char *end_pos)
943 {
944         int bytes;
945         int len,i;
946         char c;
947         unsigned long v;
948
949         if (end_pos < utf_ptr) return false;
950         bytes = end_pos - utf_ptr;
951         while (bytes--) {
952                 c = *utf_ptr++;
953
954                 if (!c) return false;                     /* 0x00 is not allowed */
955                 if ((c & 0x80) == 0) continue;            /* ASCII */
956
957                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
958                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
959                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
960                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
961                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
962                 else return false;                        /* invalid leading byte */
963
964                 if (len > 2) return false;                /* Java limitation */
965
966                 v = (unsigned long)c & (0x3f >> len);
967                 
968                 if ((bytes -= len) < 0) return false;     /* missing bytes */
969
970                 for (i = len; i--; ) {
971                         c = *utf_ptr++;
972                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
973                                 return false;
974                         v = (v << 6) | (c & 0x3f);
975                 }
976
977                 if (v == 0) {
978                         if (len != 1) return false;           /* Java special */
979
980                 } else {
981                         /* Sun Java seems to allow overlong UTF-8 encodings */
982                         
983                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
984                                 if (!opt_liberalutf)
985                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
986                                 /* XXX change this to exception? */
987                         }
988                 }
989
990                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
991                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
992
993                 /* even these seem to be allowed */
994                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
995         }
996
997         return true;
998 }
999
1000
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the two-byte encoding of zero.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

   No byte at or behind end_pos is read.

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr) return false; /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20) return false; /* disallow control characters */

		/* disallow zero (0xc0 0x80); the second byte is only inspected
		   when it still belongs to the string, so a trailing 0xc0 never
		   causes a read behind end_pos */

		if (c == 0xc0 && utf_ptr < end_pos
			&& (unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
1028
1029 bool is_valid_name_utf(utf *u)
1030 {
1031         return is_valid_name(u->text, UTF_END(u));
1032 }
1033
1034
1035 /* utf_show ********************************************************************
1036
1037    Writes the utf symbols in the utfhash to stdout and displays the
1038    number of external hash chains grouped according to the chainlength
1039    (for debugging purposes).
1040
1041 *******************************************************************************/
1042
1043 void utf_show(void)
1044 {
1045
1046 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1047
1048         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1049         u4 max_chainlength = 0;      /* maximum length of the chains */
1050         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1051         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1052         u4 i;
1053
1054         printf ("UTF-HASH:\n");
1055
1056         /* show element of utf-hashtable */
1057         for (i=0; i<utf_hash.size; i++) {
1058                 utf *u = utf_hash.ptr[i];
1059                 if (u) {
1060                         printf ("SLOT %d: ", (int) i);
1061                         while (u) {
1062                                 printf ("'");
1063                                 utf_display (u);
1064                                 printf ("' ");
1065                                 u = u->hashlink;
1066                         }       
1067                         printf ("\n");
1068                 }
1069                 
1070         }
1071
1072         printf ("UTF-HASH: %d slots for %d entries\n", 
1073                         (int) utf_hash.size, (int) utf_hash.entries );
1074
1075
1076         if (utf_hash.entries == 0)
1077                 return;
1078
1079         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1080
1081         for (i=0;i<CHAIN_LIMIT;i++)
1082                 chain_count[i]=0;
1083
1084         /* count numbers of hashchains according to their length */
1085         for (i=0; i<utf_hash.size; i++) {
1086                   
1087                 utf *u = (utf*) utf_hash.ptr[i];
1088                 u4 chain_length = 0;
1089
1090                 /* determine chainlength */
1091                 while (u) {
1092                         u = u->hashlink;
1093                         chain_length++;
1094                 }
1095
1096                 /* update sum of all chainlengths */
1097                 sum_chainlength+=chain_length;
1098
1099                 /* determine the maximum length of the chains */
1100                 if (chain_length>max_chainlength)
1101                         max_chainlength = chain_length;
1102
1103                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1104                 if (chain_length>=CHAIN_LIMIT) {
1105                         beyond_limit+=chain_length;
1106                         chain_length=CHAIN_LIMIT-1;
1107                 }
1108
1109                 /* update number of hashchains of current length */
1110                 chain_count[chain_length]++;
1111         }
1112
1113         /* display results */  
1114         for (i=1;i<CHAIN_LIMIT-1;i++) 
1115                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1116           
1117         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1118
1119
1120         printf("max. chainlength:%5d\n",max_chainlength);
1121
1122         /* avg. chainlength = sum of chainlengths / number of chains */
1123         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1124 }
1125
1126
1127 /*
1128  * These are local overrides for various environment variables in Emacs.
1129  * Please do not remove this and leave it at the end of the file, where
1130  * Emacs will automagically detect them.
1131  * ---------------------------------------------------------------------
1132  * Local variables:
1133  * mode: c
1134  * indent-tabs-mode: t
1135  * c-basic-offset: 4
1136  * tab-width: 4
1137  * End:
1138  */