* src/vm/utf8.c (utf_java_lang_AbstractMethodError): Added.
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf8.c - utf8 string functions
2
3    Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6    J. Wenninger, Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23    02110-1301, USA.
24
25    Contact: cacao@cacaojvm.org
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32                         Edwin Steiner
33
34    $Id: utf8.c 5053 2006-06-28 19:11:20Z twisti $
35
36 */
37
38
39 #include "config.h"
40
41 #include <string.h>
42 #include <assert.h>
43
44 #include "vm/types.h"
45
46 #include "mm/memory.h"
47
48 #if defined(ENABLE_THREADS)
49 # include "threads/native/threads.h"
50 #endif
51
52 #include "vm/builtin.h"
53 #include "vm/exceptions.h"
54 #include "vm/hashtable.h"
55 #include "vm/options.h"
56 #include "vm/statistics.h"
57 #include "vm/stringlocal.h"
58 #include "vm/utf8.h"
59
60
61 /* global variables ***********************************************************/
62
63 /* hashsize must be power of 2 */
64
65 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
66
67 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
68
69
70 /* utf-symbols for pointer comparison of frequently used strings **************/
71
72 utf *utf_java_lang_Object;
73
74 utf *utf_java_lang_Class;
75 utf *utf_java_lang_ClassLoader;
76 utf *utf_java_lang_Cloneable;
77 utf *utf_java_lang_SecurityManager;
78 utf *utf_java_lang_String;
79 utf *utf_java_lang_System;
80 utf *utf_java_lang_ThreadGroup;
81 utf *utf_java_io_Serializable;
82
83 utf *utf_java_lang_Throwable;
84 utf *utf_java_lang_VMThrowable;
85 utf *utf_java_lang_Error;
86 utf *utf_java_lang_AbstractMethodError;
87 utf *utf_java_lang_LinkageError;
88 utf *utf_java_lang_NoClassDefFoundError;
89 utf *utf_java_lang_NoSuchMethodError;
90 utf *utf_java_lang_OutOfMemoryError;
91
92 utf *utf_java_lang_Exception;
93 utf *utf_java_lang_ClassNotFoundException;
94 utf *utf_java_lang_IllegalArgumentException;
95 utf *utf_java_lang_IllegalMonitorStateException;
96
97 utf *utf_java_lang_NullPointerException;
98
99 utf* utf_java_lang_Void;
100 utf* utf_java_lang_Boolean;
101 utf* utf_java_lang_Byte;
102 utf* utf_java_lang_Character;
103 utf* utf_java_lang_Short;
104 utf* utf_java_lang_Integer;
105 utf* utf_java_lang_Long;
106 utf* utf_java_lang_Float;
107 utf* utf_java_lang_Double;
108
109 utf *utf_java_lang_StackTraceElement;
110 utf *utf_java_lang_reflect_Constructor;
111 utf *utf_java_lang_reflect_Field;
112 utf *utf_java_lang_reflect_Method;
113 utf *utf_java_util_Vector;
114
115 utf *utf_InnerClasses;                  /* InnerClasses                       */
116 utf *utf_ConstantValue;                 /* ConstantValue                      */
117 utf *utf_Code;                          /* Code                               */
118 utf *utf_Exceptions;                    /* Exceptions                         */
119 utf *utf_LineNumberTable;               /* LineNumberTable                    */
120 utf *utf_SourceFile;                    /* SourceFile                         */
121
122 utf *utf_init;                          /* <init>                             */
123 utf *utf_clinit;                        /* <clinit>                           */
124 utf *utf_clone;                         /* clone                              */
125 utf *utf_finalize;                      /* finalize                           */
126 utf *utf_run;                           /* run                                */
127
128 utf *utf_add;                           /* add                                */
129 utf *utf_remove;                        /* remove                             */
130 utf *utf_put;                           /* put                                */
131 utf *utf_get;                           /* get                                */
132 utf *utf_value;                         /* value                              */
133
134 utf *utf_fillInStackTrace;
135 utf *utf_getSystemClassLoader;
136 utf *utf_loadClass;
137 utf *utf_printStackTrace;
138
139 utf *utf_Z;                             /* Z                                  */
140 utf *utf_B;                             /* B                                  */
141 utf *utf_C;                             /* C                                  */
142 utf *utf_S;                             /* S                                  */
143 utf *utf_I;                             /* I                                  */
144 utf *utf_J;                             /* J                                  */
145 utf *utf_F;                             /* F                                  */
146 utf *utf_D;                             /* D                                  */
147
148 utf *utf_void__void;                    /* ()V                                */
149 utf *utf_boolean__void;                 /* (Z)V                               */
150 utf *utf_byte__void;                    /* (B)V                               */
151 utf *utf_char__void;                    /* (C)V                               */
152 utf *utf_short__void;                   /* (S)V                               */
153 utf *utf_int__void;                     /* (I)V                               */
154 utf *utf_long__void;                    /* (J)V                               */
155 utf *utf_float__void;                   /* (F)V                               */
156 utf *utf_double__void;                  /* (D)V                               */
157
158 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
159 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
160 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
161 utf *utf_java_lang_Object__java_lang_Object;
162 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
163 utf *utf_java_lang_String__java_lang_Class;
164 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
165
166 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
167 utf *utf_null;
168 utf *array_packagename;
169
170
171 /* utf_init ********************************************************************
172
173    Initializes the utf8 subsystem.
174
175 *******************************************************************************/
176
177 bool utf8_init(void)
178 {
179         /* create utf8 hashtable */
180
181         hashtable_utf = NEW(hashtable);
182
183         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
184
185 #if defined(ENABLE_STATISTICS)
186         if (opt_stat)
187                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
188 #endif
189
190         /* create utf-symbols for pointer comparison of frequently used strings */
191
192         utf_java_lang_Object           = utf_new_char("java/lang/Object");
193
194         utf_java_lang_Class            = utf_new_char("java/lang/Class");
195         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
196         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
197         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
198         utf_java_lang_String           = utf_new_char("java/lang/String");
199         utf_java_lang_System           = utf_new_char("java/lang/System");
200         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
201         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
202
203         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
204         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
205         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
206
207         utf_java_lang_AbstractMethodError =
208                 utf_new_char(string_java_lang_AbstractMethodError);
209
210         utf_java_lang_LinkageError =
211                 utf_new_char(string_java_lang_LinkageError);
212
213         utf_java_lang_NoClassDefFoundError =
214                 utf_new_char(string_java_lang_NoClassDefFoundError);
215
216         utf_java_lang_NoSuchMethodError =
217                 utf_new_char(string_java_lang_NoSuchMethodError);
218
219         utf_java_lang_OutOfMemoryError =
220                 utf_new_char(string_java_lang_OutOfMemoryError);
221
222         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
223
224         utf_java_lang_ClassNotFoundException =
225                 utf_new_char(string_java_lang_ClassNotFoundException);
226
227         utf_java_lang_IllegalArgumentException =
228                 utf_new_char(string_java_lang_IllegalArgumentException);
229
230         utf_java_lang_IllegalMonitorStateException =
231                 utf_new_char(string_java_lang_IllegalMonitorStateException);
232
233         utf_java_lang_NullPointerException =
234                 utf_new_char(string_java_lang_NullPointerException);
235
236         utf_java_lang_Void             = utf_new_char("java/lang/Void");
237         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
238         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
239         utf_java_lang_Character        = utf_new_char("java/lang/Character");
240         utf_java_lang_Short            = utf_new_char("java/lang/Short");
241         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
242         utf_java_lang_Long             = utf_new_char("java/lang/Long");
243         utf_java_lang_Float            = utf_new_char("java/lang/Float");
244         utf_java_lang_Double           = utf_new_char("java/lang/Double");
245
246         utf_java_lang_StackTraceElement =
247                 utf_new_char("java/lang/StackTraceElement");
248
249         utf_java_lang_reflect_Constructor =
250                 utf_new_char("java/lang/reflect/Constructor");
251
252         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
253         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
254         utf_java_util_Vector           = utf_new_char("java/util/Vector");
255
256         utf_InnerClasses               = utf_new_char("InnerClasses");
257         utf_ConstantValue              = utf_new_char("ConstantValue");
258         utf_Code                       = utf_new_char("Code");
259         utf_Exceptions                 = utf_new_char("Exceptions");
260         utf_LineNumberTable            = utf_new_char("LineNumberTable");
261         utf_SourceFile                 = utf_new_char("SourceFile");
262
263         utf_init                           = utf_new_char("<init>");
264         utf_clinit                         = utf_new_char("<clinit>");
265         utf_clone                      = utf_new_char("clone");
266         utf_finalize                   = utf_new_char("finalize");
267         utf_run                        = utf_new_char("run");
268
269         utf_add                        = utf_new_char("add");
270         utf_remove                     = utf_new_char("remove");
271         utf_put                        = utf_new_char("put");
272         utf_get                        = utf_new_char("get");
273         utf_value                      = utf_new_char("value");
274
275         utf_printStackTrace            = utf_new_char("printStackTrace");
276         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
277         utf_loadClass                  = utf_new_char("loadClass");
278         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
279
280         utf_Z                          = utf_new_char("Z");
281         utf_B                          = utf_new_char("B");
282         utf_C                          = utf_new_char("C");
283         utf_S                          = utf_new_char("S");
284         utf_I                          = utf_new_char("I");
285         utf_J                          = utf_new_char("J");
286         utf_F                          = utf_new_char("F");
287         utf_D                          = utf_new_char("D");
288
289         utf_void__void                 = utf_new_char("()V");
290         utf_boolean__void              = utf_new_char("(Z)V");
291         utf_byte__void                 = utf_new_char("(B)V");
292         utf_char__void                 = utf_new_char("(C)V");
293         utf_short__void                = utf_new_char("(S)V");
294         utf_int__void                  = utf_new_char("(I)V");
295         utf_long__void                 = utf_new_char("(J)V");
296         utf_float__void                = utf_new_char("(F)V");
297         utf_double__void               = utf_new_char("(D)V");
298         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
299         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
300
301         utf_void__java_lang_ClassLoader =
302                 utf_new_char("()Ljava/lang/ClassLoader;");
303
304         utf_java_lang_Object__java_lang_Object =
305                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
306
307         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
308
309         utf_java_lang_String__java_lang_Class =
310                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
311
312         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
313
314         utf_null                       = utf_new_char("null");
315         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
316         array_packagename              = utf_new_char("\t<the array package>");
317
318         /* everything's ok */
319
320         return true;
321 }
322
323
324 /* utf_hashkey *****************************************************************
325
326    The hashkey is computed from the utf-text by using up to 8
327    characters.  For utf-symbols longer than 15 characters 3 characters
328    are taken from the beginning and the end, 2 characters are taken
329    from the middle.
330
331 *******************************************************************************/
332
333 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
334 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
335
336 u4 utf_hashkey(const char *text, u4 length)
337 {
338         const char *start_pos = text;       /* pointer to utf text                */
339         u4 a;
340
341         switch (length) {
342         case 0: /* empty string */
343                 return 0;
344
345         case 1: return fbs(0);
346         case 2: return fbs(0) ^ nbs(3);
347         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
348         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
349         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
350         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
351         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
352         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
353
354         case 9:
355                 a = fbs(0);
356                 a ^= nbs(1);
357                 a ^= nbs(2);
358                 text++;
359                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
360
361         case 10:
362                 a = fbs(0);
363                 text++;
364                 a ^= nbs(2);
365                 a ^= nbs(3);
366                 a ^= nbs(4);
367                 text++;
368                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
369
370         case 11:
371                 a = fbs(0);
372                 text++;
373                 a ^= nbs(2);
374                 a ^= nbs(3);
375                 a ^= nbs(4);
376                 text++;
377                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
378
379         case 12:
380                 a = fbs(0);
381                 text += 2;
382                 a ^= nbs(2);
383                 a ^= nbs(3);
384                 text++;
385                 a ^= nbs(5);
386                 a ^= nbs(6);
387                 a ^= nbs(7);
388                 text++;
389                 return a ^ nbs(9) ^ nbs(10);
390
391         case 13:
392                 a = fbs(0);
393                 a ^= nbs(1);
394                 text++;
395                 a ^= nbs(3);
396                 a ^= nbs(4);
397                 text += 2;      
398                 a ^= nbs(7);
399                 a ^= nbs(8);
400                 text += 2;
401                 return a ^ nbs(9) ^ nbs(10);
402
403         case 14:
404                 a = fbs(0);
405                 text += 2;      
406                 a ^= nbs(3);
407                 a ^= nbs(4);
408                 text += 2;      
409                 a ^= nbs(7);
410                 a ^= nbs(8);
411                 text += 2;
412                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
413
414         case 15:
415                 a = fbs(0);
416                 text += 2;      
417                 a ^= nbs(3);
418                 a ^= nbs(4);
419                 text += 2;      
420                 a ^= nbs(7);
421                 a ^= nbs(8);
422                 text += 2;
423                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
424
425         default:  /* 3 characters from beginning */
426                 a = fbs(0);
427                 text += 2;
428                 a ^= nbs(3);
429                 a ^= nbs(4);
430
431                 /* 2 characters from middle */
432                 text = start_pos + (length / 2);
433                 a ^= fbs(5);
434                 text += 2;
435                 a ^= nbs(6);    
436
437                 /* 3 characters from end */
438                 text = start_pos + length - 4;
439
440                 a ^= fbs(7);
441                 text++;
442
443                 return a ^ nbs(10) ^ nbs(11);
444     }
445 }
446
447 /* utf_full_hashkey ************************************************************
448
449    This function computes a hash value using all bytes in the string.
450
451    The algorithm is the "One-at-a-time" algorithm as published
452    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
453
454 *******************************************************************************/
455
456 u4 utf_full_hashkey(const char *text, u4 length)
457 {
458         register const unsigned char *p = (const unsigned char *) text;
459         register u4 hash;
460         register u4 i;
461
462         hash = 0;
463         for (i=length; i--;)
464         {
465             hash += *p++;
466             hash += (hash << 10);
467             hash ^= (hash >> 6);
468         }
469         hash += (hash << 3);
470         hash ^= (hash >> 11);
471         hash += (hash << 15);
472
473         return hash;
474 }
475
476 /* unicode_hashkey *************************************************************
477
478    Compute the hashkey of a unicode string.
479
480 *******************************************************************************/
481
482 u4 unicode_hashkey(u2 *text, u2 len)
483 {
484         return utf_hashkey((char *) text, len);
485 }
486
487
488 /* utf_new *********************************************************************
489
490    Creates a new utf-symbol, the text of the symbol is passed as a
491    u1-array. The function searches the utf-hashtable for a utf-symbol
492    with this text. On success the element returned, otherwise a new
493    hashtable element is created.
494
495    If the number of entries in the hashtable exceeds twice the size of
496    the hashtable slots a reorganization of the hashtable is done and
497    the utf symbols are copied to a new hashtable with doubled size.
498
499 *******************************************************************************/
500
501 utf *utf_new(const char *text, u2 length)
502 {
503         u4 key;                             /* hashkey computed from utf-text     */
504         u4 slot;                            /* slot in hashtable                  */
505         utf *u;                             /* hashtable element                  */
506         u2 i;
507
508 #if defined(ENABLE_THREADS)
509         builtin_monitorenter(hashtable_utf->header);
510 #endif
511
512 #if defined(ENABLE_STATISTICS)
513         if (opt_stat)
514                 count_utf_new++;
515 #endif
516
517         key  = utf_hashkey(text, length);
518         slot = key & (hashtable_utf->size - 1);
519         u    = hashtable_utf->ptr[slot];
520
521         /* search external hash chain for utf-symbol */
522
523         while (u) {
524                 if (u->blength == length) {
525                         /* compare text of hashtable elements */
526
527                         for (i = 0; i < length; i++)
528                                 if (text[i] != u->text[i])
529                                         goto nomatch;
530                         
531 #if defined(ENABLE_STATISTICS)
532                         if (opt_stat)
533                                 count_utf_new_found++;
534 #endif
535
536                         /* symbol found in hashtable */
537
538 #if defined(ENABLE_THREADS)
539                         builtin_monitorexit(hashtable_utf->header);
540 #endif
541
542                         return u;
543                 }
544
545         nomatch:
546                 u = u->hashlink; /* next element in external chain */
547         }
548
549 #if defined(ENABLE_STATISTICS)
550         if (opt_stat)
551                 count_utf_len += sizeof(utf) + length + 1;
552 #endif
553
554         /* location in hashtable found, create new utf element */
555         u = NEW(utf);
556         u->blength  = length;               /* length in bytes of utfstring       */
557         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
558         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
559
560         memcpy(u->text, text, length);      /* copy utf-text                      */
561         u->text[length] = '\0';
562
563         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
564         hashtable_utf->entries++;           /* update number of entries           */
565
566         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
567
568         /* reorganization of hashtable, average length of the external
569            chains is approx. 2 */
570
571                 hashtable *newhash;                              /* the new hashtable */
572                 u4         i;
573                 utf       *u;
574                 utf       *nextu;
575                 u4         slot;
576
577                 /* create new hashtable, double the size */
578
579                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
580
581 #if defined(ENABLE_STATISTICS)
582                 if (opt_stat)
583                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
584 #endif
585
586                 /* transfer elements to new hashtable */
587
588                 for (i = 0; i < hashtable_utf->size; i++) {
589                         u = hashtable_utf->ptr[i];
590
591                         while (u) {
592                                 nextu = u->hashlink;
593                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
594                                                 
595                                 u->hashlink = (utf *) newhash->ptr[slot];
596                                 newhash->ptr[slot] = u;
597
598                                 /* follow link in external hash chain */
599
600                                 u = nextu;
601                         }
602                 }
603         
604                 /* dispose old table */
605
606                 hashtable_free(hashtable_utf);
607
608                 hashtable_utf = newhash;
609         }
610
611 #if defined(ENABLE_THREADS)
612         builtin_monitorexit(hashtable_utf->header);
613 #endif
614
615         return u;
616 }
617
618
619 /* utf_new_u2 ******************************************************************
620
621    Make utf symbol from u2 array, if isclassname is true '.' is
622    replaced by '/'.
623
624 *******************************************************************************/
625
626 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
627 {
628         char *buffer;                   /* memory buffer for  unicode characters  */
629         char *pos;                      /* pointer to current position in buffer  */
630         u4 left;                        /* unicode characters left                */
631         u4 buflength;                   /* utf length in bytes of the u2 array    */
632         utf *result;                    /* resulting utf-string                   */
633         int i;          
634
635         /* determine utf length in bytes and allocate memory */
636
637         buflength = u2_utflength(unicode_pos, unicode_length); 
638         buffer    = MNEW(char, buflength);
639  
640         left = buflength;
641         pos  = buffer;
642
643         for (i = 0; i++ < unicode_length; unicode_pos++) {
644                 /* next unicode character */
645                 u2 c = *unicode_pos;
646                 
647                 if ((c != 0) && (c < 0x80)) {
648                         /* 1 character */       
649                         left--;
650                 if ((int) left < 0) break;
651                         /* convert classname */
652                         if (isclassname && c == '.')
653                                 *pos++ = '/';
654                         else
655                                 *pos++ = (char) c;
656
657                 } else if (c < 0x800) {             
658                         /* 2 characters */                              
659                 unsigned char high = c >> 6;
660                 unsigned char low  = c & 0x3F;
661                         left = left - 2;
662                 if ((int) left < 0) break;
663                 *pos++ = high | 0xC0; 
664                 *pos++ = low  | 0x80;     
665
666                 } else {         
667                 /* 3 characters */                              
668                 char low  = c & 0x3f;
669                 char mid  = (c >> 6) & 0x3F;
670                 char high = c >> 12;
671                         left = left - 3;
672                 if ((int) left < 0) break;
673                 *pos++ = high | 0xE0; 
674                 *pos++ = mid  | 0x80;  
675                 *pos++ = low  | 0x80;   
676                 }
677         }
678         
679         /* insert utf-string into symbol-table */
680         result = utf_new(buffer,buflength);
681
682         MFREE(buffer, char, buflength);
683
684         return result;
685 }
686
687
688 /* utf_new_char ****************************************************************
689
690    Creates a new utf symbol, the text for this symbol is passed as a
691    c-string ( = char* ).
692
693 *******************************************************************************/
694
695 utf *utf_new_char(const char *text)
696 {
697         return utf_new(text, strlen(text));
698 }
699
700
701 /* utf_new_char_classname ******************************************************
702
703    Creates a new utf symbol, the text for this symbol is passed as a
704    c-string ( = char* ) "." characters are going to be replaced by
705    "/". Since the above function is used often, this is a separte
706    function, instead of an if.
707
708 *******************************************************************************/
709
710 utf *utf_new_char_classname(const char *text)
711 {
712         if (strchr(text, '.')) {
713                 char *txt = strdup(text);
714                 char *end = txt + strlen(txt);
715                 char *c;
716                 utf *tmpRes;
717
718                 for (c = txt; c < end; c++)
719                         if (*c == '.') *c = '/';
720
721                 tmpRes = utf_new(txt, strlen(txt));
722                 FREE(txt, 0);
723
724                 return tmpRes;
725
726         } else
727                 return utf_new(text, strlen(text));
728 }
729
730
731 /* utf_nextu2 ******************************************************************
732
733    Read the next unicode character from the utf string and increment
734    the utf-string pointer accordingly.
735
736 *******************************************************************************/
737
738 u2 utf_nextu2(char **utf_ptr)
739 {
740     /* uncompressed unicode character */
741     u2 unicode_char = 0;
742     /* current position in utf text */  
743     unsigned char *utf = (unsigned char *) (*utf_ptr);
744     /* bytes representing the unicode character */
745     unsigned char ch1, ch2, ch3;
746     /* number of bytes used to represent the unicode character */
747     int len = 0;
748         
749     switch ((ch1 = utf[0]) >> 4) {
750         default: /* 1 byte */
751                 (*utf_ptr)++;
752                 return (u2) ch1;
753         case 0xC: 
754         case 0xD: /* 2 bytes */
755                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
756                         unsigned char high = ch1 & 0x1F;
757                         unsigned char low  = ch2 & 0x3F;
758                         unicode_char = (high << 6) + low;
759                         len = 2;
760                 }
761                 break;
762
763         case 0xE: /* 2 or 3 bytes */
764                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
765                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
766                                 unsigned char low  = ch3 & 0x3f;
767                                 unsigned char mid  = ch2 & 0x3f;
768                                 unsigned char high = ch1 & 0x0f;
769                                 unicode_char = (((high << 6) + mid) << 6) + low;
770                                 len = 3;
771                         } else
772                                 len = 2;                                           
773                 }
774                 break;
775     }
776
777     /* update position in utf-text */
778     *utf_ptr = (char *) (utf + len);
779
780     return unicode_char;
781 }
782
783
784 /* utf_bytes *******************************************************************
785
786    Determine number of bytes (aka. octets) in the utf string.
787
788    IN:
789       u............utf string
790
791    OUT:
792       The number of octets of this utf string.
793           There is _no_ terminating zero included in this count.
794
795 *******************************************************************************/
796
797 u4 utf_bytes(utf *u)
798 {
799         return u->blength;
800 }
801
802 /* utf_get_number_of_u2s_for_buffer ********************************************
803
804    Determine number of UTF-16 u2s in the given UTF-8 buffer
805
806    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
807    to an array of u2s (UTF-16) and want to know how many of them you will get.
808    All other uses of this function are probably wrong.
809
810    IN:
811       buffer........points to first char in buffer
812           blength.......number of _bytes_ in the buffer
813
814    OUT:
815       the number of u2s needed to hold this string in UTF-16 encoding.
816           There is _no_ terminating zero included in this count.
817
818    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
819    exception.
820
821 *******************************************************************************/
822
823 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
824 {
825         const char *endpos;                 /* points behind utf string           */
826         const char *utf_ptr;                /* current position in utf text       */
827         u4 len = 0;                         /* number of unicode characters       */
828
829         utf_ptr = buffer;
830         endpos = utf_ptr + blength;
831
832         while (utf_ptr < endpos) {
833                 len++;
834                 /* next unicode character */
835                 utf_nextu2((char **)&utf_ptr);
836         }
837
838         assert(utf_ptr == endpos);
839
840         return len;
841 }
842
843
844 /* utf_get_number_of_u2s *******************************************************
845
846    Determine number of UTF-16 u2s in the utf string.
847
848    CAUTION: Use this function *only* when you want to convert a utf string
849    to an array of u2s and want to know how many of them you will get.
850    All other uses of this function are probably wrong.
851
852    IN:
853       u............utf string
854
855    OUT:
856       the number of u2s needed to hold this string in UTF-16 encoding.
857           There is _no_ terminating zero included in this count.
858           XXX 0 if a NullPointerException has been thrown (see below)
859
860 *******************************************************************************/
861
862 u4 utf_get_number_of_u2s(utf *u)
863 {
864         char *endpos;                       /* points behind utf string           */
865         char *utf_ptr;                      /* current position in utf text       */
866         u4 len = 0;                         /* number of unicode characters       */
867
868         /* XXX this is probably not checked by most callers! Review this after */
869         /* the invalid uses of this function have been eliminated */
870         if (!u) {
871                 exceptions_throw_nullpointerexception();
872                 return 0;
873         }
874
875         endpos = UTF_END(u);
876         utf_ptr = u->text;
877
878         while (utf_ptr < endpos) {
879                 len++;
880                 /* next unicode character */
881                 utf_nextu2(&utf_ptr);
882         }
883
884         if (utf_ptr != endpos)
885                 /* string ended abruptly */
886                 throw_cacao_exception_exit(string_java_lang_InternalError,
887                                                                    "Illegal utf8 string");
888
889         return len;
890 }
891
892
893 /* u2_utflength ****************************************************************
894
895    Returns the utf length in bytes of a u2 array.
896
897 *******************************************************************************/
898
899 u4 u2_utflength(u2 *text, u4 u2_length)
900 {
901         u4 result_len = 0;                  /* utf length in bytes                */
902         u2 ch;                              /* current unicode character          */
903         u4 len;
904         
905         for (len = 0; len < u2_length; len++) {
906                 /* next unicode character */
907                 ch = *text++;
908           
909                 /* determine bytes required to store unicode character as utf */
910                 if (ch && (ch < 0x80)) 
911                         result_len++;
912                 else if (ch < 0x800)
913                         result_len += 2;        
914                 else 
915                         result_len += 3;        
916         }
917
918     return result_len;
919 }
920
921
922 /* utf_copy ********************************************************************
923
924    Copy the given utf string byte-for-byte to a buffer.
925
926    IN:
927       buffer.......the buffer
928           u............the utf string
929
930 *******************************************************************************/
931
932 void utf_copy(char *buffer, utf *u)
933 {
934         /* our utf strings are zero-terminated (done by utf_new) */
935         MCOPY(buffer, u->text, char, u->blength + 1);
936 }
937
938
939 /* utf_cat *********************************************************************
940
941    Append the given utf string byte-for-byte to a buffer.
942
943    IN:
944       buffer.......the buffer
945           u............the utf string
946
947 *******************************************************************************/
948
949 void utf_cat(char *buffer, utf *u)
950 {
951         /* our utf strings are zero-terminated (done by utf_new) */
952         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
953 }
954
955
956 /* utf_copy_classname **********************************************************
957
958    Copy the given utf classname byte-for-byte to a buffer.
959    '/' is replaced by '.'
960
961    IN:
962       buffer.......the buffer
963           u............the utf string
964
965 *******************************************************************************/
966
967 void utf_copy_classname(char *buffer, utf *u)
968 {
969         char *bufptr;
970         char *srcptr;
971         char *endptr;
972         char ch;
973
974         bufptr = buffer;
975         srcptr = u->text;
976         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
977
978         while (srcptr != endptr) {
979                 ch = *srcptr++;
980                 if (ch == '/')
981                         ch = '.';
982                 *bufptr++ = ch;
983         }
984 }
985
986
987 /* utf_cat *********************************************************************
988
989    Append the given utf classname byte-for-byte to a buffer.
990    '/' is replaced by '.'
991
992    IN:
993       buffer.......the buffer
994           u............the utf string
995
996 *******************************************************************************/
997
998 void utf_cat_classname(char *buffer, utf *u)
999 {
1000         utf_copy_classname(buffer + strlen(buffer), u);
1001 }
1002
1003 /* utf_display_printable_ascii *************************************************
1004
1005    Write utf symbol to stdout (for debugging purposes).
1006    Non-printable and non-ASCII characters are printed as '?'.
1007
1008 *******************************************************************************/
1009
1010 void utf_display_printable_ascii(utf *u)
1011 {
1012         char *endpos;                       /* points behind utf string           */
1013         char *utf_ptr;                      /* current position in utf text       */
1014
1015         if (u == NULL) {
1016                 printf("NULL");
1017                 fflush(stdout);
1018                 return;
1019         }
1020
1021         endpos = UTF_END(u);
1022         utf_ptr = u->text;
1023
1024         while (utf_ptr < endpos) {
1025                 /* read next unicode character */
1026
1027                 u2 c = utf_nextu2(&utf_ptr);
1028
1029                 if ((c >= 32) && (c <= 127))
1030                         printf("%c", c);
1031                 else
1032                         printf("?");
1033         }
1034
1035         fflush(stdout);
1036 }
1037
1038
1039 /* utf_display_printable_ascii_classname ***************************************
1040
1041    Write utf symbol to stdout with `/' converted to `.' (for debugging
1042    purposes).
1043    Non-printable and non-ASCII characters are printed as '?'.
1044
1045 *******************************************************************************/
1046
1047 void utf_display_printable_ascii_classname(utf *u)
1048 {
1049         char *endpos;                       /* points behind utf string           */
1050         char *utf_ptr;                      /* current position in utf text       */
1051
1052         if (u == NULL) {
1053                 printf("NULL");
1054                 fflush(stdout);
1055                 return;
1056         }
1057
1058         endpos = UTF_END(u);
1059         utf_ptr = u->text;
1060
1061         while (utf_ptr < endpos) {
1062                 /* read next unicode character */
1063
1064                 u2 c = utf_nextu2(&utf_ptr);
1065
1066                 if (c == '/')
1067                         c = '.';
1068
1069                 if ((c >= 32) && (c <= 127))
1070                         printf("%c", c);
1071                 else
1072                         printf("?");
1073         }
1074
1075         fflush(stdout);
1076 }
1077
1078
1079 /* utf_sprint_convert_to_latin1 ************************************************
1080         
1081    Write utf symbol into c-string (for debugging purposes).
1082    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1083    invalid results.
1084
1085 *******************************************************************************/
1086
1087 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1088 {
1089         char *endpos;                       /* points behind utf string           */
1090         char *utf_ptr;                      /* current position in utf text       */
1091         u2 pos = 0;                         /* position in c-string               */
1092
1093         if (!u) {
1094                 strcpy(buffer, "NULL");
1095                 return;
1096         }
1097
1098         endpos = UTF_END(u);
1099         utf_ptr = u->text;
1100
1101         while (utf_ptr < endpos) 
1102                 /* copy next unicode character */       
1103                 buffer[pos++] = utf_nextu2(&utf_ptr);
1104
1105         /* terminate string */
1106         buffer[pos] = '\0';
1107 }
1108
1109
1110 /* utf_sprint_convert_to_latin1_classname **************************************
1111         
1112    Write utf symbol into c-string with `/' converted to `.' (for debugging
1113    purposes).
1114    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1115    invalid results.
1116
1117 *******************************************************************************/
1118
1119 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1120 {
1121         char *endpos;                       /* points behind utf string           */
1122         char *utf_ptr;                      /* current position in utf text       */
1123         u2 pos = 0;                         /* position in c-string               */
1124
1125         if (!u) {
1126                 strcpy(buffer, "NULL");
1127                 return;
1128         }
1129
1130         endpos = UTF_END(u);
1131         utf_ptr = u->text;
1132
1133         while (utf_ptr < endpos) {
1134                 /* copy next unicode character */       
1135                 u2 c = utf_nextu2(&utf_ptr);
1136                 if (c == '/') c = '.';
1137                 buffer[pos++] = c;
1138         }
1139
1140         /* terminate string */
1141         buffer[pos] = '\0';
1142 }
1143
1144
1145 /* utf_strcat_convert_to_latin1 ************************************************
1146         
1147    Like libc strcat, but uses an utf8 string.
1148    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1149    invalid results.
1150
1151 *******************************************************************************/
1152
1153 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1154 {
1155         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1156 }
1157
1158
1159 /* utf_strcat_convert_to_latin1_classname **************************************
1160         
1161    Like libc strcat, but uses an utf8 string.
1162    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1163    invalid results.
1164
1165 *******************************************************************************/
1166
1167 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1168 {
1169         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1170 }
1171
1172
1173 /* utf_fprint_printable_ascii **************************************************
1174         
1175    Write utf symbol into file.
1176    Non-printable and non-ASCII characters are printed as '?'.
1177
1178 *******************************************************************************/
1179
1180 void utf_fprint_printable_ascii(FILE *file, utf *u)
1181 {
1182         char *endpos;                       /* points behind utf string           */
1183         char *utf_ptr;                      /* current position in utf text       */
1184
1185         if (!u)
1186                 return;
1187
1188         endpos = UTF_END(u);
1189         utf_ptr = u->text;
1190
1191         while (utf_ptr < endpos) { 
1192                 /* read next unicode character */                
1193                 u2 c = utf_nextu2(&utf_ptr);                            
1194
1195                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1196                 else fprintf(file, "?");
1197         }
1198 }
1199
1200
1201 /* utf_fprint_printable_ascii_classname ****************************************
1202         
1203    Write utf symbol into file with `/' converted to `.'.
1204    Non-printable and non-ASCII characters are printed as '?'.
1205
1206 *******************************************************************************/
1207
1208 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1209 {
1210         char *endpos;                       /* points behind utf string           */
1211         char *utf_ptr;                      /* current position in utf text       */
1212
1213     if (!u)
1214                 return;
1215
1216         endpos = UTF_END(u);
1217         utf_ptr = u->text;
1218
1219         while (utf_ptr < endpos) { 
1220                 /* read next unicode character */                
1221                 u2 c = utf_nextu2(&utf_ptr);                            
1222                 if (c == '/') c = '.';
1223
1224                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1225                 else fprintf(file, "?");
1226         }
1227 }
1228
1229
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string in the sense
   accepted for Java classfiles.

   utf_ptr...points to first character
   end_pos...points after last character

   Rejected are: end_pos before utf_ptr, a 0x00 byte, leading bytes that
   start a sequence of more than three bytes (Java limitation) or that are
   no leading bytes at all, sequences cut off by the end of the string,
   malformed continuation bytes, and a zero encoded in three bytes.

   Deliberately accepted are: the two-byte encoding of zero (Java special),
   overlong encodings (Sun Java seems to allow them), surrogates, and the
   codepoints 0xfffe and 0xffff (these seem to be allowed in Java
   classfiles).

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int           remaining;            /* bytes that are still unread        */
	int           trailing;             /* continuation bytes of a sequence   */
	int           k;
	char          byte;
	unsigned long codepoint;

	if (end_pos < utf_ptr)
		return false;

	remaining = end_pos - utf_ptr;

	while (remaining--) {
		byte = *utf_ptr++;

		if (!byte)
			return false;                   /* 0x00 is not allowed            */

		if ((byte & 0x80) == 0)
			continue;                       /* ASCII                          */

		if ((byte & 0xe0) == 0xc0)
			trailing = 1;                   /* 110x xxxx                      */
		else if ((byte & 0xf0) == 0xe0)
			trailing = 2;                   /* 1110 xxxx                      */
		else
			return false;                   /* longer than Java allows, or an
			                                   invalid leading byte           */

		/* payload bits of the leading byte */

		codepoint = (unsigned long) byte & (0x3f >> trailing);

		if ((remaining -= trailing) < 0)
			return false;                   /* missing bytes                  */

		for (k = 0; k < trailing; k++) {
			byte = *utf_ptr++;

			if ((byte & 0xc0) != 0x80)      /* 10xx xxxx                      */
				return false;

			codepoint = (codepoint << 6) | (byte & 0x3f);
		}

		/* zero may only show up in its two-byte form (Java special) */

		if ((codepoint == 0) && (trailing != 1))
			return false;
	}

	return true;
}
1295
1296
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control characters
   and the two-byte encoding of zero.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

   No byte at or behind end_pos is ever read.

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                       /* disallow empty names           */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20)
			return false;                   /* disallow control characters    */

		/* disallow zero (0xc0 0x80); the second byte is only inspected if */
		/* it still belongs to the string, so a 0xc0 in the last position  */
		/* does not cause a read behind end_pos                            */

		if ((c == 0xc0) && (utf_ptr < end_pos) &&
			((unsigned char) *utf_ptr == 0x80))
			return false;
	}

	return true;
}
1324
1325 bool is_valid_name_utf(utf *u)
1326 {
1327         return is_valid_name(u->text, UTF_END(u));
1328 }
1329
1330
1331 /* utf_show ********************************************************************
1332
1333    Writes the utf symbols in the utfhash to stdout and displays the
1334    number of external hash chains grouped according to the chainlength
1335    (for debugging purposes).
1336
1337 *******************************************************************************/
1338
1339 #if !defined(NDEBUG)
1340 void utf_show(void)
1341 {
1342
1343 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1344
1345         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1346         u4 max_chainlength = 0;      /* maximum length of the chains */
1347         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1348         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1349         u4 i;
1350
1351         printf("UTF-HASH:\n");
1352
1353         /* show element of utf-hashtable */
1354
1355         for (i = 0; i < hashtable_utf->size; i++) {
1356                 utf *u = hashtable_utf->ptr[i];
1357
1358                 if (u) {
1359                         printf("SLOT %d: ", (int) i);
1360
1361                         while (u) {
1362                                 printf("'");
1363                                 utf_display_printable_ascii(u);
1364                                 printf("' ");
1365                                 u = u->hashlink;
1366                         }       
1367                         printf("\n");
1368                 }
1369         }
1370
1371         printf("UTF-HASH: %d slots for %d entries\n", 
1372                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1373
1374         if (hashtable_utf->entries == 0)
1375                 return;
1376
1377         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1378
1379         for (i=0;i<CHAIN_LIMIT;i++)
1380                 chain_count[i]=0;
1381
1382         /* count numbers of hashchains according to their length */
1383         for (i=0; i<hashtable_utf->size; i++) {
1384                   
1385                 utf *u = (utf*) hashtable_utf->ptr[i];
1386                 u4 chain_length = 0;
1387
1388                 /* determine chainlength */
1389                 while (u) {
1390                         u = u->hashlink;
1391                         chain_length++;
1392                 }
1393
1394                 /* update sum of all chainlengths */
1395                 sum_chainlength+=chain_length;
1396
1397                 /* determine the maximum length of the chains */
1398                 if (chain_length>max_chainlength)
1399                         max_chainlength = chain_length;
1400
1401                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1402                 if (chain_length>=CHAIN_LIMIT) {
1403                         beyond_limit+=chain_length;
1404                         chain_length=CHAIN_LIMIT-1;
1405                 }
1406
1407                 /* update number of hashchains of current length */
1408                 chain_count[chain_length]++;
1409         }
1410
1411         /* display results */  
1412         for (i=1;i<CHAIN_LIMIT-1;i++) 
1413                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1414           
1415         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1416
1417
1418         printf("max. chainlength:%5d\n",max_chainlength);
1419
1420         /* avg. chainlength = sum of chainlengths / number of chains */
1421         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1422 }
1423 #endif /* !defined(NDEBUG) */
1424
1425
1426 /*
1427  * These are local overrides for various environment variables in Emacs.
1428  * Please do not remove this and leave it at the end of the file, where
1429  * Emacs will automagically detect them.
1430  * ---------------------------------------------------------------------
1431  * Local variables:
1432  * mode: c
1433  * indent-tabs-mode: t
1434  * c-basic-offset: 4
1435  * tab-width: 4
1436  * End:
1437  * vim:noexpandtab:sw=4:ts=4:
1438  */