* added exceptions_new_linkageerror
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf8.c - utf functions
2
3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
6    Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23    02111-1307, USA.
24
25    Contact: cacao@complang.tuwien.ac.at
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 3807 2005-11-26 21:51:11Z edwin $
34
35 */
36
37
38 #include <string.h>
39 #include <assert.h>
40
41 #include "mm/memory.h"
42 #include "vm/exceptions.h"
43 #include "vm/options.h"
44 #include "vm/statistics.h"
45 #include "vm/stringlocal.h"
46 #include "vm/tables.h"
47 #include "vm/utf8.h"
48
49 /* global variables ***********************************************************/
50
51 #if defined(USE_THREADS)
52 static java_objectheader *lock_utf_hashtable;
53 #endif
54
55 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
56
57
58 /* utf-symbols for pointer comparison of frequently used strings **************/
59
60 utf *utf_java_lang_Object;              /* java/lang/Object                   */
61
62 utf *utf_java_lang_Class;
63 utf *utf_java_lang_ClassLoader;
64 utf *utf_java_lang_Cloneable;
65 utf *utf_java_lang_SecurityManager;
66 utf *utf_java_lang_String;
67 utf *utf_java_lang_System;
68 utf *utf_java_lang_ThreadGroup;
69 utf *utf_java_io_Serializable;
70
71 utf *utf_java_lang_Throwable;
72 utf *utf_java_lang_VMThrowable;
73 utf *utf_java_lang_Error;
74 utf *utf_java_lang_NoClassDefFoundError;
75 utf *utf_java_lang_LinkageError;
76 utf *utf_java_lang_NoSuchMethodError;
77 utf *utf_java_lang_OutOfMemoryError;
78
79 utf *utf_java_lang_Exception;
80 utf *utf_java_lang_ClassNotFoundException;
81 utf *utf_java_lang_IllegalArgumentException;
82 utf *utf_java_lang_IllegalMonitorStateException;
83
84 utf *utf_java_lang_NullPointerException;
85
86 utf* utf_java_lang_Void;
87 utf* utf_java_lang_Boolean;
88 utf* utf_java_lang_Byte;
89 utf* utf_java_lang_Character;
90 utf* utf_java_lang_Short;
91 utf* utf_java_lang_Integer;
92 utf* utf_java_lang_Long;
93 utf* utf_java_lang_Float;
94 utf* utf_java_lang_Double;
95
96 utf *utf_java_lang_StackTraceElement;
97 utf *utf_java_lang_reflect_Constructor;
98 utf *utf_java_lang_reflect_Field;
99 utf *utf_java_lang_reflect_Method;
100 utf *utf_java_util_Vector;
101
102 utf *utf_InnerClasses;                  /* InnerClasses                       */
103 utf *utf_ConstantValue;                 /* ConstantValue                      */
104 utf *utf_Code;                          /* Code                               */
105 utf *utf_Exceptions;                    /* Exceptions                         */
106 utf *utf_LineNumberTable;               /* LineNumberTable                    */
107 utf *utf_SourceFile;                    /* SourceFile                         */
108
109 utf *utf_init;                          /* <init>                             */
110 utf *utf_clinit;                        /* <clinit>                           */
111 utf *utf_clone;                         /* clone                              */
112 utf *utf_finalize;                      /* finalize                           */
113 utf *utf_run;                           /* run                                */
114
115 utf *utf_add;                           /* add                                */
116 utf *utf_remove;                        /* remove                             */
117 utf *utf_put;                           /* put                                */
118 utf *utf_get;                           /* get                                */
119 utf *utf_value;                         /* value                              */
120
121 utf *utf_fillInStackTrace;
122 utf *utf_getSystemClassLoader;
123 utf *utf_loadClass;
124 utf *utf_printStackTrace;
125
126 utf *utf_Z;                             /* Z                                  */
127 utf *utf_B;                             /* B                                  */
128 utf *utf_C;                             /* C                                  */
129 utf *utf_S;                             /* S                                  */
130 utf *utf_I;                             /* I                                  */
131 utf *utf_J;                             /* J                                  */
132 utf *utf_F;                             /* F                                  */
133 utf *utf_D;                             /* D                                  */
134
135 utf *utf_void__void;                    /* ()V                                */
136 utf *utf_boolean__void;                 /* (Z)V                               */
137 utf *utf_byte__void;                    /* (B)V                               */
138 utf *utf_char__void;                    /* (C)V                               */
139 utf *utf_short__void;                   /* (S)V                               */
140 utf *utf_int__void;                     /* (I)V                               */
141 utf *utf_long__void;                    /* (J)V                               */
142 utf *utf_float__void;                   /* (F)V                               */
143 utf *utf_double__void;                  /* (D)V                               */
144
145 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
146 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
147 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
148 utf *utf_java_lang_Object__java_lang_Object;
149 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
150 utf *utf_java_lang_String__java_lang_Class;
151 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
152
153 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
154
155 utf *array_packagename;
156
157
158 /* utf_init ********************************************************************
159
160    Initializes the utf8 subsystem.
161
162 *******************************************************************************/
163
164 bool utf8_init(void)
165 {
166 #if defined(USE_THREADS)
167         /* create utf hashtable lock object */
168
169         lock_utf_hashtable = NEW(java_objectheader);
170
171 # if defined(NATIVE_THREADS)
172         initObjectLock(lock_utf_hashtable);
173 # endif
174 #endif
175
176         /* create utf-symbols for pointer comparison of frequently used strings */
177
178         utf_java_lang_Object           = utf_new_char("java/lang/Object");
179
180         utf_java_lang_Class            = utf_new_char("java/lang/Class");
181         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
182         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
183         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
184         utf_java_lang_String           = utf_new_char("java/lang/String");
185         utf_java_lang_System           = utf_new_char("java/lang/System");
186         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
187         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
188
189         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
190         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
191         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
192
193         utf_java_lang_NoClassDefFoundError =
194                 utf_new_char(string_java_lang_NoClassDefFoundError);
195
196         utf_java_lang_LinkageError =
197                 utf_new_char(string_java_lang_LinkageError);
198
199         utf_java_lang_NoSuchMethodError =
200                 utf_new_char(string_java_lang_NoSuchMethodError);
201
202         utf_java_lang_OutOfMemoryError =
203                 utf_new_char(string_java_lang_OutOfMemoryError);
204
205         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
206
207         utf_java_lang_ClassNotFoundException =
208                 utf_new_char(string_java_lang_ClassNotFoundException);
209
210         utf_java_lang_IllegalArgumentException =
211                 utf_new_char(string_java_lang_IllegalArgumentException);
212
213         utf_java_lang_IllegalMonitorStateException =
214                 utf_new_char(string_java_lang_IllegalMonitorStateException);
215
216         utf_java_lang_NullPointerException =
217                 utf_new_char(string_java_lang_NullPointerException);
218
219         utf_java_lang_Void             = utf_new_char("java/lang/Void");
220         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
221         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
222         utf_java_lang_Character        = utf_new_char("java/lang/Character");
223         utf_java_lang_Short            = utf_new_char("java/lang/Short");
224         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
225         utf_java_lang_Long             = utf_new_char("java/lang/Long");
226         utf_java_lang_Float            = utf_new_char("java/lang/Float");
227         utf_java_lang_Double           = utf_new_char("java/lang/Double");
228
229         utf_java_lang_StackTraceElement =
230                 utf_new_char("java/lang/StackTraceElement");
231
232         utf_java_lang_reflect_Constructor =
233                 utf_new_char("java/lang/reflect/Constructor");
234
235         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
236         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
237         utf_java_util_Vector           = utf_new_char("java/util/Vector");
238
239         utf_InnerClasses               = utf_new_char("InnerClasses");
240         utf_ConstantValue              = utf_new_char("ConstantValue");
241         utf_Code                       = utf_new_char("Code");
242         utf_Exceptions                 = utf_new_char("Exceptions");
243         utf_LineNumberTable            = utf_new_char("LineNumberTable");
244         utf_SourceFile                 = utf_new_char("SourceFile");
245
246         utf_init                           = utf_new_char("<init>");
247         utf_clinit                         = utf_new_char("<clinit>");
248         utf_clone                      = utf_new_char("clone");
249         utf_finalize                   = utf_new_char("finalize");
250         utf_run                        = utf_new_char("run");
251
252         utf_add                        = utf_new_char("add");
253         utf_remove                     = utf_new_char("remove");
254         utf_put                        = utf_new_char("put");
255         utf_get                        = utf_new_char("get");
256         utf_value                      = utf_new_char("value");
257
258         utf_printStackTrace            = utf_new_char("printStackTrace");
259         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
260         utf_loadClass                  = utf_new_char("loadClass");
261         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
262
263         utf_Z                          = utf_new_char("Z");
264         utf_B                          = utf_new_char("B");
265         utf_C                          = utf_new_char("C");
266         utf_S                          = utf_new_char("S");
267         utf_I                          = utf_new_char("I");
268         utf_J                          = utf_new_char("J");
269         utf_F                          = utf_new_char("F");
270         utf_D                          = utf_new_char("D");
271
272         utf_void__void                 = utf_new_char("()V");
273         utf_boolean__void              = utf_new_char("(Z)V");
274         utf_byte__void                 = utf_new_char("(B)V");
275         utf_char__void                 = utf_new_char("(C)V");
276         utf_short__void                = utf_new_char("(S)V");
277         utf_int__void                  = utf_new_char("(I)V");
278         utf_long__void                 = utf_new_char("(J)V");
279         utf_float__void                = utf_new_char("(F)V");
280         utf_double__void               = utf_new_char("(D)V");
281         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
282         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
283
284         utf_void__java_lang_ClassLoader =
285                 utf_new_char("()Ljava/lang/ClassLoader;");
286
287         utf_java_lang_Object__java_lang_Object =
288                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
289
290         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
291
292         utf_java_lang_String__java_lang_Class =
293                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
294
295         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
296
297         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
298
299         array_packagename              = utf_new_char("\t<the array package>");
300
301         /* everything's ok */
302
303         return true;
304 }
305
306
/* utf_hashkey *****************************************************************

   The hashkey is computed from the utf-text by using up to 8
   characters.  For utf-symbols longer than 15 characters 3 characters
   are taken from the beginning and the end, 2 characters are taken
   from the middle.

   text ...... bytes to hash (need not be zero-terminated)
   length .... number of bytes in text

   Returns 0 for an empty string, otherwise the XOR of the selected
   bytes, each shifted left by a position-dependent amount.

   Every selected byte is addressed by an explicit index.  The former
   implementation advanced `text' several times inside one expression
   via nbs(), which is undefined behaviour because the increments are
   unsequenced; the indices below reproduce the result of a
   left-to-right evaluation of that code.

*******************************************************************************/

/* NOTE: nbs() modifies `text'.  Never use it more than once (or together with
   fbs()) in a single expression.  Both macros are no longer used by
   utf_hashkey and are only kept in case other code in this file relies on
   them. */

#define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
#define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */

/* byte at index idx of str, converted to u4 and left shifted by shift */

#define UTF_HASH_BYTE(str, idx, shift) ((u4) (str)[(idx)] << (shift))

u4 utf_hashkey(const char *text, u4 length)
{
	u4 mid;                             /* index of the first middle byte     */

	switch (length) {
	case 0: /* empty string */
		return 0;

	case 1:
		return UTF_HASH_BYTE(text, 0, 0);

	case 2:
		return UTF_HASH_BYTE(text, 0, 0) ^ UTF_HASH_BYTE(text, 1, 3);

	case 3:
		return UTF_HASH_BYTE(text, 0, 0) ^ UTF_HASH_BYTE(text, 1, 3) ^
			UTF_HASH_BYTE(text, 2, 5);

	case 4:
		return UTF_HASH_BYTE(text, 0, 0) ^ UTF_HASH_BYTE(text, 1, 2) ^
			UTF_HASH_BYTE(text, 2, 4) ^ UTF_HASH_BYTE(text, 3, 6);

	case 5:
		return UTF_HASH_BYTE(text, 0, 0) ^ UTF_HASH_BYTE(text, 1, 2) ^
			UTF_HASH_BYTE(text, 2, 3) ^ UTF_HASH_BYTE(text, 3, 4) ^
			UTF_HASH_BYTE(text, 4, 6);

	case 6:
		return UTF_HASH_BYTE(text, 0, 0) ^ UTF_HASH_BYTE(text, 1, 1) ^
			UTF_HASH_BYTE(text, 2, 2) ^ UTF_HASH_BYTE(text, 3, 3) ^
			UTF_HASH_BYTE(text, 4, 5) ^ UTF_HASH_BYTE(text, 5, 6);

	case 7:
		return UTF_HASH_BYTE(text, 0, 0) ^ UTF_HASH_BYTE(text, 1, 1) ^
			UTF_HASH_BYTE(text, 2, 2) ^ UTF_HASH_BYTE(text, 3, 3) ^
			UTF_HASH_BYTE(text, 4, 4) ^ UTF_HASH_BYTE(text, 5, 5) ^
			UTF_HASH_BYTE(text, 6, 6);

	case 8:
		return UTF_HASH_BYTE(text, 0, 0) ^ UTF_HASH_BYTE(text, 1, 1) ^
			UTF_HASH_BYTE(text, 2, 2) ^ UTF_HASH_BYTE(text, 3, 3) ^
			UTF_HASH_BYTE(text, 4, 4) ^ UTF_HASH_BYTE(text, 5, 5) ^
			UTF_HASH_BYTE(text, 6, 6) ^ UTF_HASH_BYTE(text, 7, 7);

	case 9: /* byte 3 is skipped */
		return UTF_HASH_BYTE(text, 0, 0) ^ UTF_HASH_BYTE(text, 1, 1) ^
			UTF_HASH_BYTE(text, 2, 2) ^ UTF_HASH_BYTE(text, 4, 4) ^
			UTF_HASH_BYTE(text, 5, 5) ^ UTF_HASH_BYTE(text, 6, 6) ^
			UTF_HASH_BYTE(text, 7, 7) ^ UTF_HASH_BYTE(text, 8, 8);

	case 10: /* bytes 1 and 5 are skipped */
		return UTF_HASH_BYTE(text, 0, 0) ^ UTF_HASH_BYTE(text, 2, 2) ^
			UTF_HASH_BYTE(text, 3, 3) ^ UTF_HASH_BYTE(text, 4, 4) ^
			UTF_HASH_BYTE(text, 6, 6) ^ UTF_HASH_BYTE(text, 7, 7) ^
			UTF_HASH_BYTE(text, 8, 8) ^ UTF_HASH_BYTE(text, 9, 9);

	case 11: /* bytes 1 and 5 are skipped */
		return UTF_HASH_BYTE(text, 0, 0) ^ UTF_HASH_BYTE(text, 2, 2) ^
			UTF_HASH_BYTE(text, 3, 3) ^ UTF_HASH_BYTE(text, 4, 4) ^
			UTF_HASH_BYTE(text, 6, 6) ^ UTF_HASH_BYTE(text, 7, 7) ^
			UTF_HASH_BYTE(text, 8, 8) ^ UTF_HASH_BYTE(text, 9, 9) ^
			UTF_HASH_BYTE(text, 10, 10);

	case 12: /* bytes 1, 2, 5 and 9 are skipped */
		return UTF_HASH_BYTE(text, 0, 0) ^ UTF_HASH_BYTE(text, 3, 2) ^
			UTF_HASH_BYTE(text, 4, 3) ^ UTF_HASH_BYTE(text, 6, 5) ^
			UTF_HASH_BYTE(text, 7, 6) ^ UTF_HASH_BYTE(text, 8, 7) ^
			UTF_HASH_BYTE(text, 10, 9) ^ UTF_HASH_BYTE(text, 11, 10);

	case 13: /* bytes 2, 5, 6, 9 and 10 are skipped */
		return UTF_HASH_BYTE(text, 0, 0) ^ UTF_HASH_BYTE(text, 1, 1) ^
			UTF_HASH_BYTE(text, 3, 3) ^ UTF_HASH_BYTE(text, 4, 4) ^
			UTF_HASH_BYTE(text, 7, 7) ^ UTF_HASH_BYTE(text, 8, 8) ^
			UTF_HASH_BYTE(text, 11, 9) ^ UTF_HASH_BYTE(text, 12, 10);

	case 14: /* same bytes for 14 and 15; byte 14 is never used */
	case 15:
		return UTF_HASH_BYTE(text, 0, 0) ^ UTF_HASH_BYTE(text, 3, 3) ^
			UTF_HASH_BYTE(text, 4, 4) ^ UTF_HASH_BYTE(text, 7, 7) ^
			UTF_HASH_BYTE(text, 8, 8) ^ UTF_HASH_BYTE(text, 11, 9) ^
			UTF_HASH_BYTE(text, 12, 10) ^ UTF_HASH_BYTE(text, 13, 11);

	default:
		mid = length / 2;

		return
			/* 3 characters from beginning */
			UTF_HASH_BYTE(text, 0, 0) ^
			UTF_HASH_BYTE(text, 3, 3) ^
			UTF_HASH_BYTE(text, 4, 4) ^

			/* 2 characters from middle */
			UTF_HASH_BYTE(text, mid, 5) ^
			UTF_HASH_BYTE(text, mid + 3, 6) ^

			/* 3 characters from end */
			UTF_HASH_BYTE(text, length - 4, 7) ^
			UTF_HASH_BYTE(text, length - 2, 10) ^
			UTF_HASH_BYTE(text, length - 1, 11);
	}
}
429
430
431 /* utf_hashkey *****************************************************************
432
433    Compute the hashkey of a unicode string.
434
435 *******************************************************************************/
436
437 u4 unicode_hashkey(u2 *text, u2 len)
438 {
439         return utf_hashkey((char *) text, len);
440 }
441
442
443 /* utf_new *********************************************************************
444
445    Creates a new utf-symbol, the text of the symbol is passed as a
446    u1-array. The function searches the utf-hashtable for a utf-symbol
447    with this text. On success the element returned, otherwise a new
448    hashtable element is created.
449
450    If the number of entries in the hashtable exceeds twice the size of
451    the hashtable slots a reorganization of the hashtable is done and
452    the utf symbols are copied to a new hashtable with doubled size.
453
454 *******************************************************************************/
455
456 utf *utf_new(const char *text, u2 length)
457 {
458         u4 key;                             /* hashkey computed from utf-text     */
459         u4 slot;                            /* slot in hashtable                  */
460         utf *u;                             /* hashtable element                  */
461         u2 i;
462
463         /* XXX REMOVE ME! after testing of course ;-) */
464         static int running = 0;
465         /* XXX REMOVE ME! */
466
467 #if defined(USE_THREADS)
468         builtin_monitorenter(lock_utf_hashtable);
469 #endif
470
471         /* XXX REMOVE ME! after testing of course ;-) */
472         assert(running == 0);
473         running = 1;
474         /* XXX REMOVE ME! */
475
476 #ifdef STATISTICS
477         if (opt_stat)
478                 count_utf_new++;
479 #endif
480
481         key  = utf_hashkey(text, length);
482         slot = key & (utf_hash.size - 1);
483         u    = utf_hash.ptr[slot];
484
485         /* search external hash chain for utf-symbol */
486
487         while (u) {
488                 if (u->blength == length) {
489                         /* compare text of hashtable elements */
490
491                         for (i = 0; i < length; i++)
492                                 if (text[i] != u->text[i])
493                                         goto nomatch;
494                         
495 #if defined(STATISTICS)
496                         if (opt_stat)
497                                 count_utf_new_found++;
498 #endif
499
500                         /* symbol found in hashtable */
501
502                         /* XXX REMOVE ME! */
503                         running = 0;
504                         /* XXX REMOVE ME! */
505
506 #if defined(USE_THREADS)
507                         builtin_monitorexit(lock_utf_hashtable);
508 #endif
509
510                         return u;
511                 }
512
513         nomatch:
514                 u = u->hashlink; /* next element in external chain */
515         }
516
517 #if defined(STATISTICS)
518         if (opt_stat)
519                 count_utf_len += sizeof(utf) + length + 1;
520 #endif
521
522         /* location in hashtable found, create new utf element */
523         u = NEW(utf);
524         u->blength  = length;               /* length in bytes of utfstring       */
525         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
526         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
527         memcpy(u->text, text, length);      /* copy utf-text                      */
528         u->text[length] = '\0';
529         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
530
531         utf_hash.entries++;                 /* update number of entries           */
532
533         if (utf_hash.entries > (utf_hash.size * 2)) {
534
535         /* reorganization of hashtable, average length of 
536            the external chains is approx. 2                */  
537
538                 u4 i;
539                 utf *u;
540                 hashtable newhash; /* the new hashtable */
541
542                 /* create new hashtable, double the size */
543                 init_hashtable(&newhash, utf_hash.size * 2);
544                 newhash.entries = utf_hash.entries;
545
546 #ifdef STATISTICS
547                 if (opt_stat)
548                         count_utf_len += sizeof(utf*) * utf_hash.size;
549 #endif
550
551                 /* transfer elements to new hashtable */
552                 for (i = 0; i < utf_hash.size; i++) {
553                         u = (utf *) utf_hash.ptr[i];
554                         while (u) {
555                                 utf *nextu = u->hashlink;
556                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
557                                                 
558                                 u->hashlink = (utf *) newhash.ptr[slot];
559                                 newhash.ptr[slot] = u;
560
561                                 /* follow link in external hash chain */
562                                 u = nextu;
563                         }
564                 }
565         
566                 /* dispose old table */
567                 MFREE(utf_hash.ptr, void*, utf_hash.size);
568                 utf_hash = newhash;
569         }
570
571         /* XXX REMOVE ME! */
572         running = 0;
573         /* XXX REMOVE ME! */
574
575 #if defined(USE_THREADS)
576         builtin_monitorexit(lock_utf_hashtable);
577 #endif
578
579         return u;
580 }
581
582
583 /* utf_new_u2 ******************************************************************
584
585    Make utf symbol from u2 array, if isclassname is true '.' is
586    replaced by '/'.
587
588 *******************************************************************************/
589
590 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
591 {
592         char *buffer;                   /* memory buffer for  unicode characters  */
593         char *pos;                      /* pointer to current position in buffer  */
594         u4 left;                        /* unicode characters left                */
595         u4 buflength;                   /* utf length in bytes of the u2 array    */
596         utf *result;                    /* resulting utf-string                   */
597         int i;          
598
599         /* determine utf length in bytes and allocate memory */
600
601         buflength = u2_utflength(unicode_pos, unicode_length); 
602         buffer    = MNEW(char, buflength);
603  
604         left = buflength;
605         pos  = buffer;
606
607         for (i = 0; i++ < unicode_length; unicode_pos++) {
608                 /* next unicode character */
609                 u2 c = *unicode_pos;
610                 
611                 if ((c != 0) && (c < 0x80)) {
612                         /* 1 character */       
613                         left--;
614                 if ((int) left < 0) break;
615                         /* convert classname */
616                         if (isclassname && c == '.')
617                                 *pos++ = '/';
618                         else
619                                 *pos++ = (char) c;
620
621                 } else if (c < 0x800) {             
622                         /* 2 characters */                              
623                 unsigned char high = c >> 6;
624                 unsigned char low  = c & 0x3F;
625                         left = left - 2;
626                 if ((int) left < 0) break;
627                 *pos++ = high | 0xC0; 
628                 *pos++ = low  | 0x80;     
629
630                 } else {         
631                 /* 3 characters */                              
632                 char low  = c & 0x3f;
633                 char mid  = (c >> 6) & 0x3F;
634                 char high = c >> 12;
635                         left = left - 3;
636                 if ((int) left < 0) break;
637                 *pos++ = high | 0xE0; 
638                 *pos++ = mid  | 0x80;  
639                 *pos++ = low  | 0x80;   
640                 }
641         }
642         
643         /* insert utf-string into symbol-table */
644         result = utf_new(buffer,buflength);
645
646         MFREE(buffer, char, buflength);
647
648         return result;
649 }
650
651
652 /* utf_new_char ****************************************************************
653
654    Creates a new utf symbol, the text for this symbol is passed as a
655    c-string ( = char* ).
656
657 *******************************************************************************/
658
659 utf *utf_new_char(const char *text)
660 {
661         return utf_new(text, strlen(text));
662 }
663
664
665 /* utf_new_char_classname ******************************************************
666
667    Creates a new utf symbol, the text for this symbol is passed as a
668    c-string ( = char* ) "." characters are going to be replaced by
669    "/". Since the above function is used often, this is a separte
670    function, instead of an if.
671
672 *******************************************************************************/
673
674 utf *utf_new_char_classname(const char *text)
675 {
676         if (strchr(text, '.')) {
677                 char *txt = strdup(text);
678                 char *end = txt + strlen(txt);
679                 char *c;
680                 utf *tmpRes;
681
682                 for (c = txt; c < end; c++)
683                         if (*c == '.') *c = '/';
684
685                 tmpRes = utf_new(txt, strlen(txt));
686                 FREE(txt, 0);
687
688                 return tmpRes;
689
690         } else
691                 return utf_new(text, strlen(text));
692 }
693
694
/* utf_nextu2 ******************************************************************

   Read the next unicode character from the utf string and increment
   the utf-string pointer accordingly.

   utf_ptr ... in: address of the pointer to the current position
               out: the pointer is advanced past the bytes consumed

   The high nibble of the first byte selects the encoding:

       0xC, 0xD ..... 2 bytes, if the second byte is 10xxxxxx
       0xE .......... 3 bytes, if second and third byte are 10xxxxxx
       otherwise .... 1 byte, which is returned unchanged

   For a malformed multi-byte sequence 0 is returned.  If only the
   third byte of a 3-byte sequence is invalid, 2 bytes are skipped.  If
   the second byte is invalid, the lead byte alone is skipped.  The
   pointer therefore always advances by at least one byte; previously
   it was left unchanged in that case, so loops of the form
   `while (utf_ptr < endpos) utf_nextu2(&utf_ptr);' never terminated
   on such input.

*******************************************************************************/

u2 utf_nextu2(char **utf_ptr)
{
	/* current position in utf text */
	unsigned char *utf = (unsigned char *) (*utf_ptr);
	/* bytes representing the unicode character */
	unsigned char ch1, ch2, ch3;
	/* uncompressed unicode character, 0 for malformed sequences */
	u2 unicode_char = 0;
	/* number of bytes to skip; a malformed sequence skips its lead byte */
	int len = 1;

	switch ((ch1 = utf[0]) >> 4) {
	default: /* 1 byte */
		(*utf_ptr)++;
		return (u2) ch1;

	case 0xC:
	case 0xD: /* 2 bytes */
		if (((ch2 = utf[1]) & 0xC0) == 0x80) {
			unsigned char high = ch1 & 0x1F;
			unsigned char low  = ch2 & 0x3F;
			unicode_char = (high << 6) + low;
			len = 2;
		}
		break;

	case 0xE: /* 2 or 3 bytes */
		if (((ch2 = utf[1]) & 0xC0) == 0x80) {
			if (((ch3 = utf[2]) & 0xC0) == 0x80) {
				unsigned char low  = ch3 & 0x3f;
				unsigned char mid  = ch2 & 0x3f;
				unsigned char high = ch1 & 0x0f;
				unicode_char = (((high << 6) + mid) << 6) + low;
				len = 3;
			} else {
				/* third byte invalid: skip lead and second byte */
				len = 2;
			}
		}
		break;
	}

	/* update position in utf-text */
	*utf_ptr = (char *) (utf + len);

	return unicode_char;
}
746
747
748 /* utf_strlen ******************************************************************
749
750    Determine number of unicode characters in the utf string.
751
752 *******************************************************************************/
753
754 u4 utf_strlen(utf *u)
755 {
756         char *endpos;                       /* points behind utf string           */
757         char *utf_ptr;                      /* current position in utf text       */
758         u4 len = 0;                         /* number of unicode characters       */
759
760         if (!u) {
761                 *exceptionptr = new_nullpointerexception();
762                 return 0;
763         }
764
765         endpos = UTF_END(u);
766         utf_ptr = u->text;
767
768         while (utf_ptr < endpos) {
769                 len++;
770                 /* next unicode character */
771                 utf_nextu2(&utf_ptr);
772         }
773
774         if (utf_ptr != endpos)
775                 /* string ended abruptly */
776                 throw_cacao_exception_exit(string_java_lang_InternalError,
777                                                                    "Illegal utf8 string");
778
779         return len;
780 }
781
782
/* u2_utflength ****************************************************************

   Returns the utf length in bytes of a u2 array.

   text .......... first unicode character
   u2_length ..... number of unicode characters

   Each character contributes

       1 byte  for 0x0001..0x007f,
       2 bytes for 0x0000 and 0x0080..0x07ff,
       3 bytes for 0x0800..0xffff.

*******************************************************************************/

u4 u2_utflength(u2 *text, u4 u2_length)
{
	u4 bytes = 0;                       /* utf length in bytes                */
	u4 idx;                             /* index of the current character     */

	for (idx = 0; idx != u2_length; idx++) {
		u2 ch = text[idx];

		if ((ch != 0) && (ch < 0x80)) {
			bytes += 1;
			continue;
		}

		/* 0 is deliberately not stored as a single byte */

		bytes += (ch < 0x800) ? 2 : 3;
	}

	return bytes;
}
810
811
812 /* utf_display *****************************************************************
813
814    Write utf symbol to stdout (for debugging purposes).
815
816 *******************************************************************************/
817
818 void utf_display(utf *u)
819 {
820         char *endpos;                       /* points behind utf string           */
821         char *utf_ptr;                      /* current position in utf text       */
822
823         if (!u) {
824                 printf("NULL");
825                 fflush(stdout);
826                 return;
827         }
828
829         endpos = UTF_END(u);
830         utf_ptr = u->text;
831
832         while (utf_ptr < endpos) {
833                 /* read next unicode character */                
834                 u2 c = utf_nextu2(&utf_ptr);
835                 if (c >= 32 && c <= 127) printf("%c", c);
836                 else printf("?");
837         }
838
839         fflush(stdout);
840 }
841
842
843 /* utf_display_classname *******************************************************
844
845    Write utf symbol to stdout with `/' converted to `.' (for debugging
846    purposes).
847
848 *******************************************************************************/
849
850 void utf_display_classname(utf *u)
851 {
852         char *endpos;                       /* points behind utf string           */
853         char *utf_ptr;                      /* current position in utf text       */
854
855         if (!u) {
856                 printf("NULL");
857                 fflush(stdout);
858                 return;
859         }
860
861         endpos = UTF_END(u);
862         utf_ptr = u->text;
863
864         while (utf_ptr < endpos) {
865                 /* read next unicode character */                
866                 u2 c = utf_nextu2(&utf_ptr);
867                 if (c == '/') c = '.';
868                 if (c >= 32 && c <= 127) printf("%c", c);
869                 else printf("?");
870         }
871
872         fflush(stdout);
873 }
874
875
876 /* utf_sprint ******************************************************************
877         
878    Write utf symbol into c-string (for debugging purposes).
879
880 *******************************************************************************/
881
882 void utf_sprint(char *buffer, utf *u)
883 {
884         char *endpos;                       /* points behind utf string           */
885         char *utf_ptr;                      /* current position in utf text       */
886         u2 pos = 0;                         /* position in c-string               */
887
888         if (!u) {
889                 strcpy(buffer, "NULL");
890                 return;
891         }
892
893         endpos = UTF_END(u);
894         utf_ptr = u->text;
895
896         while (utf_ptr < endpos) 
897                 /* copy next unicode character */       
898                 buffer[pos++] = utf_nextu2(&utf_ptr);
899
900         /* terminate string */
901         buffer[pos] = '\0';
902 }
903
904
905 /* utf_sprint_classname ********************************************************
906         
907    Write utf symbol into c-string with `/' converted to `.' (for debugging
908    purposes).
909
910 *******************************************************************************/
911
912 void utf_sprint_classname(char *buffer, utf *u)
913 {
914         char *endpos;                       /* points behind utf string           */
915         char *utf_ptr;                      /* current position in utf text       */
916         u2 pos = 0;                         /* position in c-string               */
917
918         if (!u) {
919                 strcpy(buffer, "NULL");
920                 return;
921         }
922
923         endpos = UTF_END(u);
924         utf_ptr = u->text;
925
926         while (utf_ptr < endpos) {
927                 /* copy next unicode character */       
928                 u2 c = utf_nextu2(&utf_ptr);
929                 if (c == '/') c = '.';
930                 buffer[pos++] = c;
931         }
932
933         /* terminate string */
934         buffer[pos] = '\0';
935 }
936
937
938 /* utf_strcat ******************************************************************
939         
940    Like libc strcat, but uses an utf8 string.
941
942 *******************************************************************************/
943
944 void utf_strcat(char *buffer, utf *u)
945 {
946         utf_sprint(buffer + strlen(buffer), u);
947 }
948
949
950 /* utf_strcat_classname ********************************************************
951         
952    Like libc strcat, but uses an utf8 string.
953
954 *******************************************************************************/
955
956 void utf_strcat_classname(char *buffer, utf *u)
957 {
958         utf_sprint_classname(buffer + strlen(buffer), u);
959 }
960
961
962 /* utf_fprint ******************************************************************
963         
964    Write utf symbol into file.
965
966 *******************************************************************************/
967
968 void utf_fprint(FILE *file, utf *u)
969 {
970         char *endpos;                       /* points behind utf string           */
971         char *utf_ptr;                      /* current position in utf text       */
972
973         if (!u)
974                 return;
975
976         endpos = UTF_END(u);
977         utf_ptr = u->text;
978
979         while (utf_ptr < endpos) { 
980                 /* read next unicode character */                
981                 u2 c = utf_nextu2(&utf_ptr);                            
982
983                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
984                 else fprintf(file, "?");
985         }
986 }
987
988
989 /* utf_fprint_classname ********************************************************
990         
991    Write utf symbol into file with `/' converted to `.'.
992
993 *******************************************************************************/
994
995 void utf_fprint_classname(FILE *file, utf *u)
996 {
997         char *endpos;                       /* points behind utf string           */
998         char *utf_ptr;                      /* current position in utf text       */
999
1000     if (!u)
1001                 return;
1002
1003         endpos = UTF_END(u);
1004         utf_ptr = u->text;
1005
1006         while (utf_ptr < endpos) { 
1007                 /* read next unicode character */                
1008                 u2 c = utf_nextu2(&utf_ptr);                            
1009                 if (c == '/') c = '.';
1010
1011                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1012                 else fprintf(file, "?");
1013         }
1014 }
1015
1016
/* is_valid_utf ****************************************************************

   Check whether the bytes in [utf_ptr, end_pos) form a UTF-8 string
   this VM accepts. An empty range is valid; end_pos < utf_ptr is not.

   utf_ptr...points to first character
   end_pos...points after last character

   Accepted are non-zero ASCII bytes and two- or three-byte sequences
   whose continuation bytes all have the form 10xx xxxx. The value zero
   is only accepted in its two-byte form. Sequences with a longer lead
   byte are rejected (Java limitation).

   Deliberately NOT rejected, because Sun Java seems to allow them:
   overlong encodings of non-zero values (no minimum-codepoint check is
   done), surrogates (0xd800-0xdfff) and the codepoints 0xfffe/0xffff.

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int           remaining;            /* bytes not yet consumed             */
	int           extra;                /* continuation bytes of a sequence   */
	int           k;
	char          b;                    /* byte under inspection              */
	unsigned long code;                 /* decoded value of a sequence        */

	if (end_pos < utf_ptr)
		return false;

	remaining = end_pos - utf_ptr;

	while (remaining > 0) {
		remaining--;
		b = *utf_ptr++;

		/* 0x00 is not allowed */

		if (b == 0)
			return false;

		/* plain ASCII needs no further checks */

		if ((b & 0x80) == 0)
			continue;

		/* Only 110x xxxx and 1110 xxxx may start a sequence; longer
		   forms and stray continuation bytes are invalid here. */

		if ((b & 0xe0) == 0xc0)
			extra = 1;
		else if ((b & 0xf0) == 0xe0)
			extra = 2;
		else
			return false;

		/* payload bits of the lead byte: 5 for extra == 1, 4 for 2 */

		code = (unsigned long) b & (0x3f >> extra);

		/* missing bytes */

		remaining -= extra;

		if (remaining < 0)
			return false;

		for (k = 0; k < extra; k++) {
			b = *utf_ptr++;

			if ((b & 0xc0) != 0x80)         /* 10xx xxxx */
				return false;

			code = (code << 6) | (b & 0x3f);
		}

		/* Java special: zero must use the two-byte encoding */

		if (code == 0 && extra != 1)
			return false;
	}

	return true;
}
1082
1083
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the two-byte encoding of zero.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

   Only bytes inside [utf_ptr, end_pos) are examined.

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters */

		/* 0xc0 0x80 encodes zero. The second byte is only inspected if
		   it still belongs to the string, so a trailing 0xc0 never
		   causes a read at end_pos. */

		if (c == 0xc0 && utf_ptr < end_pos
			&& (unsigned char) *utf_ptr == 0x80)
			return false;               /* disallow zero */
	}

	return true;
}
1111
1112 bool is_valid_name_utf(utf *u)
1113 {
1114         return is_valid_name(u->text, UTF_END(u));
1115 }
1116
1117
1118 /* utf_show ********************************************************************
1119
1120    Writes the utf symbols in the utfhash to stdout and displays the
1121    number of external hash chains grouped according to the chainlength
1122    (for debugging purposes).
1123
1124 *******************************************************************************/
1125
1126 void utf_show(void)
1127 {
1128
1129 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1130
1131         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1132         u4 max_chainlength = 0;      /* maximum length of the chains */
1133         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1134         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1135         u4 i;
1136
1137         printf ("UTF-HASH:\n");
1138
1139         /* show element of utf-hashtable */
1140         for (i=0; i<utf_hash.size; i++) {
1141                 utf *u = utf_hash.ptr[i];
1142                 if (u) {
1143                         printf ("SLOT %d: ", (int) i);
1144                         while (u) {
1145                                 printf ("'");
1146                                 utf_display (u);
1147                                 printf ("' ");
1148                                 u = u->hashlink;
1149                         }       
1150                         printf ("\n");
1151                 }
1152                 
1153         }
1154
1155         printf ("UTF-HASH: %d slots for %d entries\n", 
1156                         (int) utf_hash.size, (int) utf_hash.entries );
1157
1158
1159         if (utf_hash.entries == 0)
1160                 return;
1161
1162         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1163
1164         for (i=0;i<CHAIN_LIMIT;i++)
1165                 chain_count[i]=0;
1166
1167         /* count numbers of hashchains according to their length */
1168         for (i=0; i<utf_hash.size; i++) {
1169                   
1170                 utf *u = (utf*) utf_hash.ptr[i];
1171                 u4 chain_length = 0;
1172
1173                 /* determine chainlength */
1174                 while (u) {
1175                         u = u->hashlink;
1176                         chain_length++;
1177                 }
1178
1179                 /* update sum of all chainlengths */
1180                 sum_chainlength+=chain_length;
1181
1182                 /* determine the maximum length of the chains */
1183                 if (chain_length>max_chainlength)
1184                         max_chainlength = chain_length;
1185
1186                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1187                 if (chain_length>=CHAIN_LIMIT) {
1188                         beyond_limit+=chain_length;
1189                         chain_length=CHAIN_LIMIT-1;
1190                 }
1191
1192                 /* update number of hashchains of current length */
1193                 chain_count[chain_length]++;
1194         }
1195
1196         /* display results */  
1197         for (i=1;i<CHAIN_LIMIT-1;i++) 
1198                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1199           
1200         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1201
1202
1203         printf("max. chainlength:%5d\n",max_chainlength);
1204
1205         /* avg. chainlength = sum of chainlengths / number of chains */
1206         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1207 }
1208
1209
1210 /*
1211  * These are local overrides for various environment variables in Emacs.
1212  * Please do not remove this and leave it at the end of the file, where
1213  * Emacs will automagically detect them.
1214  * ---------------------------------------------------------------------
1215  * Local variables:
1216  * mode: c
1217  * indent-tabs-mode: t
1218  * c-basic-offset: 4
1219  * tab-width: 4
1220  * End:
1221  */