* Replaced ENABLE_DEBUG with NDEBUG.
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf.c - utf functions
2
3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
6    Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23    02111-1307, USA.
24
25    Contact: cacao@complang.tuwien.ac.at
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 3940 2005-12-11 01:06:16Z twisti $
34
35 */
36
37
38 #include <string.h>
39 #include <assert.h>
40
41 #include "config.h"
42 #include "vm/types.h"
43
44 #include "mm/memory.h"
45 #include "vm/exceptions.h"
46 #include "vm/hashtable.h"
47 #include "vm/options.h"
48 #include "vm/statistics.h"
49 #include "vm/stringlocal.h"
50 #include "vm/utf8.h"
51
/* global variables ***********************************************************/

/* hashsize must be power of 2 */
/* (utf_new selects the slot with `key & (hashtable_utf.size - 1)`, which     */
/* only works as a modulo when the size is a power of 2)                      */

#define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */

hashtable hashtable_utf;                /* hashtable for utf8-symbols         */

#if defined(USE_THREADS)
/* monitor object created in utf8_init; utf_new enters it before touching     */
/* hashtable_utf and exits it on every return path                            */
static java_objectheader *lock_hashtable_utf;
#endif
63
64
/* utf-symbols for pointer comparison of frequently used strings **************/

/* All of the following pointers are assigned in utf8_init. */

utf *utf_java_lang_Object;              /* java/lang/Object                   */

/* java/lang/Class, java/lang/ClassLoader, java/lang/Cloneable,               */
/* java/lang/SecurityManager, java/lang/String, java/lang/System,             */
/* java/lang/ThreadGroup, java/io/Serializable                                */

utf *utf_java_lang_Class;
utf *utf_java_lang_ClassLoader;
utf *utf_java_lang_Cloneable;
utf *utf_java_lang_SecurityManager;
utf *utf_java_lang_String;
utf *utf_java_lang_System;
utf *utf_java_lang_ThreadGroup;
utf *utf_java_io_Serializable;

/* interned from the string_java_lang_* constant of the same name             */

utf *utf_java_lang_Throwable;
utf *utf_java_lang_VMThrowable;
utf *utf_java_lang_Error;
utf *utf_java_lang_NoClassDefFoundError;
utf *utf_java_lang_LinkageError;
utf *utf_java_lang_NoSuchMethodError;
utf *utf_java_lang_OutOfMemoryError;

/* interned from the string_java_lang_* constant of the same name             */

utf *utf_java_lang_Exception;
utf *utf_java_lang_ClassNotFoundException;
utf *utf_java_lang_IllegalArgumentException;
utf *utf_java_lang_IllegalMonitorStateException;

utf *utf_java_lang_NullPointerException;

/* java/lang/Void, java/lang/Boolean, java/lang/Byte, java/lang/Character,    */
/* java/lang/Short, java/lang/Integer, java/lang/Long, java/lang/Float,       */
/* java/lang/Double                                                           */

utf* utf_java_lang_Void;
utf* utf_java_lang_Boolean;
utf* utf_java_lang_Byte;
utf* utf_java_lang_Character;
utf* utf_java_lang_Short;
utf* utf_java_lang_Integer;
utf* utf_java_lang_Long;
utf* utf_java_lang_Float;
utf* utf_java_lang_Double;

/* java/lang/StackTraceElement, java/lang/reflect/Constructor,                */
/* java/lang/reflect/Field, java/lang/reflect/Method, java/util/Vector        */

utf *utf_java_lang_StackTraceElement;
utf *utf_java_lang_reflect_Constructor;
utf *utf_java_lang_reflect_Field;
utf *utf_java_lang_reflect_Method;
utf *utf_java_util_Vector;

utf *utf_InnerClasses;                  /* InnerClasses                       */
utf *utf_ConstantValue;                 /* ConstantValue                      */
utf *utf_Code;                          /* Code                               */
utf *utf_Exceptions;                    /* Exceptions                         */
utf *utf_LineNumberTable;               /* LineNumberTable                    */
utf *utf_SourceFile;                    /* SourceFile                         */

utf *utf_init;                          /* <init>                             */
utf *utf_clinit;                        /* <clinit>                           */
utf *utf_clone;                         /* clone                              */
utf *utf_finalize;                      /* finalize                           */
utf *utf_run;                           /* run                                */

utf *utf_add;                           /* add                                */
utf *utf_remove;                        /* remove                             */
utf *utf_put;                           /* put                                */
utf *utf_get;                           /* get                                */
utf *utf_value;                         /* value                              */

/* fillInStackTrace, getSystemClassLoader, loadClass, printStackTrace         */

utf *utf_fillInStackTrace;
utf *utf_getSystemClassLoader;
utf *utf_loadClass;
utf *utf_printStackTrace;

utf *utf_Z;                             /* Z                                  */
utf *utf_B;                             /* B                                  */
utf *utf_C;                             /* C                                  */
utf *utf_S;                             /* S                                  */
utf *utf_I;                             /* I                                  */
utf *utf_J;                             /* J                                  */
utf *utf_F;                             /* F                                  */
utf *utf_D;                             /* D                                  */

utf *utf_void__void;                    /* ()V                                */
utf *utf_boolean__void;                 /* (Z)V                               */
utf *utf_byte__void;                    /* (B)V                               */
utf *utf_char__void;                    /* (C)V                               */
utf *utf_short__void;                   /* (S)V                               */
utf *utf_int__void;                     /* (I)V                               */
utf *utf_long__void;                    /* (J)V                               */
utf *utf_float__void;                   /* (F)V                               */
utf *utf_double__void;                  /* (D)V                               */

utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
/* (Ljava/lang/Object;)Ljava/lang/Object;                                     */
utf *utf_java_lang_Object__java_lang_Object;
utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
/* (Ljava/lang/String;)Ljava/lang/Class;                                      */
utf *utf_java_lang_String__java_lang_Class;
utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */

utf *utf_not_named_yet;                 /* special name for unnamed classes   */

/* interned in utf8_init from the text "\t<the array package>"                */
utf *array_packagename;
163
164
165 /* utf_init ********************************************************************
166
167    Initializes the utf8 subsystem.
168
169 *******************************************************************************/
170
171 bool utf8_init(void)
172 {
173         /* create utf8 hashtable */
174
175         hashtable_create(&hashtable_utf, HASHTABLE_UTF_SIZE);
176
177 #if defined(STATISTICS)
178         if (opt_stat)
179                 count_utf_len += sizeof(utf*) * hashtable_utf.size;
180 #endif
181
182 #if defined(USE_THREADS)
183         /* create utf hashtable lock object */
184
185         lock_hashtable_utf = NEW(java_objectheader);
186
187 # if defined(NATIVE_THREADS)
188         initObjectLock(lock_hashtable_utf);
189 # endif
190 #endif
191
192         /* create utf-symbols for pointer comparison of frequently used strings */
193
194         utf_java_lang_Object           = utf_new_char("java/lang/Object");
195
196         utf_java_lang_Class            = utf_new_char("java/lang/Class");
197         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
198         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
199         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
200         utf_java_lang_String           = utf_new_char("java/lang/String");
201         utf_java_lang_System           = utf_new_char("java/lang/System");
202         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
203         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
204
205         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
206         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
207         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
208
209         utf_java_lang_NoClassDefFoundError =
210                 utf_new_char(string_java_lang_NoClassDefFoundError);
211
212         utf_java_lang_LinkageError =
213                 utf_new_char(string_java_lang_LinkageError);
214
215         utf_java_lang_NoSuchMethodError =
216                 utf_new_char(string_java_lang_NoSuchMethodError);
217
218         utf_java_lang_OutOfMemoryError =
219                 utf_new_char(string_java_lang_OutOfMemoryError);
220
221         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
222
223         utf_java_lang_ClassNotFoundException =
224                 utf_new_char(string_java_lang_ClassNotFoundException);
225
226         utf_java_lang_IllegalArgumentException =
227                 utf_new_char(string_java_lang_IllegalArgumentException);
228
229         utf_java_lang_IllegalMonitorStateException =
230                 utf_new_char(string_java_lang_IllegalMonitorStateException);
231
232         utf_java_lang_NullPointerException =
233                 utf_new_char(string_java_lang_NullPointerException);
234
235         utf_java_lang_Void             = utf_new_char("java/lang/Void");
236         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
237         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
238         utf_java_lang_Character        = utf_new_char("java/lang/Character");
239         utf_java_lang_Short            = utf_new_char("java/lang/Short");
240         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
241         utf_java_lang_Long             = utf_new_char("java/lang/Long");
242         utf_java_lang_Float            = utf_new_char("java/lang/Float");
243         utf_java_lang_Double           = utf_new_char("java/lang/Double");
244
245         utf_java_lang_StackTraceElement =
246                 utf_new_char("java/lang/StackTraceElement");
247
248         utf_java_lang_reflect_Constructor =
249                 utf_new_char("java/lang/reflect/Constructor");
250
251         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
252         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
253         utf_java_util_Vector           = utf_new_char("java/util/Vector");
254
255         utf_InnerClasses               = utf_new_char("InnerClasses");
256         utf_ConstantValue              = utf_new_char("ConstantValue");
257         utf_Code                       = utf_new_char("Code");
258         utf_Exceptions                 = utf_new_char("Exceptions");
259         utf_LineNumberTable            = utf_new_char("LineNumberTable");
260         utf_SourceFile                 = utf_new_char("SourceFile");
261
262         utf_init                           = utf_new_char("<init>");
263         utf_clinit                         = utf_new_char("<clinit>");
264         utf_clone                      = utf_new_char("clone");
265         utf_finalize                   = utf_new_char("finalize");
266         utf_run                        = utf_new_char("run");
267
268         utf_add                        = utf_new_char("add");
269         utf_remove                     = utf_new_char("remove");
270         utf_put                        = utf_new_char("put");
271         utf_get                        = utf_new_char("get");
272         utf_value                      = utf_new_char("value");
273
274         utf_printStackTrace            = utf_new_char("printStackTrace");
275         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
276         utf_loadClass                  = utf_new_char("loadClass");
277         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
278
279         utf_Z                          = utf_new_char("Z");
280         utf_B                          = utf_new_char("B");
281         utf_C                          = utf_new_char("C");
282         utf_S                          = utf_new_char("S");
283         utf_I                          = utf_new_char("I");
284         utf_J                          = utf_new_char("J");
285         utf_F                          = utf_new_char("F");
286         utf_D                          = utf_new_char("D");
287
288         utf_void__void                 = utf_new_char("()V");
289         utf_boolean__void              = utf_new_char("(Z)V");
290         utf_byte__void                 = utf_new_char("(B)V");
291         utf_char__void                 = utf_new_char("(C)V");
292         utf_short__void                = utf_new_char("(S)V");
293         utf_int__void                  = utf_new_char("(I)V");
294         utf_long__void                 = utf_new_char("(J)V");
295         utf_float__void                = utf_new_char("(F)V");
296         utf_double__void               = utf_new_char("(D)V");
297         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
298         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
299
300         utf_void__java_lang_ClassLoader =
301                 utf_new_char("()Ljava/lang/ClassLoader;");
302
303         utf_java_lang_Object__java_lang_Object =
304                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
305
306         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
307
308         utf_java_lang_String__java_lang_Class =
309                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
310
311         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
312
313         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
314
315         array_packagename              = utf_new_char("\t<the array package>");
316
317         /* everything's ok */
318
319         return true;
320 }
321
322
323 /* utf_hashkey *****************************************************************
324
325    The hashkey is computed from the utf-text by using up to 8
326    characters.  For utf-symbols longer than 15 characters 3 characters
327    are taken from the beginning and the end, 2 characters are taken
328    from the middle.
329
330 *******************************************************************************/
331
332 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
333 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
334
335 u4 utf_hashkey(const char *text, u4 length)
336 {
337         const char *start_pos = text;       /* pointer to utf text                */
338         u4 a;
339
340         switch (length) {
341         case 0: /* empty string */
342                 return 0;
343
344         case 1: return fbs(0);
345         case 2: return fbs(0) ^ nbs(3);
346         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
347         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
348         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
349         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
350         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
351         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
352
353         case 9:
354                 a = fbs(0);
355                 a ^= nbs(1);
356                 a ^= nbs(2);
357                 text++;
358                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
359
360         case 10:
361                 a = fbs(0);
362                 text++;
363                 a ^= nbs(2);
364                 a ^= nbs(3);
365                 a ^= nbs(4);
366                 text++;
367                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
368
369         case 11:
370                 a = fbs(0);
371                 text++;
372                 a ^= nbs(2);
373                 a ^= nbs(3);
374                 a ^= nbs(4);
375                 text++;
376                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
377
378         case 12:
379                 a = fbs(0);
380                 text += 2;
381                 a ^= nbs(2);
382                 a ^= nbs(3);
383                 text++;
384                 a ^= nbs(5);
385                 a ^= nbs(6);
386                 a ^= nbs(7);
387                 text++;
388                 return a ^ nbs(9) ^ nbs(10);
389
390         case 13:
391                 a = fbs(0);
392                 a ^= nbs(1);
393                 text++;
394                 a ^= nbs(3);
395                 a ^= nbs(4);
396                 text += 2;      
397                 a ^= nbs(7);
398                 a ^= nbs(8);
399                 text += 2;
400                 return a ^ nbs(9) ^ nbs(10);
401
402         case 14:
403                 a = fbs(0);
404                 text += 2;      
405                 a ^= nbs(3);
406                 a ^= nbs(4);
407                 text += 2;      
408                 a ^= nbs(7);
409                 a ^= nbs(8);
410                 text += 2;
411                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
412
413         case 15:
414                 a = fbs(0);
415                 text += 2;      
416                 a ^= nbs(3);
417                 a ^= nbs(4);
418                 text += 2;      
419                 a ^= nbs(7);
420                 a ^= nbs(8);
421                 text += 2;
422                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
423
424         default:  /* 3 characters from beginning */
425                 a = fbs(0);
426                 text += 2;
427                 a ^= nbs(3);
428                 a ^= nbs(4);
429
430                 /* 2 characters from middle */
431                 text = start_pos + (length / 2);
432                 a ^= fbs(5);
433                 text += 2;
434                 a ^= nbs(6);    
435
436                 /* 3 characters from end */
437                 text = start_pos + length - 4;
438
439                 a ^= fbs(7);
440                 text++;
441
442                 return a ^ nbs(10) ^ nbs(11);
443     }
444 }
445
446
447 /* utf_hashkey *****************************************************************
448
449    Compute the hashkey of a unicode string.
450
451 *******************************************************************************/
452
453 u4 unicode_hashkey(u2 *text, u2 len)
454 {
455         return utf_hashkey((char *) text, len);
456 }
457
458
459 /* utf_new *********************************************************************
460
461    Creates a new utf-symbol, the text of the symbol is passed as a
462    u1-array. The function searches the utf-hashtable for a utf-symbol
463    with this text. On success the element returned, otherwise a new
464    hashtable element is created.
465
466    If the number of entries in the hashtable exceeds twice the size of
467    the hashtable slots a reorganization of the hashtable is done and
468    the utf symbols are copied to a new hashtable with doubled size.
469
470 *******************************************************************************/
471
472 utf *utf_new(const char *text, u2 length)
473 {
474         u4 key;                             /* hashkey computed from utf-text     */
475         u4 slot;                            /* slot in hashtable                  */
476         utf *u;                             /* hashtable element                  */
477         u2 i;
478
479 #if defined(USE_THREADS)
480         builtin_monitorenter(lock_hashtable_utf);
481 #endif
482
483 #if defined(STATISTICS)
484         if (opt_stat)
485                 count_utf_new++;
486 #endif
487
488         key  = utf_hashkey(text, length);
489         slot = key & (hashtable_utf.size - 1);
490         u    = hashtable_utf.ptr[slot];
491
492         /* search external hash chain for utf-symbol */
493
494         while (u) {
495                 if (u->blength == length) {
496                         /* compare text of hashtable elements */
497
498                         for (i = 0; i < length; i++)
499                                 if (text[i] != u->text[i])
500                                         goto nomatch;
501                         
502 #if defined(STATISTICS)
503                         if (opt_stat)
504                                 count_utf_new_found++;
505 #endif
506
507                         /* symbol found in hashtable */
508
509 #if defined(USE_THREADS)
510                         builtin_monitorexit(lock_hashtable_utf);
511 #endif
512
513                         return u;
514                 }
515
516         nomatch:
517                 u = u->hashlink; /* next element in external chain */
518         }
519
520 #if defined(STATISTICS)
521         if (opt_stat)
522                 count_utf_len += sizeof(utf) + length + 1;
523 #endif
524
525         /* location in hashtable found, create new utf element */
526         u = NEW(utf);
527         u->blength  = length;               /* length in bytes of utfstring       */
528         u->hashlink = hashtable_utf.ptr[slot]; /* link in external hashchain      */
529         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
530
531         memcpy(u->text, text, length);      /* copy utf-text                      */
532         u->text[length] = '\0';
533
534         hashtable_utf.ptr[slot] = u;        /* insert symbol into table           */
535         hashtable_utf.entries++;            /* update number of entries           */
536
537         if (hashtable_utf.entries > (hashtable_utf.size * 2)) {
538
539         /* reorganization of hashtable, average length of the external
540            chains is approx. 2 */
541
542                 hashtable  newhash;                              /* the new hashtable */
543                 u4         i;
544                 utf       *u;
545                 utf       *nextu;
546                 u4         slot;
547
548                 /* create new hashtable, double the size */
549
550                 hashtable_create(&newhash, hashtable_utf.size * 2);
551                 newhash.entries = hashtable_utf.entries;
552
553 #if defined(STATISTICS)
554                 if (opt_stat)
555                         count_utf_len += sizeof(utf*) * hashtable_utf.size;
556 #endif
557
558                 /* transfer elements to new hashtable */
559
560                 for (i = 0; i < hashtable_utf.size; i++) {
561                         u = hashtable_utf.ptr[i];
562
563                         while (u) {
564                                 nextu = u->hashlink;
565                                 slot  = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
566                                                 
567                                 u->hashlink = (utf *) newhash.ptr[slot];
568                                 newhash.ptr[slot] = u;
569
570                                 /* follow link in external hash chain */
571
572                                 u = nextu;
573                         }
574                 }
575         
576                 /* dispose old table */
577
578                 MFREE(hashtable_utf.ptr, void*, hashtable_utf.size);
579                 hashtable_utf = newhash;
580         }
581
582 #if defined(USE_THREADS)
583         builtin_monitorexit(lock_hashtable_utf);
584 #endif
585
586         return u;
587 }
588
589
590 /* utf_new_u2 ******************************************************************
591
592    Make utf symbol from u2 array, if isclassname is true '.' is
593    replaced by '/'.
594
595 *******************************************************************************/
596
597 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
598 {
599         char *buffer;                   /* memory buffer for  unicode characters  */
600         char *pos;                      /* pointer to current position in buffer  */
601         u4 left;                        /* unicode characters left                */
602         u4 buflength;                   /* utf length in bytes of the u2 array    */
603         utf *result;                    /* resulting utf-string                   */
604         int i;          
605
606         /* determine utf length in bytes and allocate memory */
607
608         buflength = u2_utflength(unicode_pos, unicode_length); 
609         buffer    = MNEW(char, buflength);
610  
611         left = buflength;
612         pos  = buffer;
613
614         for (i = 0; i++ < unicode_length; unicode_pos++) {
615                 /* next unicode character */
616                 u2 c = *unicode_pos;
617                 
618                 if ((c != 0) && (c < 0x80)) {
619                         /* 1 character */       
620                         left--;
621                 if ((int) left < 0) break;
622                         /* convert classname */
623                         if (isclassname && c == '.')
624                                 *pos++ = '/';
625                         else
626                                 *pos++ = (char) c;
627
628                 } else if (c < 0x800) {             
629                         /* 2 characters */                              
630                 unsigned char high = c >> 6;
631                 unsigned char low  = c & 0x3F;
632                         left = left - 2;
633                 if ((int) left < 0) break;
634                 *pos++ = high | 0xC0; 
635                 *pos++ = low  | 0x80;     
636
637                 } else {         
638                 /* 3 characters */                              
639                 char low  = c & 0x3f;
640                 char mid  = (c >> 6) & 0x3F;
641                 char high = c >> 12;
642                         left = left - 3;
643                 if ((int) left < 0) break;
644                 *pos++ = high | 0xE0; 
645                 *pos++ = mid  | 0x80;  
646                 *pos++ = low  | 0x80;   
647                 }
648         }
649         
650         /* insert utf-string into symbol-table */
651         result = utf_new(buffer,buflength);
652
653         MFREE(buffer, char, buflength);
654
655         return result;
656 }
657
658
659 /* utf_new_char ****************************************************************
660
661    Creates a new utf symbol, the text for this symbol is passed as a
662    c-string ( = char* ).
663
664 *******************************************************************************/
665
666 utf *utf_new_char(const char *text)
667 {
668         return utf_new(text, strlen(text));
669 }
670
671
672 /* utf_new_char_classname ******************************************************
673
674    Creates a new utf symbol, the text for this symbol is passed as a
675    c-string ( = char* ) "." characters are going to be replaced by
676    "/". Since the above function is used often, this is a separte
677    function, instead of an if.
678
679 *******************************************************************************/
680
681 utf *utf_new_char_classname(const char *text)
682 {
683         if (strchr(text, '.')) {
684                 char *txt = strdup(text);
685                 char *end = txt + strlen(txt);
686                 char *c;
687                 utf *tmpRes;
688
689                 for (c = txt; c < end; c++)
690                         if (*c == '.') *c = '/';
691
692                 tmpRes = utf_new(txt, strlen(txt));
693                 FREE(txt, 0);
694
695                 return tmpRes;
696
697         } else
698                 return utf_new(text, strlen(text));
699 }
700
701
702 /* utf_nextu2 ******************************************************************
703
704    Read the next unicode character from the utf string and increment
705    the utf-string pointer accordingly.
706
707 *******************************************************************************/
708
709 u2 utf_nextu2(char **utf_ptr)
710 {
711     /* uncompressed unicode character */
712     u2 unicode_char = 0;
713     /* current position in utf text */  
714     unsigned char *utf = (unsigned char *) (*utf_ptr);
715     /* bytes representing the unicode character */
716     unsigned char ch1, ch2, ch3;
717     /* number of bytes used to represent the unicode character */
718     int len = 0;
719         
720     switch ((ch1 = utf[0]) >> 4) {
721         default: /* 1 byte */
722                 (*utf_ptr)++;
723                 return (u2) ch1;
724         case 0xC: 
725         case 0xD: /* 2 bytes */
726                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
727                         unsigned char high = ch1 & 0x1F;
728                         unsigned char low  = ch2 & 0x3F;
729                         unicode_char = (high << 6) + low;
730                         len = 2;
731                 }
732                 break;
733
734         case 0xE: /* 2 or 3 bytes */
735                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
736                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
737                                 unsigned char low  = ch3 & 0x3f;
738                                 unsigned char mid  = ch2 & 0x3f;
739                                 unsigned char high = ch1 & 0x0f;
740                                 unicode_char = (((high << 6) + mid) << 6) + low;
741                                 len = 3;
742                         } else
743                                 len = 2;                                           
744                 }
745                 break;
746     }
747
748     /* update position in utf-text */
749     *utf_ptr = (char *) (utf + len);
750
751     return unicode_char;
752 }
753
754
755 /* utf_strlen ******************************************************************
756
757    Determine number of unicode characters in the utf string.
758
759 *******************************************************************************/
760
761 u4 utf_strlen(utf *u)
762 {
763         char *endpos;                       /* points behind utf string           */
764         char *utf_ptr;                      /* current position in utf text       */
765         u4 len = 0;                         /* number of unicode characters       */
766
767         if (!u) {
768                 *exceptionptr = new_nullpointerexception();
769                 return 0;
770         }
771
772         endpos = UTF_END(u);
773         utf_ptr = u->text;
774
775         while (utf_ptr < endpos) {
776                 len++;
777                 /* next unicode character */
778                 utf_nextu2(&utf_ptr);
779         }
780
781         if (utf_ptr != endpos)
782                 /* string ended abruptly */
783                 throw_cacao_exception_exit(string_java_lang_InternalError,
784                                                                    "Illegal utf8 string");
785
786         return len;
787 }
788
789
790 /* u2_utflength ****************************************************************
791
792    Returns the utf length in bytes of a u2 array.
793
794 *******************************************************************************/
795
796 u4 u2_utflength(u2 *text, u4 u2_length)
797 {
798         u4 result_len = 0;                  /* utf length in bytes                */
799         u2 ch;                              /* current unicode character          */
800         u4 len;
801         
802         for (len = 0; len < u2_length; len++) {
803                 /* next unicode character */
804                 ch = *text++;
805           
806                 /* determine bytes required to store unicode character as utf */
807                 if (ch && (ch < 0x80)) 
808                         result_len++;
809                 else if (ch < 0x800)
810                         result_len += 2;        
811                 else 
812                         result_len += 3;        
813         }
814
815     return result_len;
816 }
817
818
819 /* utf_display *****************************************************************
820
821    Write utf symbol to stdout (for debugging purposes).
822
823 *******************************************************************************/
824
825 void utf_display(utf *u)
826 {
827         char *endpos;                       /* points behind utf string           */
828         char *utf_ptr;                      /* current position in utf text       */
829
830         if (!u) {
831                 printf("NULL");
832                 fflush(stdout);
833                 return;
834         }
835
836         endpos = UTF_END(u);
837         utf_ptr = u->text;
838
839         while (utf_ptr < endpos) {
840                 /* read next unicode character */                
841                 u2 c = utf_nextu2(&utf_ptr);
842                 if (c >= 32 && c <= 127) printf("%c", c);
843                 else printf("?");
844         }
845
846         fflush(stdout);
847 }
848
849
850 /* utf_display_classname *******************************************************
851
852    Write utf symbol to stdout with `/' converted to `.' (for debugging
853    purposes).
854
855 *******************************************************************************/
856
857 void utf_display_classname(utf *u)
858 {
859         char *endpos;                       /* points behind utf string           */
860         char *utf_ptr;                      /* current position in utf text       */
861
862         if (!u) {
863                 printf("NULL");
864                 fflush(stdout);
865                 return;
866         }
867
868         endpos = UTF_END(u);
869         utf_ptr = u->text;
870
871         while (utf_ptr < endpos) {
872                 /* read next unicode character */                
873                 u2 c = utf_nextu2(&utf_ptr);
874                 if (c == '/') c = '.';
875                 if (c >= 32 && c <= 127) printf("%c", c);
876                 else printf("?");
877         }
878
879         fflush(stdout);
880 }
881
882
883 /* utf_sprint ******************************************************************
884         
885    Write utf symbol into c-string (for debugging purposes).
886
887 *******************************************************************************/
888
889 void utf_sprint(char *buffer, utf *u)
890 {
891         char *endpos;                       /* points behind utf string           */
892         char *utf_ptr;                      /* current position in utf text       */
893         u2 pos = 0;                         /* position in c-string               */
894
895         if (!u) {
896                 strcpy(buffer, "NULL");
897                 return;
898         }
899
900         endpos = UTF_END(u);
901         utf_ptr = u->text;
902
903         while (utf_ptr < endpos) 
904                 /* copy next unicode character */       
905                 buffer[pos++] = utf_nextu2(&utf_ptr);
906
907         /* terminate string */
908         buffer[pos] = '\0';
909 }
910
911
912 /* utf_sprint_classname ********************************************************
913         
914    Write utf symbol into c-string with `/' converted to `.' (for debugging
915    purposes).
916
917 *******************************************************************************/
918
919 void utf_sprint_classname(char *buffer, utf *u)
920 {
921         char *endpos;                       /* points behind utf string           */
922         char *utf_ptr;                      /* current position in utf text       */
923         u2 pos = 0;                         /* position in c-string               */
924
925         if (!u) {
926                 strcpy(buffer, "NULL");
927                 return;
928         }
929
930         endpos = UTF_END(u);
931         utf_ptr = u->text;
932
933         while (utf_ptr < endpos) {
934                 /* copy next unicode character */       
935                 u2 c = utf_nextu2(&utf_ptr);
936                 if (c == '/') c = '.';
937                 buffer[pos++] = c;
938         }
939
940         /* terminate string */
941         buffer[pos] = '\0';
942 }
943
944
945 /* utf_strcat ******************************************************************
946         
947    Like libc strcat, but uses an utf8 string.
948
949 *******************************************************************************/
950
951 void utf_strcat(char *buffer, utf *u)
952 {
953         utf_sprint(buffer + strlen(buffer), u);
954 }
955
956
957 /* utf_strcat_classname ********************************************************
958         
959    Like libc strcat, but uses an utf8 string.
960
961 *******************************************************************************/
962
963 void utf_strcat_classname(char *buffer, utf *u)
964 {
965         utf_sprint_classname(buffer + strlen(buffer), u);
966 }
967
968
969 /* utf_fprint ******************************************************************
970         
971    Write utf symbol into file.
972
973 *******************************************************************************/
974
975 void utf_fprint(FILE *file, utf *u)
976 {
977         char *endpos;                       /* points behind utf string           */
978         char *utf_ptr;                      /* current position in utf text       */
979
980         if (!u)
981                 return;
982
983         endpos = UTF_END(u);
984         utf_ptr = u->text;
985
986         while (utf_ptr < endpos) { 
987                 /* read next unicode character */                
988                 u2 c = utf_nextu2(&utf_ptr);                            
989
990                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
991                 else fprintf(file, "?");
992         }
993 }
994
995
996 /* utf_fprint_classname ********************************************************
997         
998    Write utf symbol into file with `/' converted to `.'.
999
1000 *******************************************************************************/
1001
1002 void utf_fprint_classname(FILE *file, utf *u)
1003 {
1004         char *endpos;                       /* points behind utf string           */
1005         char *utf_ptr;                      /* current position in utf text       */
1006
1007     if (!u)
1008                 return;
1009
1010         endpos = UTF_END(u);
1011         utf_ptr = u->text;
1012
1013         while (utf_ptr < endpos) { 
1014                 /* read next unicode character */                
1015                 u2 c = utf_nextu2(&utf_ptr);                            
1016                 if (c == '/') c = '.';
1017
1018                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1019                 else fprintf(file, "?");
1020         }
1021 }
1022
1023
/* is_valid_utf ****************************************************************

   Return true if the given byte range is a valid UTF-8 string in the
   sense accepted here:

   - a 0x00 byte is not allowed
   - only one-, two- and three-byte sequences are accepted; longer
     sequences are rejected (Java limitation)
   - every continuation byte must have the form 10xx xxxx
   - the value zero must be encoded as a two-byte sequence (Java special)

   Deliberately NOT rejected, since Sun Java seems to accept them in
   classfiles: overlong encodings of non-zero values, surrogates
   (0xd800-0xdfff) and the codepoints 0xfffe/0xffff.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	const unsigned char *cursor;        /* next byte to examine               */
	const unsigned char *limit;         /* first byte behind the string       */

	if (end_pos < utf_ptr)
		return false;

	cursor = (const unsigned char *) utf_ptr;
	limit  = (const unsigned char *) end_pos;

	while (cursor < limit) {
		unsigned char lead = *cursor++;
		unsigned long value;            /* decoded codepoint                  */
		int           trailing;         /* number of continuation bytes       */
		int           k;

		if (lead == 0x00)
			return false;               /* 0x00 is not allowed                */

		if (lead < 0x80)
			continue;                   /* ASCII                              */

		if ((lead & 0xe0) == 0xc0) {    /* 110x xxxx */
			trailing = 1;
			value    = lead & 0x1f;

		} else if ((lead & 0xf0) == 0xe0) { /* 1110 xxxx */
			trailing = 2;
			value    = lead & 0x0f;

		} else {
			/* either a longer sequence (Java limitation) or an invalid
			   leading byte */
			return false;
		}

		if ((limit - cursor) < trailing)
			return false;               /* missing bytes                      */

		for (k = 0; k < trailing; k++) {
			unsigned char cont = *cursor++;

			if ((cont & 0xc0) != 0x80)  /* 10xx xxxx */
				return false;

			value = (value << 6) | (cont & 0x3f);
		}

		/* zero may only appear in its two-byte form (Java special) */

		if ((value == 0) && (trailing != 1))
			return false;
	}

	return true;
}
1089
1090
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters below 0x20 and the two-byte encoding of zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

   No byte at or behind end_pos is ever read, even if the string ends
   with a lone 0xc0 byte.

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters */

		/* disallow zero; the bounds check keeps the look-ahead inside
		   the string when 0xc0 is its last byte */

		if (c == 0xc0 && utf_ptr < end_pos && (unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
1118
1119 bool is_valid_name_utf(utf *u)
1120 {
1121         return is_valid_name(u->text, UTF_END(u));
1122 }
1123
1124
1125 /* utf_show ********************************************************************
1126
1127    Writes the utf symbols in the utfhash to stdout and displays the
1128    number of external hash chains grouped according to the chainlength
1129    (for debugging purposes).
1130
1131 *******************************************************************************/
1132
1133 #if !defined(NDEBUG)
1134 void utf_show(void)
1135 {
1136
1137 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1138
1139         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1140         u4 max_chainlength = 0;      /* maximum length of the chains */
1141         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1142         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1143         u4 i;
1144
1145         printf("UTF-HASH:\n");
1146
1147         /* show element of utf-hashtable */
1148
1149         for (i = 0; i < hashtable_utf.size; i++) {
1150                 utf *u = hashtable_utf.ptr[i];
1151
1152                 if (u) {
1153                         printf("SLOT %d: ", (int) i);
1154
1155                         while (u) {
1156                                 printf("'");
1157                                 utf_display(u);
1158                                 printf("' ");
1159                                 u = u->hashlink;
1160                         }       
1161                         printf("\n");
1162                 }
1163         }
1164
1165         printf("UTF-HASH: %d slots for %d entries\n", 
1166                    (int) hashtable_utf.size, (int) hashtable_utf.entries );
1167
1168         if (hashtable_utf.entries == 0)
1169                 return;
1170
1171         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1172
1173         for (i=0;i<CHAIN_LIMIT;i++)
1174                 chain_count[i]=0;
1175
1176         /* count numbers of hashchains according to their length */
1177         for (i=0; i<hashtable_utf.size; i++) {
1178                   
1179                 utf *u = (utf*) hashtable_utf.ptr[i];
1180                 u4 chain_length = 0;
1181
1182                 /* determine chainlength */
1183                 while (u) {
1184                         u = u->hashlink;
1185                         chain_length++;
1186                 }
1187
1188                 /* update sum of all chainlengths */
1189                 sum_chainlength+=chain_length;
1190
1191                 /* determine the maximum length of the chains */
1192                 if (chain_length>max_chainlength)
1193                         max_chainlength = chain_length;
1194
1195                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1196                 if (chain_length>=CHAIN_LIMIT) {
1197                         beyond_limit+=chain_length;
1198                         chain_length=CHAIN_LIMIT-1;
1199                 }
1200
1201                 /* update number of hashchains of current length */
1202                 chain_count[chain_length]++;
1203         }
1204
1205         /* display results */  
1206         for (i=1;i<CHAIN_LIMIT-1;i++) 
1207                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf.entries));
1208           
1209         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf.entries);
1210
1211
1212         printf("max. chainlength:%5d\n",max_chainlength);
1213
1214         /* avg. chainlength = sum of chainlengths / number of chains */
1215         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf.size-chain_count[0]));
1216 }
1217 #endif /* !defined(NDEBUG) */
1218
1219
1220 /*
1221  * These are local overrides for various environment variables in Emacs.
1222  * Please do not remove this and leave it at the end of the file, where
1223  * Emacs will automagically detect them.
1224  * ---------------------------------------------------------------------
1225  * Local variables:
1226  * mode: c
1227  * indent-tabs-mode: t
1228  * c-basic-offset: 4
1229  * tab-width: 4
1230  * End:
1231  */