4f3d6e22aaf23800adf5794e7aedadfcafa281b0
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf8.c - utf functions
2
3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
6    Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23    02111-1307, USA.
24
25    Contact: cacao@complang.tuwien.ac.at
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 3687 2005-11-16 19:13:37Z edwin $
34
35 */
36
37
38 #include <string.h>
39 #include <assert.h>
40
41 #include "mm/memory.h"
42 #include "vm/exceptions.h"
43 #include "vm/options.h"
44 #include "vm/statistics.h"
45 #include "vm/stringlocal.h"
46 #include "vm/tables.h"
47 #include "vm/utf8.h"
48
49 /* global variables ***********************************************************/
50
51 #if defined(USE_THREADS)
52 static java_objectheader *lock_utf_hashtable;
53 #endif
54
55 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
56
57
58 /* utf-symbols for pointer comparison of frequently used strings **************/
59
60 utf *utf_java_lang_Object;              /* java/lang/Object                   */
61
62 utf *utf_java_lang_Class;
63 utf *utf_java_lang_ClassLoader;
64 utf *utf_java_lang_Cloneable;
65 utf *utf_java_lang_SecurityManager;
66 utf *utf_java_lang_String;
67 utf *utf_java_lang_System;
68 utf *utf_java_lang_ThreadGroup;
69 utf *utf_java_io_Serializable;
70
71 utf *utf_java_lang_Throwable;
72 utf *utf_java_lang_VMThrowable;
73 utf *utf_java_lang_Error;
74 utf *utf_java_lang_NoClassDefFoundError;
75 utf *utf_java_lang_NoSuchMethodError;
76 utf *utf_java_lang_OutOfMemoryError;
77
78 utf *utf_java_lang_Exception;
79 utf *utf_java_lang_ClassNotFoundException;
80 utf *utf_java_lang_IllegalArgumentException;
81 utf *utf_java_lang_IllegalMonitorStateException;
82
83 utf *utf_java_lang_NullPointerException;
84
85 utf* utf_java_lang_Void;
86 utf* utf_java_lang_Boolean;
87 utf* utf_java_lang_Byte;
88 utf* utf_java_lang_Character;
89 utf* utf_java_lang_Short;
90 utf* utf_java_lang_Integer;
91 utf* utf_java_lang_Long;
92 utf* utf_java_lang_Float;
93 utf* utf_java_lang_Double;
94
95 utf *utf_java_lang_StackTraceElement;
96 utf *utf_java_lang_reflect_Constructor;
97 utf *utf_java_lang_reflect_Field;
98 utf *utf_java_lang_reflect_Method;
99 utf *utf_java_util_Vector;
100
101 utf *utf_InnerClasses;                  /* InnerClasses                       */
102 utf *utf_ConstantValue;                 /* ConstantValue                      */
103 utf *utf_Code;                          /* Code                               */
104 utf *utf_Exceptions;                    /* Exceptions                         */
105 utf *utf_LineNumberTable;               /* LineNumberTable                    */
106 utf *utf_SourceFile;                    /* SourceFile                         */
107
108 utf *utf_init;                          /* <init>                             */
109 utf *utf_clinit;                        /* <clinit>                           */
110 utf *utf_clone;                         /* clone                              */
111 utf *utf_finalize;                      /* finalize                           */
112 utf *utf_run;                           /* run                                */
113
114 utf *utf_add;                           /* add                                */
115 utf *utf_remove;                        /* remove                             */
116 utf *utf_put;                           /* put                                */
117 utf *utf_get;                           /* get                                */
118 utf *utf_value;                         /* value                              */
119
120 utf *utf_fillInStackTrace;
121 utf *utf_getSystemClassLoader;
122 utf *utf_loadClass;
123 utf *utf_printStackTrace;
124
125 utf *utf_Z;                             /* Z                                  */
126 utf *utf_B;                             /* B                                  */
127 utf *utf_C;                             /* C                                  */
128 utf *utf_S;                             /* S                                  */
129 utf *utf_I;                             /* I                                  */
130 utf *utf_J;                             /* J                                  */
131 utf *utf_F;                             /* F                                  */
132 utf *utf_D;                             /* D                                  */
133
134 utf *utf_void__void;                    /* ()V                                */
135 utf *utf_boolean__void;                 /* (Z)V                               */
136 utf *utf_byte__void;                    /* (B)V                               */
137 utf *utf_char__void;                    /* (C)V                               */
138 utf *utf_short__void;                   /* (S)V                               */
139 utf *utf_int__void;                     /* (I)V                               */
140 utf *utf_long__void;                    /* (J)V                               */
141 utf *utf_float__void;                   /* (F)V                               */
142 utf *utf_double__void;                  /* (D)V                               */
143
144 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
145 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
146 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
147 utf *utf_java_lang_Object__java_lang_Object;
148 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
149 utf *utf_java_lang_String__java_lang_Class;
150 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
151
152 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
153
154 utf *array_packagename;
155
156
157 /* utf_init ********************************************************************
158
159    Initializes the utf8 subsystem.
160
161 *******************************************************************************/
162
163 bool utf8_init(void)
164 {
165 #if defined(USE_THREADS)
166         /* create utf hashtable lock object */
167
168         lock_utf_hashtable = NEW(java_objectheader);
169
170 # if defined(NATIVE_THREADS)
171         initObjectLock(lock_utf_hashtable);
172 # endif
173 #endif
174
175         /* create utf-symbols for pointer comparison of frequently used strings */
176
177         utf_java_lang_Object           = utf_new_char("java/lang/Object");
178
179         utf_java_lang_Class            = utf_new_char("java/lang/Class");
180         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
181         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
182         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
183         utf_java_lang_String           = utf_new_char("java/lang/String");
184         utf_java_lang_System           = utf_new_char("java/lang/System");
185         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
186         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
187
188         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
189         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
190         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
191
192         utf_java_lang_NoClassDefFoundError =
193                 utf_new_char(string_java_lang_NoClassDefFoundError);
194
195         utf_java_lang_NoSuchMethodError =
196                 utf_new_char(string_java_lang_NoSuchMethodError);
197
198         utf_java_lang_OutOfMemoryError =
199                 utf_new_char(string_java_lang_OutOfMemoryError);
200
201         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
202
203         utf_java_lang_ClassNotFoundException =
204                 utf_new_char(string_java_lang_ClassNotFoundException);
205
206         utf_java_lang_IllegalArgumentException =
207                 utf_new_char(string_java_lang_IllegalArgumentException);
208
209         utf_java_lang_IllegalMonitorStateException =
210                 utf_new_char(string_java_lang_IllegalMonitorStateException);
211
212         utf_java_lang_NullPointerException =
213                 utf_new_char(string_java_lang_NullPointerException);
214
215         utf_java_lang_Void             = utf_new_char("java/lang/Void");
216         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
217         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
218         utf_java_lang_Character        = utf_new_char("java/lang/Character");
219         utf_java_lang_Short            = utf_new_char("java/lang/Short");
220         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
221         utf_java_lang_Long             = utf_new_char("java/lang/Long");
222         utf_java_lang_Float            = utf_new_char("java/lang/Float");
223         utf_java_lang_Double           = utf_new_char("java/lang/Double");
224
225         utf_java_lang_StackTraceElement =
226                 utf_new_char("java/lang/StackTraceElement");
227
228         utf_java_lang_reflect_Constructor =
229                 utf_new_char("java/lang/reflect/Constructor");
230
231         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
232         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
233         utf_java_util_Vector           = utf_new_char("java/util/Vector");
234
235         utf_InnerClasses               = utf_new_char("InnerClasses");
236         utf_ConstantValue              = utf_new_char("ConstantValue");
237         utf_Code                       = utf_new_char("Code");
238         utf_Exceptions                 = utf_new_char("Exceptions");
239         utf_LineNumberTable            = utf_new_char("LineNumberTable");
240         utf_SourceFile                 = utf_new_char("SourceFile");
241
242         utf_init                           = utf_new_char("<init>");
243         utf_clinit                         = utf_new_char("<clinit>");
244         utf_clone                      = utf_new_char("clone");
245         utf_finalize                   = utf_new_char("finalize");
246         utf_run                        = utf_new_char("run");
247
248         utf_add                        = utf_new_char("add");
249         utf_remove                     = utf_new_char("remove");
250         utf_put                        = utf_new_char("put");
251         utf_get                        = utf_new_char("get");
252         utf_value                      = utf_new_char("value");
253
254         utf_printStackTrace            = utf_new_char("printStackTrace");
255         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
256         utf_loadClass                  = utf_new_char("loadClass");
257         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
258
259         utf_Z                          = utf_new_char("Z");
260         utf_B                          = utf_new_char("B");
261         utf_C                          = utf_new_char("C");
262         utf_S                          = utf_new_char("S");
263         utf_I                          = utf_new_char("I");
264         utf_J                          = utf_new_char("J");
265         utf_F                          = utf_new_char("F");
266         utf_D                          = utf_new_char("D");
267
268         utf_void__void                 = utf_new_char("()V");
269         utf_boolean__void              = utf_new_char("(Z)V");
270         utf_byte__void                 = utf_new_char("(B)V");
271         utf_char__void                 = utf_new_char("(C)V");
272         utf_short__void                = utf_new_char("(S)V");
273         utf_int__void                  = utf_new_char("(I)V");
274         utf_long__void                 = utf_new_char("(J)V");
275         utf_float__void                = utf_new_char("(F)V");
276         utf_double__void               = utf_new_char("(D)V");
277         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
278         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
279
280         utf_void__java_lang_ClassLoader =
281                 utf_new_char("()Ljava/lang/ClassLoader;");
282
283         utf_java_lang_Object__java_lang_Object =
284                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
285
286         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
287
288         utf_java_lang_String__java_lang_Class =
289                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
290
291         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
292
293         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
294
295         array_packagename              = utf_new_char("\t<the array package>");
296
297         /* everything's ok */
298
299         return true;
300 }
301
302
303 /* utf_hashkey *****************************************************************
304
305    The hashkey is computed from the utf-text by using up to 8
306    characters.  For utf-symbols longer than 15 characters 3 characters
307    are taken from the beginning and the end, 2 characters are taken
308    from the middle.
309
310 *******************************************************************************/
311
312 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
313 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
314
315 u4 utf_hashkey(const char *text, u4 length)
316 {
317         const char *start_pos = text;       /* pointer to utf text                */
318         u4 a;
319
320         switch (length) {
321         case 0: /* empty string */
322                 return 0;
323
324         case 1: return fbs(0);
325         case 2: return fbs(0) ^ nbs(3);
326         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
327         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
328         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
329         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
330         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
331         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
332
333         case 9:
334                 a = fbs(0);
335                 a ^= nbs(1);
336                 a ^= nbs(2);
337                 text++;
338                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
339
340         case 10:
341                 a = fbs(0);
342                 text++;
343                 a ^= nbs(2);
344                 a ^= nbs(3);
345                 a ^= nbs(4);
346                 text++;
347                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
348
349         case 11:
350                 a = fbs(0);
351                 text++;
352                 a ^= nbs(2);
353                 a ^= nbs(3);
354                 a ^= nbs(4);
355                 text++;
356                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
357
358         case 12:
359                 a = fbs(0);
360                 text += 2;
361                 a ^= nbs(2);
362                 a ^= nbs(3);
363                 text++;
364                 a ^= nbs(5);
365                 a ^= nbs(6);
366                 a ^= nbs(7);
367                 text++;
368                 return a ^ nbs(9) ^ nbs(10);
369
370         case 13:
371                 a = fbs(0);
372                 a ^= nbs(1);
373                 text++;
374                 a ^= nbs(3);
375                 a ^= nbs(4);
376                 text += 2;      
377                 a ^= nbs(7);
378                 a ^= nbs(8);
379                 text += 2;
380                 return a ^ nbs(9) ^ nbs(10);
381
382         case 14:
383                 a = fbs(0);
384                 text += 2;      
385                 a ^= nbs(3);
386                 a ^= nbs(4);
387                 text += 2;      
388                 a ^= nbs(7);
389                 a ^= nbs(8);
390                 text += 2;
391                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
392
393         case 15:
394                 a = fbs(0);
395                 text += 2;      
396                 a ^= nbs(3);
397                 a ^= nbs(4);
398                 text += 2;      
399                 a ^= nbs(7);
400                 a ^= nbs(8);
401                 text += 2;
402                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
403
404         default:  /* 3 characters from beginning */
405                 a = fbs(0);
406                 text += 2;
407                 a ^= nbs(3);
408                 a ^= nbs(4);
409
410                 /* 2 characters from middle */
411                 text = start_pos + (length / 2);
412                 a ^= fbs(5);
413                 text += 2;
414                 a ^= nbs(6);    
415
416                 /* 3 characters from end */
417                 text = start_pos + length - 4;
418
419                 a ^= fbs(7);
420                 text++;
421
422                 return a ^ nbs(10) ^ nbs(11);
423     }
424 }
425
426
427 /* utf_hashkey *****************************************************************
428
429    Compute the hashkey of a unicode string.
430
431 *******************************************************************************/
432
433 u4 unicode_hashkey(u2 *text, u2 len)
434 {
435         return utf_hashkey((char *) text, len);
436 }
437
438
439 /* utf_new *********************************************************************
440
441    Creates a new utf-symbol, the text of the symbol is passed as a
442    u1-array. The function searches the utf-hashtable for a utf-symbol
443    with this text. On success the element returned, otherwise a new
444    hashtable element is created.
445
446    If the number of entries in the hashtable exceeds twice the size of
447    the hashtable slots a reorganization of the hashtable is done and
448    the utf symbols are copied to a new hashtable with doubled size.
449
450 *******************************************************************************/
451
452 utf *utf_new(const char *text, u2 length)
453 {
454         u4 key;                             /* hashkey computed from utf-text     */
455         u4 slot;                            /* slot in hashtable                  */
456         utf *u;                             /* hashtable element                  */
457         u2 i;
458
459         /* XXX REMOVE ME! after testing of course ;-) */
460         static int running = 0;
461         /* XXX REMOVE ME! */
462
463 #if defined(USE_THREADS)
464         builtin_monitorenter(lock_utf_hashtable);
465 #endif
466
467         /* XXX REMOVE ME! after testing of course ;-) */
468         assert(running == 0);
469         running = 1;
470         /* XXX REMOVE ME! */
471
472 #ifdef STATISTICS
473         if (opt_stat)
474                 count_utf_new++;
475 #endif
476
477         key  = utf_hashkey(text, length);
478         slot = key & (utf_hash.size - 1);
479         u    = utf_hash.ptr[slot];
480
481         /* search external hash chain for utf-symbol */
482
483         while (u) {
484                 if (u->blength == length) {
485                         /* compare text of hashtable elements */
486
487                         for (i = 0; i < length; i++)
488                                 if (text[i] != u->text[i])
489                                         goto nomatch;
490                         
491 #if defined(STATISTICS)
492                         if (opt_stat)
493                                 count_utf_new_found++;
494 #endif
495
496                         /* symbol found in hashtable */
497
498                         /* XXX REMOVE ME! */
499                         running = 0;
500                         /* XXX REMOVE ME! */
501
502 #if defined(USE_THREADS)
503                         builtin_monitorexit(lock_utf_hashtable);
504 #endif
505
506                         return u;
507                 }
508
509         nomatch:
510                 u = u->hashlink; /* next element in external chain */
511         }
512
513 #if defined(STATISTICS)
514         if (opt_stat)
515                 count_utf_len += sizeof(utf) + length + 1;
516 #endif
517
518         /* location in hashtable found, create new utf element */
519         u = NEW(utf);
520         u->blength  = length;               /* length in bytes of utfstring       */
521         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
522         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
523         memcpy(u->text, text, length);      /* copy utf-text                      */
524         u->text[length] = '\0';
525         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
526
527         utf_hash.entries++;                 /* update number of entries           */
528
529         if (utf_hash.entries > (utf_hash.size * 2)) {
530
531         /* reorganization of hashtable, average length of 
532            the external chains is approx. 2                */  
533
534                 u4 i;
535                 utf *u;
536                 hashtable newhash; /* the new hashtable */
537
538                 /* create new hashtable, double the size */
539                 init_hashtable(&newhash, utf_hash.size * 2);
540                 newhash.entries = utf_hash.entries;
541
542 #ifdef STATISTICS
543                 if (opt_stat)
544                         count_utf_len += sizeof(utf*) * utf_hash.size;
545 #endif
546
547                 /* transfer elements to new hashtable */
548                 for (i = 0; i < utf_hash.size; i++) {
549                         u = (utf *) utf_hash.ptr[i];
550                         while (u) {
551                                 utf *nextu = u->hashlink;
552                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
553                                                 
554                                 u->hashlink = (utf *) newhash.ptr[slot];
555                                 newhash.ptr[slot] = u;
556
557                                 /* follow link in external hash chain */
558                                 u = nextu;
559                         }
560                 }
561         
562                 /* dispose old table */
563                 MFREE(utf_hash.ptr, void*, utf_hash.size);
564                 utf_hash = newhash;
565         }
566
567         /* XXX REMOVE ME! */
568         running = 0;
569         /* XXX REMOVE ME! */
570
571 #if defined(USE_THREADS)
572         builtin_monitorexit(lock_utf_hashtable);
573 #endif
574
575         return u;
576 }
577
578
579 /* utf_new_u2 ******************************************************************
580
581    Make utf symbol from u2 array, if isclassname is true '.' is
582    replaced by '/'.
583
584 *******************************************************************************/
585
586 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
587 {
588         char *buffer;                   /* memory buffer for  unicode characters  */
589         char *pos;                      /* pointer to current position in buffer  */
590         u4 left;                        /* unicode characters left                */
591         u4 buflength;                   /* utf length in bytes of the u2 array    */
592         utf *result;                    /* resulting utf-string                   */
593         int i;          
594
595         /* determine utf length in bytes and allocate memory */
596
597         buflength = u2_utflength(unicode_pos, unicode_length); 
598         buffer    = MNEW(char, buflength);
599  
600         left = buflength;
601         pos  = buffer;
602
603         for (i = 0; i++ < unicode_length; unicode_pos++) {
604                 /* next unicode character */
605                 u2 c = *unicode_pos;
606                 
607                 if ((c != 0) && (c < 0x80)) {
608                         /* 1 character */       
609                         left--;
610                 if ((int) left < 0) break;
611                         /* convert classname */
612                         if (isclassname && c == '.')
613                                 *pos++ = '/';
614                         else
615                                 *pos++ = (char) c;
616
617                 } else if (c < 0x800) {             
618                         /* 2 characters */                              
619                 unsigned char high = c >> 6;
620                 unsigned char low  = c & 0x3F;
621                         left = left - 2;
622                 if ((int) left < 0) break;
623                 *pos++ = high | 0xC0; 
624                 *pos++ = low  | 0x80;     
625
626                 } else {         
627                 /* 3 characters */                              
628                 char low  = c & 0x3f;
629                 char mid  = (c >> 6) & 0x3F;
630                 char high = c >> 12;
631                         left = left - 3;
632                 if ((int) left < 0) break;
633                 *pos++ = high | 0xE0; 
634                 *pos++ = mid  | 0x80;  
635                 *pos++ = low  | 0x80;   
636                 }
637         }
638         
639         /* insert utf-string into symbol-table */
640         result = utf_new(buffer,buflength);
641
642         MFREE(buffer, char, buflength);
643
644         return result;
645 }
646
647
648 /* utf_new_char ****************************************************************
649
650    Creates a new utf symbol, the text for this symbol is passed as a
651    c-string ( = char* ).
652
653 *******************************************************************************/
654
655 utf *utf_new_char(const char *text)
656 {
657         return utf_new(text, strlen(text));
658 }
659
660
661 /* utf_new_char_classname ******************************************************
662
663    Creates a new utf symbol, the text for this symbol is passed as a
664    c-string ( = char* ) "." characters are going to be replaced by
665    "/". Since the above function is used often, this is a separte
666    function, instead of an if.
667
668 *******************************************************************************/
669
670 utf *utf_new_char_classname(const char *text)
671 {
672         if (strchr(text, '.')) {
673                 char *txt = strdup(text);
674                 char *end = txt + strlen(txt);
675                 char *c;
676                 utf *tmpRes;
677
678                 for (c = txt; c < end; c++)
679                         if (*c == '.') *c = '/';
680
681                 tmpRes = utf_new(txt, strlen(txt));
682                 FREE(txt, 0);
683
684                 return tmpRes;
685
686         } else
687                 return utf_new(text, strlen(text));
688 }
689
690
691 /* utf_nextu2 ******************************************************************
692
693    Read the next unicode character from the utf string and increment
694    the utf-string pointer accordingly.
695
696 *******************************************************************************/
697
698 u2 utf_nextu2(char **utf_ptr)
699 {
700     /* uncompressed unicode character */
701     u2 unicode_char = 0;
702     /* current position in utf text */  
703     unsigned char *utf = (unsigned char *) (*utf_ptr);
704     /* bytes representing the unicode character */
705     unsigned char ch1, ch2, ch3;
706     /* number of bytes used to represent the unicode character */
707     int len = 0;
708         
709     switch ((ch1 = utf[0]) >> 4) {
710         default: /* 1 byte */
711                 (*utf_ptr)++;
712                 return (u2) ch1;
713         case 0xC: 
714         case 0xD: /* 2 bytes */
715                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
716                         unsigned char high = ch1 & 0x1F;
717                         unsigned char low  = ch2 & 0x3F;
718                         unicode_char = (high << 6) + low;
719                         len = 2;
720                 }
721                 break;
722
723         case 0xE: /* 2 or 3 bytes */
724                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
725                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
726                                 unsigned char low  = ch3 & 0x3f;
727                                 unsigned char mid  = ch2 & 0x3f;
728                                 unsigned char high = ch1 & 0x0f;
729                                 unicode_char = (((high << 6) + mid) << 6) + low;
730                                 len = 3;
731                         } else
732                                 len = 2;                                           
733                 }
734                 break;
735     }
736
737     /* update position in utf-text */
738     *utf_ptr = (char *) (utf + len);
739
740     return unicode_char;
741 }
742
743
744 /* utf_strlen ******************************************************************
745
746    Determine number of unicode characters in the utf string.
747
748 *******************************************************************************/
749
750 u4 utf_strlen(utf *u)
751 {
752         char *endpos;                       /* points behind utf string           */
753         char *utf_ptr;                      /* current position in utf text       */
754         u4 len = 0;                         /* number of unicode characters       */
755
756         if (!u) {
757                 *exceptionptr = new_nullpointerexception();
758                 return 0;
759         }
760
761         endpos = UTF_END(u);
762         utf_ptr = u->text;
763
764         while (utf_ptr < endpos) {
765                 len++;
766                 /* next unicode character */
767                 utf_nextu2(&utf_ptr);
768         }
769
770         if (utf_ptr != endpos)
771                 /* string ended abruptly */
772                 throw_cacao_exception_exit(string_java_lang_InternalError,
773                                                                    "Illegal utf8 string");
774
775         return len;
776 }
777
778
779 /* u2_utflength ****************************************************************
780
781    Returns the utf length in bytes of a u2 array.
782
783 *******************************************************************************/
784
785 u4 u2_utflength(u2 *text, u4 u2_length)
786 {
787         u4 result_len = 0;                  /* utf length in bytes                */
788         u2 ch;                              /* current unicode character          */
789         u4 len;
790         
791         for (len = 0; len < u2_length; len++) {
792                 /* next unicode character */
793                 ch = *text++;
794           
795                 /* determine bytes required to store unicode character as utf */
796                 if (ch && (ch < 0x80)) 
797                         result_len++;
798                 else if (ch < 0x800)
799                         result_len += 2;        
800                 else 
801                         result_len += 3;        
802         }
803
804     return result_len;
805 }
806
807
808 /* utf_display *****************************************************************
809
810    Write utf symbol to stdout (for debugging purposes).
811
812 *******************************************************************************/
813
814 void utf_display(utf *u)
815 {
816         char *endpos;                       /* points behind utf string           */
817         char *utf_ptr;                      /* current position in utf text       */
818
819         if (!u) {
820                 printf("NULL");
821                 fflush(stdout);
822                 return;
823         }
824
825         endpos = UTF_END(u);
826         utf_ptr = u->text;
827
828         while (utf_ptr < endpos) {
829                 /* read next unicode character */                
830                 u2 c = utf_nextu2(&utf_ptr);
831                 if (c >= 32 && c <= 127) printf("%c", c);
832                 else printf("?");
833         }
834
835         fflush(stdout);
836 }
837
838
839 /* utf_display_classname *******************************************************
840
841    Write utf symbol to stdout with `/' converted to `.' (for debugging
842    purposes).
843
844 *******************************************************************************/
845
846 void utf_display_classname(utf *u)
847 {
848         char *endpos;                       /* points behind utf string           */
849         char *utf_ptr;                      /* current position in utf text       */
850
851         if (!u) {
852                 printf("NULL");
853                 fflush(stdout);
854                 return;
855         }
856
857         endpos = UTF_END(u);
858         utf_ptr = u->text;
859
860         while (utf_ptr < endpos) {
861                 /* read next unicode character */                
862                 u2 c = utf_nextu2(&utf_ptr);
863                 if (c == '/') c = '.';
864                 if (c >= 32 && c <= 127) printf("%c", c);
865                 else printf("?");
866         }
867
868         fflush(stdout);
869 }
870
871
872 /* utf_sprint ******************************************************************
873         
874    Write utf symbol into c-string (for debugging purposes).
875
876 *******************************************************************************/
877
878 void utf_sprint(char *buffer, utf *u)
879 {
880         char *endpos;                       /* points behind utf string           */
881         char *utf_ptr;                      /* current position in utf text       */
882         u2 pos = 0;                         /* position in c-string               */
883
884         if (!u) {
885                 strcpy(buffer, "NULL");
886                 return;
887         }
888
889         endpos = UTF_END(u);
890         utf_ptr = u->text;
891
892         while (utf_ptr < endpos) 
893                 /* copy next unicode character */       
894                 buffer[pos++] = utf_nextu2(&utf_ptr);
895
896         /* terminate string */
897         buffer[pos] = '\0';
898 }
899
900
901 /* utf_sprint_classname ********************************************************
902         
903    Write utf symbol into c-string with `/' converted to `.' (for debugging
904    purposes).
905
906 *******************************************************************************/
907
908 void utf_sprint_classname(char *buffer, utf *u)
909 {
910         char *endpos;                       /* points behind utf string           */
911         char *utf_ptr;                      /* current position in utf text       */
912         u2 pos = 0;                         /* position in c-string               */
913
914         if (!u) {
915                 strcpy(buffer, "NULL");
916                 return;
917         }
918
919         endpos = UTF_END(u);
920         utf_ptr = u->text;
921
922         while (utf_ptr < endpos) {
923                 /* copy next unicode character */       
924                 u2 c = utf_nextu2(&utf_ptr);
925                 if (c == '/') c = '.';
926                 buffer[pos++] = c;
927         }
928
929         /* terminate string */
930         buffer[pos] = '\0';
931 }
932
933
934 /* utf_strcat ******************************************************************
935         
936    Like libc strcat, but uses an utf8 string.
937
938 *******************************************************************************/
939
940 void utf_strcat(char *buffer, utf *u)
941 {
942         utf_sprint(buffer + strlen(buffer), u);
943 }
944
945
946 /* utf_strcat_classname ********************************************************
947         
948    Like libc strcat, but uses an utf8 string.
949
950 *******************************************************************************/
951
952 void utf_strcat_classname(char *buffer, utf *u)
953 {
954         utf_sprint_classname(buffer + strlen(buffer), u);
955 }
956
957
958 /* utf_fprint ******************************************************************
959         
960    Write utf symbol into file.
961
962 *******************************************************************************/
963
964 void utf_fprint(FILE *file, utf *u)
965 {
966         char *endpos;                       /* points behind utf string           */
967         char *utf_ptr;                      /* current position in utf text       */
968
969         if (!u)
970                 return;
971
972         endpos = UTF_END(u);
973         utf_ptr = u->text;
974
975         while (utf_ptr < endpos) { 
976                 /* read next unicode character */                
977                 u2 c = utf_nextu2(&utf_ptr);                            
978
979                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
980                 else fprintf(file, "?");
981         }
982 }
983
984
985 /* utf_fprint_classname ********************************************************
986         
987    Write utf symbol into file with `/' converted to `.'.
988
989 *******************************************************************************/
990
991 void utf_fprint_classname(FILE *file, utf *u)
992 {
993         char *endpos;                       /* points behind utf string           */
994         char *utf_ptr;                      /* current position in utf text       */
995
996     if (!u)
997                 return;
998
999         endpos = UTF_END(u);
1000         utf_ptr = u->text;
1001
1002         while (utf_ptr < endpos) { 
1003                 /* read next unicode character */                
1004                 u2 c = utf_nextu2(&utf_ptr);                            
1005                 if (c == '/') c = '.';
1006
1007                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1008                 else fprintf(file, "?");
1009         }
1010 }
1011
1012
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string in the sense
   that is accepted for Java class files.

   utf_ptr...points to first character
   end_pos...points after last character

   Accepted are the bytes 0x01-0x7f and complete two- and three-byte
   sequences. Rejected are a plain 0x00 byte, sequences with more than
   three bytes, stray or missing continuation bytes and a three-byte
   encoding of the value zero. The two-byte encoding of zero is
   accepted. Other overlong encodings, surrogates and the values
   0xfffe/0xffff are deliberately not rejected, as Sun Java seems to
   allow them as well.

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int           remaining;            /* bytes that are not yet consumed    */
	int           extra;                /* continuation bytes of the sequence */
	int           k;
	unsigned char byte;                 /* byte currently inspected           */
	unsigned long value;                /* decoded value of the sequence      */

	if (end_pos < utf_ptr)
		return false;

	remaining = end_pos - utf_ptr;

	while (remaining > 0) {
		byte = (unsigned char) *utf_ptr++;
		remaining--;

		/* 0x00 is not allowed */

		if (byte == 0x00)
			return false;

		/* plain ASCII needs no further checks */

		if (byte < 0x80)
			continue;

		/* Only 110x xxxx and 1110 xxxx are usable leading bytes: longer
		   sequences are not allowed in Java and everything else is no
		   leading byte at all. */

		if ((byte & 0xe0) == 0xc0)
			extra = 1;
		else if ((byte & 0xf0) == 0xe0)
			extra = 2;
		else
			return false;

		/* payload bits of the leading byte */

		value = byte & (0x3f >> extra);

		/* missing bytes */

		if (remaining < extra)
			return false;

		remaining -= extra;

		for (k = 0; k < extra; k++) {
			byte = (unsigned char) *utf_ptr++;

			/* continuation bytes look like 10xx xxxx */

			if ((byte & 0xc0) != 0x80)
				return false;

			value = (value << 6) | (byte & 0x3f);
		}

		/* Java special: zero must be encoded with exactly two bytes */

		if (value == 0 && extra != 1)
			return false;
	}

	return true;
}
1078
1079
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the encoded zero character 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

   Only bytes inside of [utf_ptr, end_pos) are inspected. In particular
   the byte following a 0xc0 is only looked at if it still belongs to
   the string, so a string that does not fulfil the assumption above
   can not cause a read behind end_pos.

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	/* disallow empty names */

	if (end_pos <= utf_ptr)
		return false;

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		/* disallow control characters */

		if (c < 0x20)
			return false;

		/* Disallow zero, which is encoded as 0xc0 0x80. utf_ptr already
		   points to the following byte; check that it is part of the
		   string before reading it. */

		if (c == 0xc0 && utf_ptr < end_pos && (unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
1107
1108 bool is_valid_name_utf(utf *u)
1109 {
1110         return is_valid_name(u->text, UTF_END(u));
1111 }
1112
1113
1114 /* utf_show ********************************************************************
1115
1116    Writes the utf symbols in the utfhash to stdout and displays the
1117    number of external hash chains grouped according to the chainlength
1118    (for debugging purposes).
1119
1120 *******************************************************************************/
1121
1122 void utf_show(void)
1123 {
1124
1125 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1126
1127         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1128         u4 max_chainlength = 0;      /* maximum length of the chains */
1129         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1130         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1131         u4 i;
1132
1133         printf ("UTF-HASH:\n");
1134
1135         /* show element of utf-hashtable */
1136         for (i=0; i<utf_hash.size; i++) {
1137                 utf *u = utf_hash.ptr[i];
1138                 if (u) {
1139                         printf ("SLOT %d: ", (int) i);
1140                         while (u) {
1141                                 printf ("'");
1142                                 utf_display (u);
1143                                 printf ("' ");
1144                                 u = u->hashlink;
1145                         }       
1146                         printf ("\n");
1147                 }
1148                 
1149         }
1150
1151         printf ("UTF-HASH: %d slots for %d entries\n", 
1152                         (int) utf_hash.size, (int) utf_hash.entries );
1153
1154
1155         if (utf_hash.entries == 0)
1156                 return;
1157
1158         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1159
1160         for (i=0;i<CHAIN_LIMIT;i++)
1161                 chain_count[i]=0;
1162
1163         /* count numbers of hashchains according to their length */
1164         for (i=0; i<utf_hash.size; i++) {
1165                   
1166                 utf *u = (utf*) utf_hash.ptr[i];
1167                 u4 chain_length = 0;
1168
1169                 /* determine chainlength */
1170                 while (u) {
1171                         u = u->hashlink;
1172                         chain_length++;
1173                 }
1174
1175                 /* update sum of all chainlengths */
1176                 sum_chainlength+=chain_length;
1177
1178                 /* determine the maximum length of the chains */
1179                 if (chain_length>max_chainlength)
1180                         max_chainlength = chain_length;
1181
1182                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1183                 if (chain_length>=CHAIN_LIMIT) {
1184                         beyond_limit+=chain_length;
1185                         chain_length=CHAIN_LIMIT-1;
1186                 }
1187
1188                 /* update number of hashchains of current length */
1189                 chain_count[chain_length]++;
1190         }
1191
1192         /* display results */  
1193         for (i=1;i<CHAIN_LIMIT-1;i++) 
1194                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1195           
1196         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1197
1198
1199         printf("max. chainlength:%5d\n",max_chainlength);
1200
1201         /* avg. chainlength = sum of chainlengths / number of chains */
1202         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1203 }
1204
1205
1206 /*
1207  * These are local overrides for various environment variables in Emacs.
1208  * Please do not remove this and leave it at the end of the file, where
1209  * Emacs will automagically detect them.
1210  * ---------------------------------------------------------------------
1211  * Local variables:
1212  * mode: c
1213  * indent-tabs-mode: t
1214  * c-basic-offset: 4
1215  * tab-width: 4
1216  * End:
1217  */