* src/vm/utf8.c: Removed superfluous comment.
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf8.c - utf8 functions
2
3    Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6    J. Wenninger, Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23    02110-1301, USA.
24
25    Contact: cacao@cacaojvm.org
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 4689 2006-03-27 11:15:44Z twisti $
34
35 */
36
37
38 #include <string.h>
39 #include <assert.h>
40
41 #include "config.h"
42 #include "vm/types.h"
43
44 #include "mm/memory.h"
45
46 #if defined(USE_THREADS)
47 # if defined(NATIVE_THREADS)
48 #  include "threads/native/threads.h"
49 # else
50 #  include "threads/green/threads.h"
51 # endif
52 #endif
53
54 #include "vm/builtin.h"
55 #include "vm/exceptions.h"
56 #include "vm/hashtable.h"
57 #include "vm/options.h"
58 #include "vm/statistics.h"
59 #include "vm/stringlocal.h"
60 #include "vm/utf8.h"
61
62 /* global variables ***********************************************************/
63
64 /* hashsize must be power of 2 */
65
66 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
67
68 hashtable hashtable_utf;                /* hashtable for utf8-symbols         */
69
70 #if defined(USE_THREADS)
71 static java_objectheader *lock_hashtable_utf;
72 #endif
73
74
75 /* utf-symbols for pointer comparison of frequently used strings **************/
76
77 utf *utf_java_lang_Object;
78
79 utf *utf_java_lang_Class;
80 utf *utf_java_lang_ClassLoader;
81 utf *utf_java_lang_Cloneable;
82 utf *utf_java_lang_SecurityManager;
83 utf *utf_java_lang_String;
84 utf *utf_java_lang_System;
85 utf *utf_java_lang_ThreadGroup;
86 utf *utf_java_io_Serializable;
87
88 utf *utf_java_lang_Throwable;
89 utf *utf_java_lang_VMThrowable;
90 utf *utf_java_lang_Error;
91 utf *utf_java_lang_NoClassDefFoundError;
92 utf *utf_java_lang_LinkageError;
93 utf *utf_java_lang_NoSuchMethodError;
94 utf *utf_java_lang_OutOfMemoryError;
95
96 utf *utf_java_lang_Exception;
97 utf *utf_java_lang_ClassNotFoundException;
98 utf *utf_java_lang_IllegalArgumentException;
99 utf *utf_java_lang_IllegalMonitorStateException;
100
101 utf *utf_java_lang_NullPointerException;
102
103 utf* utf_java_lang_Void;
104 utf* utf_java_lang_Boolean;
105 utf* utf_java_lang_Byte;
106 utf* utf_java_lang_Character;
107 utf* utf_java_lang_Short;
108 utf* utf_java_lang_Integer;
109 utf* utf_java_lang_Long;
110 utf* utf_java_lang_Float;
111 utf* utf_java_lang_Double;
112
113 utf *utf_java_lang_StackTraceElement;
114 utf *utf_java_lang_reflect_Constructor;
115 utf *utf_java_lang_reflect_Field;
116 utf *utf_java_lang_reflect_Method;
117 utf *utf_java_util_Vector;
118
119 utf *utf_InnerClasses;                  /* InnerClasses                       */
120 utf *utf_ConstantValue;                 /* ConstantValue                      */
121 utf *utf_Code;                          /* Code                               */
122 utf *utf_Exceptions;                    /* Exceptions                         */
123 utf *utf_LineNumberTable;               /* LineNumberTable                    */
124 utf *utf_SourceFile;                    /* SourceFile                         */
125
126 utf *utf_init;                          /* <init>                             */
127 utf *utf_clinit;                        /* <clinit>                           */
128 utf *utf_clone;                         /* clone                              */
129 utf *utf_finalize;                      /* finalize                           */
130 utf *utf_run;                           /* run                                */
131
132 utf *utf_add;                           /* add                                */
133 utf *utf_remove;                        /* remove                             */
134 utf *utf_put;                           /* put                                */
135 utf *utf_get;                           /* get                                */
136 utf *utf_value;                         /* value                              */
137
138 utf *utf_fillInStackTrace;
139 utf *utf_getSystemClassLoader;
140 utf *utf_loadClass;
141 utf *utf_printStackTrace;
142
143 utf *utf_Z;                             /* Z                                  */
144 utf *utf_B;                             /* B                                  */
145 utf *utf_C;                             /* C                                  */
146 utf *utf_S;                             /* S                                  */
147 utf *utf_I;                             /* I                                  */
148 utf *utf_J;                             /* J                                  */
149 utf *utf_F;                             /* F                                  */
150 utf *utf_D;                             /* D                                  */
151
152 utf *utf_void__void;                    /* ()V                                */
153 utf *utf_boolean__void;                 /* (Z)V                               */
154 utf *utf_byte__void;                    /* (B)V                               */
155 utf *utf_char__void;                    /* (C)V                               */
156 utf *utf_short__void;                   /* (S)V                               */
157 utf *utf_int__void;                     /* (I)V                               */
158 utf *utf_long__void;                    /* (J)V                               */
159 utf *utf_float__void;                   /* (F)V                               */
160 utf *utf_double__void;                  /* (D)V                               */
161
162 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
163 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
164 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
165 utf *utf_java_lang_Object__java_lang_Object;
166 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
167 utf *utf_java_lang_String__java_lang_Class;
168 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
169
170 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
171
172 utf *array_packagename;
173
174
175 /* utf_init ********************************************************************
176
177    Initializes the utf8 subsystem.
178
179 *******************************************************************************/
180
181 bool utf8_init(void)
182 {
183         /* create utf8 hashtable */
184
185         hashtable_create(&hashtable_utf, HASHTABLE_UTF_SIZE);
186
187 #if defined(ENABLE_STATISTICS)
188         if (opt_stat)
189                 count_utf_len += sizeof(utf*) * hashtable_utf.size;
190 #endif
191
192 #if defined(USE_THREADS)
193         /* create utf hashtable lock object */
194
195         lock_hashtable_utf = NEW(java_objectheader);
196
197 # if defined(NATIVE_THREADS)
198         initObjectLock(lock_hashtable_utf);
199 # endif
200 #endif
201
202         /* create utf-symbols for pointer comparison of frequently used strings */
203
204         utf_java_lang_Object           = utf_new_char("java/lang/Object");
205
206         utf_java_lang_Class            = utf_new_char("java/lang/Class");
207         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
208         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
209         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
210         utf_java_lang_String           = utf_new_char("java/lang/String");
211         utf_java_lang_System           = utf_new_char("java/lang/System");
212         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
213         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
214
215         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
216         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
217         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
218
219         utf_java_lang_NoClassDefFoundError =
220                 utf_new_char(string_java_lang_NoClassDefFoundError);
221
222         utf_java_lang_LinkageError =
223                 utf_new_char(string_java_lang_LinkageError);
224
225         utf_java_lang_NoSuchMethodError =
226                 utf_new_char(string_java_lang_NoSuchMethodError);
227
228         utf_java_lang_OutOfMemoryError =
229                 utf_new_char(string_java_lang_OutOfMemoryError);
230
231         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
232
233         utf_java_lang_ClassNotFoundException =
234                 utf_new_char(string_java_lang_ClassNotFoundException);
235
236         utf_java_lang_IllegalArgumentException =
237                 utf_new_char(string_java_lang_IllegalArgumentException);
238
239         utf_java_lang_IllegalMonitorStateException =
240                 utf_new_char(string_java_lang_IllegalMonitorStateException);
241
242         utf_java_lang_NullPointerException =
243                 utf_new_char(string_java_lang_NullPointerException);
244
245         utf_java_lang_Void             = utf_new_char("java/lang/Void");
246         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
247         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
248         utf_java_lang_Character        = utf_new_char("java/lang/Character");
249         utf_java_lang_Short            = utf_new_char("java/lang/Short");
250         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
251         utf_java_lang_Long             = utf_new_char("java/lang/Long");
252         utf_java_lang_Float            = utf_new_char("java/lang/Float");
253         utf_java_lang_Double           = utf_new_char("java/lang/Double");
254
255         utf_java_lang_StackTraceElement =
256                 utf_new_char("java/lang/StackTraceElement");
257
258         utf_java_lang_reflect_Constructor =
259                 utf_new_char("java/lang/reflect/Constructor");
260
261         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
262         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
263         utf_java_util_Vector           = utf_new_char("java/util/Vector");
264
265         utf_InnerClasses               = utf_new_char("InnerClasses");
266         utf_ConstantValue              = utf_new_char("ConstantValue");
267         utf_Code                       = utf_new_char("Code");
268         utf_Exceptions                 = utf_new_char("Exceptions");
269         utf_LineNumberTable            = utf_new_char("LineNumberTable");
270         utf_SourceFile                 = utf_new_char("SourceFile");
271
272         utf_init                           = utf_new_char("<init>");
273         utf_clinit                         = utf_new_char("<clinit>");
274         utf_clone                      = utf_new_char("clone");
275         utf_finalize                   = utf_new_char("finalize");
276         utf_run                        = utf_new_char("run");
277
278         utf_add                        = utf_new_char("add");
279         utf_remove                     = utf_new_char("remove");
280         utf_put                        = utf_new_char("put");
281         utf_get                        = utf_new_char("get");
282         utf_value                      = utf_new_char("value");
283
284         utf_printStackTrace            = utf_new_char("printStackTrace");
285         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
286         utf_loadClass                  = utf_new_char("loadClass");
287         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
288
289         utf_Z                          = utf_new_char("Z");
290         utf_B                          = utf_new_char("B");
291         utf_C                          = utf_new_char("C");
292         utf_S                          = utf_new_char("S");
293         utf_I                          = utf_new_char("I");
294         utf_J                          = utf_new_char("J");
295         utf_F                          = utf_new_char("F");
296         utf_D                          = utf_new_char("D");
297
298         utf_void__void                 = utf_new_char("()V");
299         utf_boolean__void              = utf_new_char("(Z)V");
300         utf_byte__void                 = utf_new_char("(B)V");
301         utf_char__void                 = utf_new_char("(C)V");
302         utf_short__void                = utf_new_char("(S)V");
303         utf_int__void                  = utf_new_char("(I)V");
304         utf_long__void                 = utf_new_char("(J)V");
305         utf_float__void                = utf_new_char("(F)V");
306         utf_double__void               = utf_new_char("(D)V");
307         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
308         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
309
310         utf_void__java_lang_ClassLoader =
311                 utf_new_char("()Ljava/lang/ClassLoader;");
312
313         utf_java_lang_Object__java_lang_Object =
314                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
315
316         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
317
318         utf_java_lang_String__java_lang_Class =
319                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
320
321         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
322
323         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
324
325         array_packagename              = utf_new_char("\t<the array package>");
326
327         /* everything's ok */
328
329         return true;
330 }
331
332
/* utf_hashkey *****************************************************************

   The hashkey is computed from the utf-text by using up to 8
   characters.  For utf-symbols longer than 15 characters 3 characters
   are taken from the beginning and the end, 2 characters are taken
   from the middle.

   text ..... bytes to hash (need not be NUL-terminated)
   length ... number of bytes in text

   Returns 0 for the empty string, otherwise the XOR of the selected
   bytes, each shifted left by a position-dependent amount.

   Every byte is addressed by an explicit index.  Earlier revisions
   combined several nbs()/fbs() macros in a single expression, which
   modifies `text' more than once without an intervening sequence
   point; that is undefined behavior and made the key depend on the
   compiler's evaluation order.  The indices below correspond to a
   strict left-to-right reading of the old expressions.

*******************************************************************************/

/* NOTE: nbs() modifies `text'.  Never use more than one nbs(), or an nbs()
   together with an fbs(), within the same expression.  The macros are not
   used by utf_hashkey() anymore and are only kept for other code that may
   reference them. */

#define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
#define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */

/* byte `idx' of text, left shifted by `val'; the byte is converted from
   (possibly signed) char to u4 exactly as the macros above do */

#define UTF_HASH_BYTE(idx, val) ((u4) text[(idx)] << (val))

u4 utf_hashkey(const char *text, u4 length)
{
	u4 mid;                             /* index of first byte from the middle*/
	u4 end;                             /* index of first byte from the end   */

	switch (length) {
	case 0: /* empty string */
		return 0;

	case 1:
		return UTF_HASH_BYTE(0, 0);

	case 2:
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 3);

	case 3:
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 3) ^ UTF_HASH_BYTE(2, 5);

	case 4:
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 2) ^ UTF_HASH_BYTE(2, 4) ^
			UTF_HASH_BYTE(3, 6);

	case 5:
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 2) ^ UTF_HASH_BYTE(2, 3) ^
			UTF_HASH_BYTE(3, 4) ^ UTF_HASH_BYTE(4, 6);

	case 6:
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 1) ^ UTF_HASH_BYTE(2, 2) ^
			UTF_HASH_BYTE(3, 3) ^ UTF_HASH_BYTE(4, 5) ^ UTF_HASH_BYTE(5, 6);

	case 7:
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 1) ^ UTF_HASH_BYTE(2, 2) ^
			UTF_HASH_BYTE(3, 3) ^ UTF_HASH_BYTE(4, 4) ^ UTF_HASH_BYTE(5, 5) ^
			UTF_HASH_BYTE(6, 6);

	case 8:
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 1) ^ UTF_HASH_BYTE(2, 2) ^
			UTF_HASH_BYTE(3, 3) ^ UTF_HASH_BYTE(4, 4) ^ UTF_HASH_BYTE(5, 5) ^
			UTF_HASH_BYTE(6, 6) ^ UTF_HASH_BYTE(7, 7);

	case 9: /* byte 3 is skipped */
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 1) ^ UTF_HASH_BYTE(2, 2) ^
			UTF_HASH_BYTE(4, 4) ^ UTF_HASH_BYTE(5, 5) ^ UTF_HASH_BYTE(6, 6) ^
			UTF_HASH_BYTE(7, 7) ^ UTF_HASH_BYTE(8, 8);

	case 10: /* bytes 1 and 5 are skipped */
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(2, 2) ^ UTF_HASH_BYTE(3, 3) ^
			UTF_HASH_BYTE(4, 4) ^ UTF_HASH_BYTE(6, 6) ^ UTF_HASH_BYTE(7, 7) ^
			UTF_HASH_BYTE(8, 8) ^ UTF_HASH_BYTE(9, 9);

	case 11: /* bytes 1 and 5 are skipped */
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(2, 2) ^ UTF_HASH_BYTE(3, 3) ^
			UTF_HASH_BYTE(4, 4) ^ UTF_HASH_BYTE(6, 6) ^ UTF_HASH_BYTE(7, 7) ^
			UTF_HASH_BYTE(8, 8) ^ UTF_HASH_BYTE(9, 9) ^ UTF_HASH_BYTE(10, 10);

	case 12:
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(3, 2) ^ UTF_HASH_BYTE(4, 3) ^
			UTF_HASH_BYTE(6, 5) ^ UTF_HASH_BYTE(7, 6) ^ UTF_HASH_BYTE(8, 7) ^
			UTF_HASH_BYTE(10, 9) ^ UTF_HASH_BYTE(11, 10);

	case 13:
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 1) ^ UTF_HASH_BYTE(3, 3) ^
			UTF_HASH_BYTE(4, 4) ^ UTF_HASH_BYTE(7, 7) ^ UTF_HASH_BYTE(8, 8) ^
			UTF_HASH_BYTE(11, 9) ^ UTF_HASH_BYTE(12, 10);

	case 14:
	case 15: /* both lengths select the same byte positions */
		return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(3, 3) ^ UTF_HASH_BYTE(4, 4) ^
			UTF_HASH_BYTE(7, 7) ^ UTF_HASH_BYTE(8, 8) ^ UTF_HASH_BYTE(11, 9) ^
			UTF_HASH_BYTE(12, 10) ^ UTF_HASH_BYTE(13, 11);

	default:
		mid = length / 2;
		end = length - 4;

		return
			/* 3 characters from beginning */
			UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(3, 3) ^ UTF_HASH_BYTE(4, 4) ^

			/* 2 characters from middle */
			UTF_HASH_BYTE(mid, 5) ^ UTF_HASH_BYTE(mid + 3, 6) ^

			/* 3 characters from end */
			UTF_HASH_BYTE(end, 7) ^ UTF_HASH_BYTE(end + 2, 10) ^
			UTF_HASH_BYTE(end + 3, 11);
	}
}

#undef UTF_HASH_BYTE
455
/* utf_full_hashkey ************************************************************

   Computes a hash value over every byte of the string, using Bob
   Jenkins' "One-at-a-time" algorithm
   (http://burtleburtle.net/bob/hash/doobs.html).

   text ..... bytes to hash, read as unsigned values
   length ... number of bytes in text

   Returns the 32-bit hash; the empty string hashes to 0.

*******************************************************************************/

u4 utf_full_hashkey(const char *text, u4 length)
{
	const unsigned char *bytes = (const unsigned char *) text;
	u4 acc = 0;
	u4 idx;

	/* mixing step, once per input byte */

	for (idx = 0; idx < length; idx++) {
		acc += bytes[idx];
		acc += acc << 10;
		acc ^= acc >> 6;
	}

	/* final avalanche */

	acc += acc << 3;
	acc ^= acc >> 11;
	acc += acc << 15;

	return acc;
}
484
485 /* unicode_hashkey *************************************************************
486
487    Compute the hashkey of a unicode string.
488
489 *******************************************************************************/
490
491 u4 unicode_hashkey(u2 *text, u2 len)
492 {
493         return utf_hashkey((char *) text, len);
494 }
495
496
497 /* utf_new *********************************************************************
498
499    Creates a new utf-symbol, the text of the symbol is passed as a
500    u1-array. The function searches the utf-hashtable for a utf-symbol
501    with this text. On success the element returned, otherwise a new
502    hashtable element is created.
503
504    If the number of entries in the hashtable exceeds twice the size of
505    the hashtable slots a reorganization of the hashtable is done and
506    the utf symbols are copied to a new hashtable with doubled size.
507
508 *******************************************************************************/
509
510 utf *utf_new(const char *text, u2 length)
511 {
512         u4 key;                             /* hashkey computed from utf-text     */
513         u4 slot;                            /* slot in hashtable                  */
514         utf *u;                             /* hashtable element                  */
515         u2 i;
516
517 #if defined(USE_THREADS)
518         builtin_monitorenter(lock_hashtable_utf);
519 #endif
520
521 #if defined(ENABLE_STATISTICS)
522         if (opt_stat)
523                 count_utf_new++;
524 #endif
525
526         key  = utf_hashkey(text, length);
527         slot = key & (hashtable_utf.size - 1);
528         u    = hashtable_utf.ptr[slot];
529
530         /* search external hash chain for utf-symbol */
531
532         while (u) {
533                 if (u->blength == length) {
534                         /* compare text of hashtable elements */
535
536                         for (i = 0; i < length; i++)
537                                 if (text[i] != u->text[i])
538                                         goto nomatch;
539                         
540 #if defined(ENABLE_STATISTICS)
541                         if (opt_stat)
542                                 count_utf_new_found++;
543 #endif
544
545                         /* symbol found in hashtable */
546
547 #if defined(USE_THREADS)
548                         builtin_monitorexit(lock_hashtable_utf);
549 #endif
550
551                         return u;
552                 }
553
554         nomatch:
555                 u = u->hashlink; /* next element in external chain */
556         }
557
558 #if defined(ENABLE_STATISTICS)
559         if (opt_stat)
560                 count_utf_len += sizeof(utf) + length + 1;
561 #endif
562
563         /* location in hashtable found, create new utf element */
564         u = NEW(utf);
565         u->blength  = length;               /* length in bytes of utfstring       */
566         u->hashlink = hashtable_utf.ptr[slot]; /* link in external hashchain      */
567         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
568
569         memcpy(u->text, text, length);      /* copy utf-text                      */
570         u->text[length] = '\0';
571
572         hashtable_utf.ptr[slot] = u;        /* insert symbol into table           */
573         hashtable_utf.entries++;            /* update number of entries           */
574
575         if (hashtable_utf.entries > (hashtable_utf.size * 2)) {
576
577         /* reorganization of hashtable, average length of the external
578            chains is approx. 2 */
579
580                 hashtable  newhash;                              /* the new hashtable */
581                 u4         i;
582                 utf       *u;
583                 utf       *nextu;
584                 u4         slot;
585
586                 /* create new hashtable, double the size */
587
588                 hashtable_create(&newhash, hashtable_utf.size * 2);
589                 newhash.entries = hashtable_utf.entries;
590
591 #if defined(ENABLE_STATISTICS)
592                 if (opt_stat)
593                         count_utf_len += sizeof(utf*) * hashtable_utf.size;
594 #endif
595
596                 /* transfer elements to new hashtable */
597
598                 for (i = 0; i < hashtable_utf.size; i++) {
599                         u = hashtable_utf.ptr[i];
600
601                         while (u) {
602                                 nextu = u->hashlink;
603                                 slot  = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
604                                                 
605                                 u->hashlink = (utf *) newhash.ptr[slot];
606                                 newhash.ptr[slot] = u;
607
608                                 /* follow link in external hash chain */
609
610                                 u = nextu;
611                         }
612                 }
613         
614                 /* dispose old table */
615
616                 MFREE(hashtable_utf.ptr, void*, hashtable_utf.size);
617                 hashtable_utf = newhash;
618         }
619
620 #if defined(USE_THREADS)
621         builtin_monitorexit(lock_hashtable_utf);
622 #endif
623
624         return u;
625 }
626
627
628 /* utf_new_u2 ******************************************************************
629
630    Make utf symbol from u2 array, if isclassname is true '.' is
631    replaced by '/'.
632
633 *******************************************************************************/
634
635 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
636 {
637         char *buffer;                   /* memory buffer for  unicode characters  */
638         char *pos;                      /* pointer to current position in buffer  */
639         u4 left;                        /* unicode characters left                */
640         u4 buflength;                   /* utf length in bytes of the u2 array    */
641         utf *result;                    /* resulting utf-string                   */
642         int i;          
643
644         /* determine utf length in bytes and allocate memory */
645
646         buflength = u2_utflength(unicode_pos, unicode_length); 
647         buffer    = MNEW(char, buflength);
648  
649         left = buflength;
650         pos  = buffer;
651
652         for (i = 0; i++ < unicode_length; unicode_pos++) {
653                 /* next unicode character */
654                 u2 c = *unicode_pos;
655                 
656                 if ((c != 0) && (c < 0x80)) {
657                         /* 1 character */       
658                         left--;
659                 if ((int) left < 0) break;
660                         /* convert classname */
661                         if (isclassname && c == '.')
662                                 *pos++ = '/';
663                         else
664                                 *pos++ = (char) c;
665
666                 } else if (c < 0x800) {             
667                         /* 2 characters */                              
668                 unsigned char high = c >> 6;
669                 unsigned char low  = c & 0x3F;
670                         left = left - 2;
671                 if ((int) left < 0) break;
672                 *pos++ = high | 0xC0; 
673                 *pos++ = low  | 0x80;     
674
675                 } else {         
676                 /* 3 characters */                              
677                 char low  = c & 0x3f;
678                 char mid  = (c >> 6) & 0x3F;
679                 char high = c >> 12;
680                         left = left - 3;
681                 if ((int) left < 0) break;
682                 *pos++ = high | 0xE0; 
683                 *pos++ = mid  | 0x80;  
684                 *pos++ = low  | 0x80;   
685                 }
686         }
687         
688         /* insert utf-string into symbol-table */
689         result = utf_new(buffer,buflength);
690
691         MFREE(buffer, char, buflength);
692
693         return result;
694 }
695
696
697 /* utf_new_char ****************************************************************
698
699    Creates a new utf symbol, the text for this symbol is passed as a
700    c-string ( = char* ).
701
702 *******************************************************************************/
703
704 utf *utf_new_char(const char *text)
705 {
706         return utf_new(text, strlen(text));
707 }
708
709
710 /* utf_new_char_classname ******************************************************
711
712    Creates a new utf symbol, the text for this symbol is passed as a
713    c-string ( = char* ) "." characters are going to be replaced by
714    "/". Since the above function is used often, this is a separte
715    function, instead of an if.
716
717 *******************************************************************************/
718
719 utf *utf_new_char_classname(const char *text)
720 {
721         if (strchr(text, '.')) {
722                 char *txt = strdup(text);
723                 char *end = txt + strlen(txt);
724                 char *c;
725                 utf *tmpRes;
726
727                 for (c = txt; c < end; c++)
728                         if (*c == '.') *c = '/';
729
730                 tmpRes = utf_new(txt, strlen(txt));
731                 FREE(txt, 0);
732
733                 return tmpRes;
734
735         } else
736                 return utf_new(text, strlen(text));
737 }
738
739
/* utf_nextu2 ******************************************************************

   Reads the next unicode character from the utf string and moves the
   utf-string pointer behind the bytes that were used.

   The upper 4 bits of the first byte select the form:

     0xC, 0xD ... 2 bytes
     0xE ........ 3 bytes
     others ..... 1 byte, the byte itself is returned

   Bytes following the first one must have the form 10xxxxxx.  If the
   second byte has not, 0 is returned and the pointer is NOT moved.  If
   only the third byte of a 3 byte form has not, 0 is returned and the
   pointer is moved by 2.

*******************************************************************************/

u2 utf_nextu2(char **utf_ptr)
{
	/* current position in utf text */
	unsigned char *bytes = (unsigned char *) (*utf_ptr);
	/* first byte and its upper 4 bits */
	unsigned char lead = bytes[0];
	unsigned int  form = lead >> 4;
	/* uncompressed unicode character */
	u2 result = 0;
	/* number of bytes used to represent the unicode character */
	int consumed = 0;

	if ((form == 0xC) || (form == 0xD)) {
		/* 2 bytes */

		if ((bytes[1] & 0xC0) == 0x80) {
			result   = ((lead & 0x1F) << 6) + (bytes[1] & 0x3F);
			consumed = 2;
		}
	}
	else if (form == 0xE) {
		/* 2 or 3 bytes */

		if ((bytes[1] & 0xC0) == 0x80) {
			if ((bytes[2] & 0xC0) == 0x80) {
				result   = ((((lead & 0x0f) << 6) + (bytes[1] & 0x3f)) << 6) +
					(bytes[2] & 0x3f);
				consumed = 3;
			}
			else
				consumed = 2;
		}
	}
	else {
		/* 1 byte */

		(*utf_ptr)++;

		return (u2) lead;
	}

	/* update position in utf-text */

	*utf_ptr = (char *) (bytes + consumed);

	return result;
}
791
792
793 /* utf_strlen ******************************************************************
794
795    Determine number of unicode characters in the utf string.
796
797 *******************************************************************************/
798
799 u4 utf_strlen(utf *u)
800 {
801         char *endpos;                       /* points behind utf string           */
802         char *utf_ptr;                      /* current position in utf text       */
803         u4 len = 0;                         /* number of unicode characters       */
804
805         if (!u) {
806                 exceptions_throw_nullpointerexception();
807                 return 0;
808         }
809
810         endpos = UTF_END(u);
811         utf_ptr = u->text;
812
813         while (utf_ptr < endpos) {
814                 len++;
815                 /* next unicode character */
816                 utf_nextu2(&utf_ptr);
817         }
818
819         if (utf_ptr != endpos)
820                 /* string ended abruptly */
821                 throw_cacao_exception_exit(string_java_lang_InternalError,
822                                                                    "Illegal utf8 string");
823
824         return len;
825 }
826
827
828 /* u2_utflength ****************************************************************
829
830    Returns the utf length in bytes of a u2 array.
831
832 *******************************************************************************/
833
834 u4 u2_utflength(u2 *text, u4 u2_length)
835 {
836         u4 result_len = 0;                  /* utf length in bytes                */
837         u2 ch;                              /* current unicode character          */
838         u4 len;
839         
840         for (len = 0; len < u2_length; len++) {
841                 /* next unicode character */
842                 ch = *text++;
843           
844                 /* determine bytes required to store unicode character as utf */
845                 if (ch && (ch < 0x80)) 
846                         result_len++;
847                 else if (ch < 0x800)
848                         result_len += 2;        
849                 else 
850                         result_len += 3;        
851         }
852
853     return result_len;
854 }
855
856
857 /* utf_display *****************************************************************
858
859    Write utf symbol to stdout (for debugging purposes).
860
861 *******************************************************************************/
862
863 void utf_display(utf *u)
864 {
865         char *endpos;                       /* points behind utf string           */
866         char *utf_ptr;                      /* current position in utf text       */
867
868         if (u == NULL) {
869                 printf("NULL");
870                 fflush(stdout);
871                 return;
872         }
873
874         endpos = UTF_END(u);
875         utf_ptr = u->text;
876
877         while (utf_ptr < endpos) {
878                 /* read next unicode character */
879
880                 u2 c = utf_nextu2(&utf_ptr);
881
882                 if ((c >= 32) && (c <= 127))
883                         printf("%c", c);
884                 else
885                         printf("?");
886         }
887
888         fflush(stdout);
889 }
890
891
892 /* utf_display_classname *******************************************************
893
894    Write utf symbol to stdout with `/' converted to `.' (for debugging
895    purposes).
896
897 *******************************************************************************/
898
899 void utf_display_classname(utf *u)
900 {
901         char *endpos;                       /* points behind utf string           */
902         char *utf_ptr;                      /* current position in utf text       */
903
904         if (u == NULL) {
905                 printf("NULL");
906                 fflush(stdout);
907                 return;
908         }
909
910         endpos = UTF_END(u);
911         utf_ptr = u->text;
912
913         while (utf_ptr < endpos) {
914                 /* read next unicode character */
915
916                 u2 c = utf_nextu2(&utf_ptr);
917
918                 if (c == '/')
919                         c = '.';
920
921                 if ((c >= 32) && (c <= 127))
922                         printf("%c", c);
923                 else
924                         printf("?");
925         }
926
927         fflush(stdout);
928 }
929
930
931 /* utf_sprint ******************************************************************
932         
933    Write utf symbol into c-string (for debugging purposes).
934
935 *******************************************************************************/
936
937 void utf_sprint(char *buffer, utf *u)
938 {
939         char *endpos;                       /* points behind utf string           */
940         char *utf_ptr;                      /* current position in utf text       */
941         u2 pos = 0;                         /* position in c-string               */
942
943         if (!u) {
944                 strcpy(buffer, "NULL");
945                 return;
946         }
947
948         endpos = UTF_END(u);
949         utf_ptr = u->text;
950
951         while (utf_ptr < endpos) 
952                 /* copy next unicode character */       
953                 buffer[pos++] = utf_nextu2(&utf_ptr);
954
955         /* terminate string */
956         buffer[pos] = '\0';
957 }
958
959
960 /* utf_sprint_classname ********************************************************
961         
962    Write utf symbol into c-string with `/' converted to `.' (for debugging
963    purposes).
964
965 *******************************************************************************/
966
967 void utf_sprint_classname(char *buffer, utf *u)
968 {
969         char *endpos;                       /* points behind utf string           */
970         char *utf_ptr;                      /* current position in utf text       */
971         u2 pos = 0;                         /* position in c-string               */
972
973         if (!u) {
974                 strcpy(buffer, "NULL");
975                 return;
976         }
977
978         endpos = UTF_END(u);
979         utf_ptr = u->text;
980
981         while (utf_ptr < endpos) {
982                 /* copy next unicode character */       
983                 u2 c = utf_nextu2(&utf_ptr);
984                 if (c == '/') c = '.';
985                 buffer[pos++] = c;
986         }
987
988         /* terminate string */
989         buffer[pos] = '\0';
990 }
991
992
993 /* utf_strcat ******************************************************************
994         
995    Like libc strcat, but uses an utf8 string.
996
997 *******************************************************************************/
998
999 void utf_strcat(char *buffer, utf *u)
1000 {
1001         utf_sprint(buffer + strlen(buffer), u);
1002 }
1003
1004
1005 /* utf_strcat_classname ********************************************************
1006         
1007    Like libc strcat, but uses an utf8 string.
1008
1009 *******************************************************************************/
1010
1011 void utf_strcat_classname(char *buffer, utf *u)
1012 {
1013         utf_sprint_classname(buffer + strlen(buffer), u);
1014 }
1015
1016
1017 /* utf_fprint ******************************************************************
1018         
1019    Write utf symbol into file.
1020
1021 *******************************************************************************/
1022
1023 void utf_fprint(FILE *file, utf *u)
1024 {
1025         char *endpos;                       /* points behind utf string           */
1026         char *utf_ptr;                      /* current position in utf text       */
1027
1028         if (!u)
1029                 return;
1030
1031         endpos = UTF_END(u);
1032         utf_ptr = u->text;
1033
1034         while (utf_ptr < endpos) { 
1035                 /* read next unicode character */                
1036                 u2 c = utf_nextu2(&utf_ptr);                            
1037
1038                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1039                 else fprintf(file, "?");
1040         }
1041 }
1042
1043
1044 /* utf_fprint_classname ********************************************************
1045         
1046    Write utf symbol into file with `/' converted to `.'.
1047
1048 *******************************************************************************/
1049
1050 void utf_fprint_classname(FILE *file, utf *u)
1051 {
1052         char *endpos;                       /* points behind utf string           */
1053         char *utf_ptr;                      /* current position in utf text       */
1054
1055     if (!u)
1056                 return;
1057
1058         endpos = UTF_END(u);
1059         utf_ptr = u->text;
1060
1061         while (utf_ptr < endpos) { 
1062                 /* read next unicode character */                
1063                 u2 c = utf_nextu2(&utf_ptr);                            
1064                 if (c == '/') c = '.';
1065
1066                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1067                 else fprintf(file, "?");
1068         }
1069 }
1070
1071
/* is_valid_utf ****************************************************************

   Return true if the given byte range is a valid UTF-8 string in the
   flavour used by Java.

   utf_ptr...points to first character
   end_pos...points after last character

   Rules that are checked:
     - end_pos must not lie before utf_ptr (an empty range is valid)
     - the byte 0x00 must not occur
     - only 1-, 2- and 3-byte sequences are accepted
     - every continuation byte must have the form 10xx xxxx
     - the value zero is only accepted in its 2-byte form

   Deliberately NOT rejected (Sun Java seems to accept them, too):
   overlong encodings of non-zero values, surrogates (0xd800-0xdfff)
   and the code points 0xfffe/0xffff.

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	const unsigned char *cur;           /* scanning position                  */
	const unsigned char *end;           /* first byte behind the string       */

	if (end_pos < utf_ptr)
		return false;

	cur = (const unsigned char *) utf_ptr;
	end = (const unsigned char *) end_pos;

	while (cur < end) {
		unsigned char lead = *cur++;
		unsigned long value;            /* decoded code point                 */
		int           tail;             /* number of continuation bytes       */
		int           k;

		if (lead == 0x00)
			return false;               /* 0x00 is not allowed                */

		if (lead < 0x80)
			continue;                   /* ASCII                              */

		if ((lead & 0xe0) == 0xc0)
			tail = 1;                   /* 110x xxxx                          */
		else if ((lead & 0xf0) == 0xe0)
			tail = 2;                   /* 1110 xxxx                          */
		else
			return false;               /* stray continuation byte, invalid   */
										/* lead byte, or a sequence longer    */
										/* than Java allows                   */

		if ((end - cur) < tail)
			return false;               /* missing bytes                      */

		/* payload bits of the lead byte */
		value = (unsigned long) (lead & (0x3f >> tail));

		for (k = 0; k < tail; k++) {
			unsigned char next = *cur++;

			if ((next & 0xc0) != 0x80)  /* 10xx xxxx                          */
				return false;

			value = (value << 6) | (unsigned long) (next & 0x3f);
		}

		/* Java special: zero has to be written as the 2-byte sequence */

		if ((value == 0) && (tail != 1))
			return false;
	}

	return true;
}
1137
1138
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, bytes below
   0x20 and the two-byte encoding of zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names               */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters        */

		/* Disallow zero (0xc0 0x80).  The second byte is only inspected
		   if it still belongs to the name, so that nothing behind
		   end_pos is ever read. */

		if ((c == 0xc0) && (utf_ptr < end_pos)
			&& ((unsigned char) *utf_ptr == 0x80))
			return false;
	}

	return true;
}
1166
1167 bool is_valid_name_utf(utf *u)
1168 {
1169         return is_valid_name(u->text, UTF_END(u));
1170 }
1171
1172
1173 /* utf_show ********************************************************************
1174
1175    Writes the utf symbols in the utfhash to stdout and displays the
1176    number of external hash chains grouped according to the chainlength
1177    (for debugging purposes).
1178
1179 *******************************************************************************/
1180
1181 #if !defined(NDEBUG)
1182 void utf_show(void)
1183 {
1184
1185 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1186
1187         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1188         u4 max_chainlength = 0;      /* maximum length of the chains */
1189         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1190         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1191         u4 i;
1192
1193         printf("UTF-HASH:\n");
1194
1195         /* show element of utf-hashtable */
1196
1197         for (i = 0; i < hashtable_utf.size; i++) {
1198                 utf *u = hashtable_utf.ptr[i];
1199
1200                 if (u) {
1201                         printf("SLOT %d: ", (int) i);
1202
1203                         while (u) {
1204                                 printf("'");
1205                                 utf_display(u);
1206                                 printf("' ");
1207                                 u = u->hashlink;
1208                         }       
1209                         printf("\n");
1210                 }
1211         }
1212
1213         printf("UTF-HASH: %d slots for %d entries\n", 
1214                    (int) hashtable_utf.size, (int) hashtable_utf.entries );
1215
1216         if (hashtable_utf.entries == 0)
1217                 return;
1218
1219         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1220
1221         for (i=0;i<CHAIN_LIMIT;i++)
1222                 chain_count[i]=0;
1223
1224         /* count numbers of hashchains according to their length */
1225         for (i=0; i<hashtable_utf.size; i++) {
1226                   
1227                 utf *u = (utf*) hashtable_utf.ptr[i];
1228                 u4 chain_length = 0;
1229
1230                 /* determine chainlength */
1231                 while (u) {
1232                         u = u->hashlink;
1233                         chain_length++;
1234                 }
1235
1236                 /* update sum of all chainlengths */
1237                 sum_chainlength+=chain_length;
1238
1239                 /* determine the maximum length of the chains */
1240                 if (chain_length>max_chainlength)
1241                         max_chainlength = chain_length;
1242
1243                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1244                 if (chain_length>=CHAIN_LIMIT) {
1245                         beyond_limit+=chain_length;
1246                         chain_length=CHAIN_LIMIT-1;
1247                 }
1248
1249                 /* update number of hashchains of current length */
1250                 chain_count[chain_length]++;
1251         }
1252
1253         /* display results */  
1254         for (i=1;i<CHAIN_LIMIT-1;i++) 
1255                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf.entries));
1256           
1257         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf.entries);
1258
1259
1260         printf("max. chainlength:%5d\n",max_chainlength);
1261
1262         /* avg. chainlength = sum of chainlengths / number of chains */
1263         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf.size-chain_count[0]));
1264 }
1265 #endif /* !defined(NDEBUG) */
1266
1267
1268 /*
1269  * These are local overrides for various environment variables in Emacs.
1270  * Please do not remove this and leave it at the end of the file, where
1271  * Emacs will automagically detect them.
1272  * ---------------------------------------------------------------------
1273  * Local variables:
1274  * mode: c
1275  * indent-tabs-mode: t
1276  * c-basic-offset: 4
1277  * tab-width: 4
1278  * End:
1279  */