* src/vm/jit/x86_64/md.c (md_signal_handler_sigusr2): Fixed comment.
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf8.c - utf8 string functions
2
3    Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6    J. Wenninger, Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23    02110-1301, USA.
24
25    Contact: cacao@cacaojvm.org
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32                         Edwin Steiner
33
34    $Id: utf8.c 5123 2006-07-12 21:45:34Z twisti $
35
36 */
37
38
39 #include "config.h"
40
41 #include <string.h>
42 #include <assert.h>
43
44 #include "vm/types.h"
45
46 #include "mm/memory.h"
47
48 #if defined(ENABLE_THREADS)
49 # include "threads/native/lock.h"
50 #else
51 # include "threads/none/lock.h"
52 #endif
53
54 #include "vm/builtin.h"
55 #include "vm/exceptions.h"
56 #include "vm/hashtable.h"
57 #include "vm/options.h"
58 #include "vm/statistics.h"
59 #include "vm/stringlocal.h"
60 #include "vm/utf8.h"
61
62
63 /* global variables ***********************************************************/
64
65 /* hashsize must be power of 2 */
66
67 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
68
69 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
70
71
72 /* utf-symbols for pointer comparison of frequently used strings **************/
73
74 utf *utf_java_lang_Object;
75
76 utf *utf_java_lang_Class;
77 utf *utf_java_lang_ClassLoader;
78 utf *utf_java_lang_Cloneable;
79 utf *utf_java_lang_SecurityManager;
80 utf *utf_java_lang_String;
81 utf *utf_java_lang_System;
82 utf *utf_java_lang_ThreadGroup;
83 utf *utf_java_io_Serializable;
84
85 utf *utf_java_lang_Throwable;
86 utf *utf_java_lang_VMThrowable;
87 utf *utf_java_lang_Error;
88 utf *utf_java_lang_AbstractMethodError;
89 utf *utf_java_lang_LinkageError;
90 utf *utf_java_lang_NoClassDefFoundError;
91 utf *utf_java_lang_NoSuchMethodError;
92 utf *utf_java_lang_OutOfMemoryError;
93
94 utf *utf_java_lang_Exception;
95 utf *utf_java_lang_ClassCastException;
96 utf *utf_java_lang_ClassNotFoundException;
97 utf *utf_java_lang_IllegalArgumentException;
98 utf *utf_java_lang_IllegalMonitorStateException;
99
100 utf *utf_java_lang_NullPointerException;
101
102 utf* utf_java_lang_Void;
103 utf* utf_java_lang_Boolean;
104 utf* utf_java_lang_Byte;
105 utf* utf_java_lang_Character;
106 utf* utf_java_lang_Short;
107 utf* utf_java_lang_Integer;
108 utf* utf_java_lang_Long;
109 utf* utf_java_lang_Float;
110 utf* utf_java_lang_Double;
111
112 utf *utf_java_lang_StackTraceElement;
113 utf *utf_java_lang_reflect_Constructor;
114 utf *utf_java_lang_reflect_Field;
115 utf *utf_java_lang_reflect_Method;
116 utf *utf_java_util_Vector;
117
118 utf *utf_InnerClasses;                  /* InnerClasses                       */
119 utf *utf_ConstantValue;                 /* ConstantValue                      */
120 utf *utf_Code;                          /* Code                               */
121 utf *utf_Exceptions;                    /* Exceptions                         */
122 utf *utf_LineNumberTable;               /* LineNumberTable                    */
123 utf *utf_SourceFile;                    /* SourceFile                         */
124
125 utf *utf_init;                          /* <init>                             */
126 utf *utf_clinit;                        /* <clinit>                           */
127 utf *utf_clone;                         /* clone                              */
128 utf *utf_finalize;                      /* finalize                           */
129 utf *utf_run;                           /* run                                */
130
131 utf *utf_add;                           /* add                                */
132 utf *utf_remove;                        /* remove                             */
133 utf *utf_put;                           /* put                                */
134 utf *utf_get;                           /* get                                */
135 utf *utf_value;                         /* value                              */
136
137 utf *utf_fillInStackTrace;
138 utf *utf_getSystemClassLoader;
139 utf *utf_loadClass;
140 utf *utf_printStackTrace;
141
142 utf *utf_Z;                             /* Z                                  */
143 utf *utf_B;                             /* B                                  */
144 utf *utf_C;                             /* C                                  */
145 utf *utf_S;                             /* S                                  */
146 utf *utf_I;                             /* I                                  */
147 utf *utf_J;                             /* J                                  */
148 utf *utf_F;                             /* F                                  */
149 utf *utf_D;                             /* D                                  */
150
151 utf *utf_void__void;                    /* ()V                                */
152 utf *utf_boolean__void;                 /* (Z)V                               */
153 utf *utf_byte__void;                    /* (B)V                               */
154 utf *utf_char__void;                    /* (C)V                               */
155 utf *utf_short__void;                   /* (S)V                               */
156 utf *utf_int__void;                     /* (I)V                               */
157 utf *utf_long__void;                    /* (J)V                               */
158 utf *utf_float__void;                   /* (F)V                               */
159 utf *utf_double__void;                  /* (D)V                               */
160
161 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
162 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
163 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
164 utf *utf_java_lang_Object__java_lang_Object;
165 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
166 utf *utf_java_lang_String__java_lang_Class;
167 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
168
169 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
170 utf *utf_null;
171 utf *array_packagename;
172
173
174 /* utf_init ********************************************************************
175
176    Initializes the utf8 subsystem.
177
178 *******************************************************************************/
179
180 bool utf8_init(void)
181 {
182         /* create utf8 hashtable */
183
184         hashtable_utf = NEW(hashtable);
185
186         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
187
188 #if defined(ENABLE_STATISTICS)
189         if (opt_stat)
190                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
191 #endif
192
193         /* create utf-symbols for pointer comparison of frequently used strings */
194
195         utf_java_lang_Object           = utf_new_char("java/lang/Object");
196
197         utf_java_lang_Class            = utf_new_char("java/lang/Class");
198         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
199         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
200         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
201         utf_java_lang_String           = utf_new_char("java/lang/String");
202         utf_java_lang_System           = utf_new_char("java/lang/System");
203         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
204         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
205
206         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
207         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
208         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
209
210         utf_java_lang_AbstractMethodError =
211                 utf_new_char(string_java_lang_AbstractMethodError);
212
213         utf_java_lang_LinkageError =
214                 utf_new_char(string_java_lang_LinkageError);
215
216         utf_java_lang_NoClassDefFoundError =
217                 utf_new_char(string_java_lang_NoClassDefFoundError);
218
219         utf_java_lang_NoSuchMethodError =
220                 utf_new_char(string_java_lang_NoSuchMethodError);
221
222         utf_java_lang_OutOfMemoryError =
223                 utf_new_char(string_java_lang_OutOfMemoryError);
224
225         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
226
227         utf_java_lang_ClassCastException =
228                 utf_new_char(string_java_lang_ClassCastException);
229
230         utf_java_lang_ClassNotFoundException =
231                 utf_new_char(string_java_lang_ClassNotFoundException);
232
233         utf_java_lang_IllegalArgumentException =
234                 utf_new_char(string_java_lang_IllegalArgumentException);
235
236         utf_java_lang_IllegalMonitorStateException =
237                 utf_new_char(string_java_lang_IllegalMonitorStateException);
238
239         utf_java_lang_NullPointerException =
240                 utf_new_char(string_java_lang_NullPointerException);
241
242         utf_java_lang_Void             = utf_new_char("java/lang/Void");
243         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
244         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
245         utf_java_lang_Character        = utf_new_char("java/lang/Character");
246         utf_java_lang_Short            = utf_new_char("java/lang/Short");
247         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
248         utf_java_lang_Long             = utf_new_char("java/lang/Long");
249         utf_java_lang_Float            = utf_new_char("java/lang/Float");
250         utf_java_lang_Double           = utf_new_char("java/lang/Double");
251
252         utf_java_lang_StackTraceElement =
253                 utf_new_char("java/lang/StackTraceElement");
254
255         utf_java_lang_reflect_Constructor =
256                 utf_new_char("java/lang/reflect/Constructor");
257
258         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
259         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
260         utf_java_util_Vector           = utf_new_char("java/util/Vector");
261
262         utf_InnerClasses               = utf_new_char("InnerClasses");
263         utf_ConstantValue              = utf_new_char("ConstantValue");
264         utf_Code                       = utf_new_char("Code");
265         utf_Exceptions                 = utf_new_char("Exceptions");
266         utf_LineNumberTable            = utf_new_char("LineNumberTable");
267         utf_SourceFile                 = utf_new_char("SourceFile");
268
269         utf_init                           = utf_new_char("<init>");
270         utf_clinit                         = utf_new_char("<clinit>");
271         utf_clone                      = utf_new_char("clone");
272         utf_finalize                   = utf_new_char("finalize");
273         utf_run                        = utf_new_char("run");
274
275         utf_add                        = utf_new_char("add");
276         utf_remove                     = utf_new_char("remove");
277         utf_put                        = utf_new_char("put");
278         utf_get                        = utf_new_char("get");
279         utf_value                      = utf_new_char("value");
280
281         utf_printStackTrace            = utf_new_char("printStackTrace");
282         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
283         utf_loadClass                  = utf_new_char("loadClass");
284         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
285
286         utf_Z                          = utf_new_char("Z");
287         utf_B                          = utf_new_char("B");
288         utf_C                          = utf_new_char("C");
289         utf_S                          = utf_new_char("S");
290         utf_I                          = utf_new_char("I");
291         utf_J                          = utf_new_char("J");
292         utf_F                          = utf_new_char("F");
293         utf_D                          = utf_new_char("D");
294
295         utf_void__void                 = utf_new_char("()V");
296         utf_boolean__void              = utf_new_char("(Z)V");
297         utf_byte__void                 = utf_new_char("(B)V");
298         utf_char__void                 = utf_new_char("(C)V");
299         utf_short__void                = utf_new_char("(S)V");
300         utf_int__void                  = utf_new_char("(I)V");
301         utf_long__void                 = utf_new_char("(J)V");
302         utf_float__void                = utf_new_char("(F)V");
303         utf_double__void               = utf_new_char("(D)V");
304         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
305         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
306
307         utf_void__java_lang_ClassLoader =
308                 utf_new_char("()Ljava/lang/ClassLoader;");
309
310         utf_java_lang_Object__java_lang_Object =
311                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
312
313         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
314
315         utf_java_lang_String__java_lang_Class =
316                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
317
318         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
319
320         utf_null                       = utf_new_char("null");
321         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
322         array_packagename              = utf_new_char("\t<the array package>");
323
324         /* everything's ok */
325
326         return true;
327 }
328
329
330 /* utf_hashkey *****************************************************************
331
332    The hashkey is computed from the utf-text by using up to 8
333    characters.  For utf-symbols longer than 15 characters 3 characters
334    are taken from the beginning and the end, 2 characters are taken
335    from the middle.
336
337 *******************************************************************************/
338
339 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
340 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
341
342 u4 utf_hashkey(const char *text, u4 length)
343 {
344         const char *start_pos = text;       /* pointer to utf text                */
345         u4 a;
346
347         switch (length) {
348         case 0: /* empty string */
349                 return 0;
350
351         case 1: return fbs(0);
352         case 2: return fbs(0) ^ nbs(3);
353         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
354         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
355         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
356         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
357         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
358         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
359
360         case 9:
361                 a = fbs(0);
362                 a ^= nbs(1);
363                 a ^= nbs(2);
364                 text++;
365                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
366
367         case 10:
368                 a = fbs(0);
369                 text++;
370                 a ^= nbs(2);
371                 a ^= nbs(3);
372                 a ^= nbs(4);
373                 text++;
374                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
375
376         case 11:
377                 a = fbs(0);
378                 text++;
379                 a ^= nbs(2);
380                 a ^= nbs(3);
381                 a ^= nbs(4);
382                 text++;
383                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
384
385         case 12:
386                 a = fbs(0);
387                 text += 2;
388                 a ^= nbs(2);
389                 a ^= nbs(3);
390                 text++;
391                 a ^= nbs(5);
392                 a ^= nbs(6);
393                 a ^= nbs(7);
394                 text++;
395                 return a ^ nbs(9) ^ nbs(10);
396
397         case 13:
398                 a = fbs(0);
399                 a ^= nbs(1);
400                 text++;
401                 a ^= nbs(3);
402                 a ^= nbs(4);
403                 text += 2;      
404                 a ^= nbs(7);
405                 a ^= nbs(8);
406                 text += 2;
407                 return a ^ nbs(9) ^ nbs(10);
408
409         case 14:
410                 a = fbs(0);
411                 text += 2;      
412                 a ^= nbs(3);
413                 a ^= nbs(4);
414                 text += 2;      
415                 a ^= nbs(7);
416                 a ^= nbs(8);
417                 text += 2;
418                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
419
420         case 15:
421                 a = fbs(0);
422                 text += 2;      
423                 a ^= nbs(3);
424                 a ^= nbs(4);
425                 text += 2;      
426                 a ^= nbs(7);
427                 a ^= nbs(8);
428                 text += 2;
429                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
430
431         default:  /* 3 characters from beginning */
432                 a = fbs(0);
433                 text += 2;
434                 a ^= nbs(3);
435                 a ^= nbs(4);
436
437                 /* 2 characters from middle */
438                 text = start_pos + (length / 2);
439                 a ^= fbs(5);
440                 text += 2;
441                 a ^= nbs(6);    
442
443                 /* 3 characters from end */
444                 text = start_pos + length - 4;
445
446                 a ^= fbs(7);
447                 text++;
448
449                 return a ^ nbs(10) ^ nbs(11);
450     }
451 }
452
453 /* utf_full_hashkey ************************************************************
454
455    This function computes a hash value using all bytes in the string.
456
457    The algorithm is the "One-at-a-time" algorithm as published
458    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
459
460 *******************************************************************************/
461
462 u4 utf_full_hashkey(const char *text, u4 length)
463 {
464         register const unsigned char *p = (const unsigned char *) text;
465         register u4 hash;
466         register u4 i;
467
468         hash = 0;
469         for (i=length; i--;)
470         {
471             hash += *p++;
472             hash += (hash << 10);
473             hash ^= (hash >> 6);
474         }
475         hash += (hash << 3);
476         hash ^= (hash >> 11);
477         hash += (hash << 15);
478
479         return hash;
480 }
481
482 /* unicode_hashkey *************************************************************
483
484    Compute the hashkey of a unicode string.
485
486 *******************************************************************************/
487
488 u4 unicode_hashkey(u2 *text, u2 len)
489 {
490         return utf_hashkey((char *) text, len);
491 }
492
493
494 /* utf_new *********************************************************************
495
496    Creates a new utf-symbol, the text of the symbol is passed as a
497    u1-array. The function searches the utf-hashtable for a utf-symbol
498    with this text. On success the element returned, otherwise a new
499    hashtable element is created.
500
501    If the number of entries in the hashtable exceeds twice the size of
502    the hashtable slots a reorganization of the hashtable is done and
503    the utf symbols are copied to a new hashtable with doubled size.
504
505 *******************************************************************************/
506
507 utf *utf_new(const char *text, u2 length)
508 {
509         u4 key;                             /* hashkey computed from utf-text     */
510         u4 slot;                            /* slot in hashtable                  */
511         utf *u;                             /* hashtable element                  */
512         u2 i;
513
514         LOCK_MONITOR_ENTER(hashtable_utf->header);
515
516 #if defined(ENABLE_STATISTICS)
517         if (opt_stat)
518                 count_utf_new++;
519 #endif
520
521         key  = utf_hashkey(text, length);
522         slot = key & (hashtable_utf->size - 1);
523         u    = hashtable_utf->ptr[slot];
524
525         /* search external hash chain for utf-symbol */
526
527         while (u) {
528                 if (u->blength == length) {
529                         /* compare text of hashtable elements */
530
531                         for (i = 0; i < length; i++)
532                                 if (text[i] != u->text[i])
533                                         goto nomatch;
534                         
535 #if defined(ENABLE_STATISTICS)
536                         if (opt_stat)
537                                 count_utf_new_found++;
538 #endif
539
540                         /* symbol found in hashtable */
541
542                         LOCK_MONITOR_EXIT(hashtable_utf->header);
543
544                         return u;
545                 }
546
547         nomatch:
548                 u = u->hashlink; /* next element in external chain */
549         }
550
551 #if defined(ENABLE_STATISTICS)
552         if (opt_stat)
553                 count_utf_len += sizeof(utf) + length + 1;
554 #endif
555
556         /* location in hashtable found, create new utf element */
557         u = NEW(utf);
558         u->blength  = length;               /* length in bytes of utfstring       */
559         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
560         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
561
562         memcpy(u->text, text, length);      /* copy utf-text                      */
563         u->text[length] = '\0';
564
565         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
566         hashtable_utf->entries++;           /* update number of entries           */
567
568         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
569
570         /* reorganization of hashtable, average length of the external
571            chains is approx. 2 */
572
573                 hashtable *newhash;                              /* the new hashtable */
574                 u4         i;
575                 utf       *u;
576                 utf       *nextu;
577                 u4         slot;
578
579                 /* create new hashtable, double the size */
580
581                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
582
583 #if defined(ENABLE_STATISTICS)
584                 if (opt_stat)
585                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
586 #endif
587
588                 /* transfer elements to new hashtable */
589
590                 for (i = 0; i < hashtable_utf->size; i++) {
591                         u = hashtable_utf->ptr[i];
592
593                         while (u) {
594                                 nextu = u->hashlink;
595                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
596                                                 
597                                 u->hashlink = (utf *) newhash->ptr[slot];
598                                 newhash->ptr[slot] = u;
599
600                                 /* follow link in external hash chain */
601
602                                 u = nextu;
603                         }
604                 }
605         
606                 /* dispose old table */
607
608                 hashtable_free(hashtable_utf);
609
610                 hashtable_utf = newhash;
611         }
612
613         LOCK_MONITOR_EXIT(hashtable_utf->header);
614
615         return u;
616 }
617
618
619 /* utf_new_u2 ******************************************************************
620
621    Make utf symbol from u2 array, if isclassname is true '.' is
622    replaced by '/'.
623
624 *******************************************************************************/
625
626 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
627 {
628         char *buffer;                   /* memory buffer for  unicode characters  */
629         char *pos;                      /* pointer to current position in buffer  */
630         u4 left;                        /* unicode characters left                */
631         u4 buflength;                   /* utf length in bytes of the u2 array    */
632         utf *result;                    /* resulting utf-string                   */
633         int i;          
634
635         /* determine utf length in bytes and allocate memory */
636
637         buflength = u2_utflength(unicode_pos, unicode_length); 
638         buffer    = MNEW(char, buflength);
639  
640         left = buflength;
641         pos  = buffer;
642
643         for (i = 0; i++ < unicode_length; unicode_pos++) {
644                 /* next unicode character */
645                 u2 c = *unicode_pos;
646                 
647                 if ((c != 0) && (c < 0x80)) {
648                         /* 1 character */       
649                         left--;
650                 if ((int) left < 0) break;
651                         /* convert classname */
652                         if (isclassname && c == '.')
653                                 *pos++ = '/';
654                         else
655                                 *pos++ = (char) c;
656
657                 } else if (c < 0x800) {             
658                         /* 2 characters */                              
659                 unsigned char high = c >> 6;
660                 unsigned char low  = c & 0x3F;
661                         left = left - 2;
662                 if ((int) left < 0) break;
663                 *pos++ = high | 0xC0; 
664                 *pos++ = low  | 0x80;     
665
666                 } else {         
667                 /* 3 characters */                              
668                 char low  = c & 0x3f;
669                 char mid  = (c >> 6) & 0x3F;
670                 char high = c >> 12;
671                         left = left - 3;
672                 if ((int) left < 0) break;
673                 *pos++ = high | 0xE0; 
674                 *pos++ = mid  | 0x80;  
675                 *pos++ = low  | 0x80;   
676                 }
677         }
678         
679         /* insert utf-string into symbol-table */
680         result = utf_new(buffer,buflength);
681
682         MFREE(buffer, char, buflength);
683
684         return result;
685 }
686
687
688 /* utf_new_char ****************************************************************
689
690    Creates a new utf symbol, the text for this symbol is passed as a
691    c-string ( = char* ).
692
693 *******************************************************************************/
694
695 utf *utf_new_char(const char *text)
696 {
697         return utf_new(text, strlen(text));
698 }
699
700
701 /* utf_new_char_classname ******************************************************
702
703    Creates a new utf symbol, the text for this symbol is passed as a
704    c-string ( = char* ) "." characters are going to be replaced by
705    "/". Since the above function is used often, this is a separte
706    function, instead of an if.
707
708 *******************************************************************************/
709
710 utf *utf_new_char_classname(const char *text)
711 {
712         if (strchr(text, '.')) {
713                 char *txt = strdup(text);
714                 char *end = txt + strlen(txt);
715                 char *c;
716                 utf *tmpRes;
717
718                 for (c = txt; c < end; c++)
719                         if (*c == '.') *c = '/';
720
721                 tmpRes = utf_new(txt, strlen(txt));
722                 FREE(txt, 0);
723
724                 return tmpRes;
725
726         } else
727                 return utf_new(text, strlen(text));
728 }
729
730
731 /* utf_nextu2 ******************************************************************
732
733    Read the next unicode character from the utf string and increment
734    the utf-string pointer accordingly.
735
736 *******************************************************************************/
737
738 u2 utf_nextu2(char **utf_ptr)
739 {
740     /* uncompressed unicode character */
741     u2 unicode_char = 0;
742     /* current position in utf text */  
743     unsigned char *utf = (unsigned char *) (*utf_ptr);
744     /* bytes representing the unicode character */
745     unsigned char ch1, ch2, ch3;
746     /* number of bytes used to represent the unicode character */
747     int len = 0;
748         
749     switch ((ch1 = utf[0]) >> 4) {
750         default: /* 1 byte */
751                 (*utf_ptr)++;
752                 return (u2) ch1;
753         case 0xC: 
754         case 0xD: /* 2 bytes */
755                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
756                         unsigned char high = ch1 & 0x1F;
757                         unsigned char low  = ch2 & 0x3F;
758                         unicode_char = (high << 6) + low;
759                         len = 2;
760                 }
761                 break;
762
763         case 0xE: /* 2 or 3 bytes */
764                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
765                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
766                                 unsigned char low  = ch3 & 0x3f;
767                                 unsigned char mid  = ch2 & 0x3f;
768                                 unsigned char high = ch1 & 0x0f;
769                                 unicode_char = (((high << 6) + mid) << 6) + low;
770                                 len = 3;
771                         } else
772                                 len = 2;                                           
773                 }
774                 break;
775     }
776
777     /* update position in utf-text */
778     *utf_ptr = (char *) (utf + len);
779
780     return unicode_char;
781 }
782
783
784 /* utf_bytes *******************************************************************
785
786    Determine number of bytes (aka. octets) in the utf string.
787
788    IN:
789       u............utf string
790
791    OUT:
792       The number of octets of this utf string.
793           There is _no_ terminating zero included in this count.
794
795 *******************************************************************************/
796
797 u4 utf_bytes(utf *u)
798 {
799         return u->blength;
800 }
801
802 /* utf_get_number_of_u2s_for_buffer ********************************************
803
804    Determine number of UTF-16 u2s in the given UTF-8 buffer
805
806    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
807    to an array of u2s (UTF-16) and want to know how many of them you will get.
808    All other uses of this function are probably wrong.
809
810    IN:
811       buffer........points to first char in buffer
812           blength.......number of _bytes_ in the buffer
813
814    OUT:
815       the number of u2s needed to hold this string in UTF-16 encoding.
816           There is _no_ terminating zero included in this count.
817
818    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
819    exception.
820
821 *******************************************************************************/
822
823 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
824 {
825         const char *endpos;                 /* points behind utf string           */
826         const char *utf_ptr;                /* current position in utf text       */
827         u4 len = 0;                         /* number of unicode characters       */
828
829         utf_ptr = buffer;
830         endpos = utf_ptr + blength;
831
832         while (utf_ptr < endpos) {
833                 len++;
834                 /* next unicode character */
835                 utf_nextu2((char **)&utf_ptr);
836         }
837
838         assert(utf_ptr == endpos);
839
840         return len;
841 }
842
843
844 /* utf_get_number_of_u2s *******************************************************
845
846    Determine number of UTF-16 u2s in the utf string.
847
848    CAUTION: Use this function *only* when you want to convert a utf string
849    to an array of u2s and want to know how many of them you will get.
850    All other uses of this function are probably wrong.
851
852    IN:
853       u............utf string
854
855    OUT:
856       the number of u2s needed to hold this string in UTF-16 encoding.
857           There is _no_ terminating zero included in this count.
858           XXX 0 if a NullPointerException has been thrown (see below)
859
860 *******************************************************************************/
861
862 u4 utf_get_number_of_u2s(utf *u)
863 {
864         char *endpos;                       /* points behind utf string           */
865         char *utf_ptr;                      /* current position in utf text       */
866         u4 len = 0;                         /* number of unicode characters       */
867
868         /* XXX this is probably not checked by most callers! Review this after */
869         /* the invalid uses of this function have been eliminated */
870         if (!u) {
871                 exceptions_throw_nullpointerexception();
872                 return 0;
873         }
874
875         endpos = UTF_END(u);
876         utf_ptr = u->text;
877
878         while (utf_ptr < endpos) {
879                 len++;
880                 /* next unicode character */
881                 utf_nextu2(&utf_ptr);
882         }
883
884         if (utf_ptr != endpos)
885                 /* string ended abruptly */
886                 throw_cacao_exception_exit(string_java_lang_InternalError,
887                                                                    "Illegal utf8 string");
888
889         return len;
890 }
891
892
893 /* u2_utflength ****************************************************************
894
895    Returns the utf length in bytes of a u2 array.
896
897 *******************************************************************************/
898
899 u4 u2_utflength(u2 *text, u4 u2_length)
900 {
901         u4 result_len = 0;                  /* utf length in bytes                */
902         u2 ch;                              /* current unicode character          */
903         u4 len;
904         
905         for (len = 0; len < u2_length; len++) {
906                 /* next unicode character */
907                 ch = *text++;
908           
909                 /* determine bytes required to store unicode character as utf */
910                 if (ch && (ch < 0x80)) 
911                         result_len++;
912                 else if (ch < 0x800)
913                         result_len += 2;        
914                 else 
915                         result_len += 3;        
916         }
917
918     return result_len;
919 }
920
921
922 /* utf_copy ********************************************************************
923
924    Copy the given utf string byte-for-byte to a buffer.
925
926    IN:
927       buffer.......the buffer
928           u............the utf string
929
930 *******************************************************************************/
931
932 void utf_copy(char *buffer, utf *u)
933 {
934         /* our utf strings are zero-terminated (done by utf_new) */
935         MCOPY(buffer, u->text, char, u->blength + 1);
936 }
937
938
939 /* utf_cat *********************************************************************
940
941    Append the given utf string byte-for-byte to a buffer.
942
943    IN:
944       buffer.......the buffer
945           u............the utf string
946
947 *******************************************************************************/
948
949 void utf_cat(char *buffer, utf *u)
950 {
951         /* our utf strings are zero-terminated (done by utf_new) */
952         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
953 }
954
955
956 /* utf_copy_classname **********************************************************
957
958    Copy the given utf classname byte-for-byte to a buffer.
959    '/' is replaced by '.'
960
961    IN:
962       buffer.......the buffer
963           u............the utf string
964
965 *******************************************************************************/
966
967 void utf_copy_classname(char *buffer, utf *u)
968 {
969         char *bufptr;
970         char *srcptr;
971         char *endptr;
972         char ch;
973
974         bufptr = buffer;
975         srcptr = u->text;
976         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
977
978         while (srcptr != endptr) {
979                 ch = *srcptr++;
980                 if (ch == '/')
981                         ch = '.';
982                 *bufptr++ = ch;
983         }
984 }
985
986
987 /* utf_cat *********************************************************************
988
989    Append the given utf classname byte-for-byte to a buffer.
990    '/' is replaced by '.'
991
992    IN:
993       buffer.......the buffer
994           u............the utf string
995
996 *******************************************************************************/
997
998 void utf_cat_classname(char *buffer, utf *u)
999 {
1000         utf_copy_classname(buffer + strlen(buffer), u);
1001 }
1002
1003 /* utf_display_printable_ascii *************************************************
1004
1005    Write utf symbol to stdout (for debugging purposes).
1006    Non-printable and non-ASCII characters are printed as '?'.
1007
1008 *******************************************************************************/
1009
1010 void utf_display_printable_ascii(utf *u)
1011 {
1012         char *endpos;                       /* points behind utf string           */
1013         char *utf_ptr;                      /* current position in utf text       */
1014
1015         if (u == NULL) {
1016                 printf("NULL");
1017                 fflush(stdout);
1018                 return;
1019         }
1020
1021         endpos = UTF_END(u);
1022         utf_ptr = u->text;
1023
1024         while (utf_ptr < endpos) {
1025                 /* read next unicode character */
1026
1027                 u2 c = utf_nextu2(&utf_ptr);
1028
1029                 if ((c >= 32) && (c <= 127))
1030                         printf("%c", c);
1031                 else
1032                         printf("?");
1033         }
1034
1035         fflush(stdout);
1036 }
1037
1038
1039 /* utf_display_printable_ascii_classname ***************************************
1040
1041    Write utf symbol to stdout with `/' converted to `.' (for debugging
1042    purposes).
1043    Non-printable and non-ASCII characters are printed as '?'.
1044
1045 *******************************************************************************/
1046
1047 void utf_display_printable_ascii_classname(utf *u)
1048 {
1049         char *endpos;                       /* points behind utf string           */
1050         char *utf_ptr;                      /* current position in utf text       */
1051
1052         if (u == NULL) {
1053                 printf("NULL");
1054                 fflush(stdout);
1055                 return;
1056         }
1057
1058         endpos = UTF_END(u);
1059         utf_ptr = u->text;
1060
1061         while (utf_ptr < endpos) {
1062                 /* read next unicode character */
1063
1064                 u2 c = utf_nextu2(&utf_ptr);
1065
1066                 if (c == '/')
1067                         c = '.';
1068
1069                 if ((c >= 32) && (c <= 127))
1070                         printf("%c", c);
1071                 else
1072                         printf("?");
1073         }
1074
1075         fflush(stdout);
1076 }
1077
1078
1079 /* utf_sprint_convert_to_latin1 ************************************************
1080         
1081    Write utf symbol into c-string (for debugging purposes).
1082    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1083    invalid results.
1084
1085 *******************************************************************************/
1086
1087 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1088 {
1089         char *endpos;                       /* points behind utf string           */
1090         char *utf_ptr;                      /* current position in utf text       */
1091         u2 pos = 0;                         /* position in c-string               */
1092
1093         if (!u) {
1094                 strcpy(buffer, "NULL");
1095                 return;
1096         }
1097
1098         endpos = UTF_END(u);
1099         utf_ptr = u->text;
1100
1101         while (utf_ptr < endpos) 
1102                 /* copy next unicode character */       
1103                 buffer[pos++] = utf_nextu2(&utf_ptr);
1104
1105         /* terminate string */
1106         buffer[pos] = '\0';
1107 }
1108
1109
1110 /* utf_sprint_convert_to_latin1_classname **************************************
1111         
1112    Write utf symbol into c-string with `/' converted to `.' (for debugging
1113    purposes).
1114    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1115    invalid results.
1116
1117 *******************************************************************************/
1118
1119 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1120 {
1121         char *endpos;                       /* points behind utf string           */
1122         char *utf_ptr;                      /* current position in utf text       */
1123         u2 pos = 0;                         /* position in c-string               */
1124
1125         if (!u) {
1126                 strcpy(buffer, "NULL");
1127                 return;
1128         }
1129
1130         endpos = UTF_END(u);
1131         utf_ptr = u->text;
1132
1133         while (utf_ptr < endpos) {
1134                 /* copy next unicode character */       
1135                 u2 c = utf_nextu2(&utf_ptr);
1136                 if (c == '/') c = '.';
1137                 buffer[pos++] = c;
1138         }
1139
1140         /* terminate string */
1141         buffer[pos] = '\0';
1142 }
1143
1144
1145 /* utf_strcat_convert_to_latin1 ************************************************
1146         
1147    Like libc strcat, but uses an utf8 string.
1148    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1149    invalid results.
1150
1151 *******************************************************************************/
1152
1153 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1154 {
1155         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1156 }
1157
1158
1159 /* utf_strcat_convert_to_latin1_classname **************************************
1160         
1161    Like libc strcat, but uses an utf8 string.
1162    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1163    invalid results.
1164
1165 *******************************************************************************/
1166
1167 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1168 {
1169         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1170 }
1171
1172
1173 /* utf_fprint_printable_ascii **************************************************
1174         
1175    Write utf symbol into file.
1176    Non-printable and non-ASCII characters are printed as '?'.
1177
1178 *******************************************************************************/
1179
1180 void utf_fprint_printable_ascii(FILE *file, utf *u)
1181 {
1182         char *endpos;                       /* points behind utf string           */
1183         char *utf_ptr;                      /* current position in utf text       */
1184
1185         if (!u)
1186                 return;
1187
1188         endpos = UTF_END(u);
1189         utf_ptr = u->text;
1190
1191         while (utf_ptr < endpos) { 
1192                 /* read next unicode character */                
1193                 u2 c = utf_nextu2(&utf_ptr);                            
1194
1195                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1196                 else fprintf(file, "?");
1197         }
1198 }
1199
1200
1201 /* utf_fprint_printable_ascii_classname ****************************************
1202         
1203    Write utf symbol into file with `/' converted to `.'.
1204    Non-printable and non-ASCII characters are printed as '?'.
1205
1206 *******************************************************************************/
1207
1208 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1209 {
1210         char *endpos;                       /* points behind utf string           */
1211         char *utf_ptr;                      /* current position in utf text       */
1212
1213     if (!u)
1214                 return;
1215
1216         endpos = UTF_END(u);
1217         utf_ptr = u->text;
1218
1219         while (utf_ptr < endpos) { 
1220                 /* read next unicode character */                
1221                 u2 c = utf_nextu2(&utf_ptr);                            
1222                 if (c == '/') c = '.';
1223
1224                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1225                 else fprintf(file, "?");
1226         }
1227 }
1228
1229
/* is_valid_utf ****************************************************************

   Check whether a byte range holds a string in the UTF-8 variant accepted
   here.

   utf_ptr...points to first character
   end_pos...points after last character

   Returns false if
      - end_pos lies before utf_ptr,
      - a 0x00 byte occurs,
      - a lead byte is neither ASCII nor the start of a 2- or 3-byte
        sequence (longer sequences are rejected: Java limitation),
      - a sequence is cut off by end_pos,
      - a trailing byte does not have the form 10xx xxxx,
      - the value zero is encoded with three bytes (only the 2-byte
        encoding of zero is accepted: Java special).
   Returns true otherwise; an empty range is valid.

   Deliberately _not_ rejected, since Sun Java seems to allow them in
   classfiles: overlong encodings of non-zero values, surrogates
   (0xd800..0xdfff) and the codepoints 0xfffe and 0xffff.

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	const unsigned char *p;             /* read position                      */
	int                  remaining;     /* bytes not yet consumed             */

	if (end_pos < utf_ptr)
		return false;

	p         = (const unsigned char *) utf_ptr;
	remaining = end_pos - utf_ptr;

	while (remaining > 0) {
		unsigned char lead = *p++;
		unsigned long v;                /* decoded value                      */
		int           ntrail;           /* number of trailing bytes expected  */
		int           k;

		remaining--;

		if (lead == 0)
			return false;               /* 0x00 is not allowed                */

		if (lead < 0x80)
			continue;                   /* ASCII                              */

		if ((lead & 0xe0) == 0xc0)
			ntrail = 1;                 /* 110x xxxx                          */
		else if ((lead & 0xf0) == 0xe0)
			ntrail = 2;                 /* 1110 xxxx                          */
		else
			return false;               /* stray trailing byte, invalid lead  */
			                            /* byte, or a sequence longer than    */
			                            /* three bytes (Java limitation)      */

		if (remaining < ntrail)
			return false;               /* missing bytes                      */

		remaining -= ntrail;

		/* payload bits of the lead byte: 5 for 2-byte, 4 for 3-byte forms */
		v = lead & (0x3f >> ntrail);

		for (k = 0; k < ntrail; k++) {
			unsigned char cont = *p++;

			if ((cont & 0xc0) != 0x80)  /* 10xx xxxx                          */
				return false;

			v = (v << 6) | (cont & 0x3f);
		}

		/* zero may only appear in its 2-byte form (Java special) */
		if ((v == 0) && (ntrail != 1))
			return false;
	}

	return true;
}
1295
1296
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings and control
   characters.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

   Returns false for an empty (or reversed) range, for any byte below 0x20
   and for the 2-byte encoding of zero (0xc0 0x80); true otherwise.

   The look-ahead for the second byte of an encoded zero is bounded by
   end_pos, so no byte outside the given range is ever read.

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names               */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters        */

		/* disallow zero (0xc0 0x80); only peek if a next byte exists */
		if ((c == 0xc0) && (utf_ptr < end_pos)
			&& ((unsigned char) *utf_ptr == 0x80))
			return false;
	}

	return true;
}
1324
1325 bool is_valid_name_utf(utf *u)
1326 {
1327         return is_valid_name(u->text, UTF_END(u));
1328 }
1329
1330
1331 /* utf_show ********************************************************************
1332
1333    Writes the utf symbols in the utfhash to stdout and displays the
1334    number of external hash chains grouped according to the chainlength
1335    (for debugging purposes).
1336
1337 *******************************************************************************/
1338
1339 #if !defined(NDEBUG)
1340 void utf_show(void)
1341 {
1342
1343 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1344
1345         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1346         u4 max_chainlength = 0;      /* maximum length of the chains */
1347         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1348         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1349         u4 i;
1350
1351         printf("UTF-HASH:\n");
1352
1353         /* show element of utf-hashtable */
1354
1355         for (i = 0; i < hashtable_utf->size; i++) {
1356                 utf *u = hashtable_utf->ptr[i];
1357
1358                 if (u) {
1359                         printf("SLOT %d: ", (int) i);
1360
1361                         while (u) {
1362                                 printf("'");
1363                                 utf_display_printable_ascii(u);
1364                                 printf("' ");
1365                                 u = u->hashlink;
1366                         }       
1367                         printf("\n");
1368                 }
1369         }
1370
1371         printf("UTF-HASH: %d slots for %d entries\n", 
1372                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1373
1374         if (hashtable_utf->entries == 0)
1375                 return;
1376
1377         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1378
1379         for (i=0;i<CHAIN_LIMIT;i++)
1380                 chain_count[i]=0;
1381
1382         /* count numbers of hashchains according to their length */
1383         for (i=0; i<hashtable_utf->size; i++) {
1384                   
1385                 utf *u = (utf*) hashtable_utf->ptr[i];
1386                 u4 chain_length = 0;
1387
1388                 /* determine chainlength */
1389                 while (u) {
1390                         u = u->hashlink;
1391                         chain_length++;
1392                 }
1393
1394                 /* update sum of all chainlengths */
1395                 sum_chainlength+=chain_length;
1396
1397                 /* determine the maximum length of the chains */
1398                 if (chain_length>max_chainlength)
1399                         max_chainlength = chain_length;
1400
1401                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1402                 if (chain_length>=CHAIN_LIMIT) {
1403                         beyond_limit+=chain_length;
1404                         chain_length=CHAIN_LIMIT-1;
1405                 }
1406
1407                 /* update number of hashchains of current length */
1408                 chain_count[chain_length]++;
1409         }
1410
1411         /* display results */  
1412         for (i=1;i<CHAIN_LIMIT-1;i++) 
1413                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1414           
1415         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1416
1417
1418         printf("max. chainlength:%5d\n",max_chainlength);
1419
1420         /* avg. chainlength = sum of chainlengths / number of chains */
1421         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1422 }
1423 #endif /* !defined(NDEBUG) */
1424
1425
1426 /*
1427  * These are local overrides for various environment variables in Emacs.
1428  * Please do not remove this and leave it at the end of the file, where
1429  * Emacs will automagically detect them.
1430  * ---------------------------------------------------------------------
1431  * Local variables:
1432  * mode: c
1433  * indent-tabs-mode: t
1434  * c-basic-offset: 4
1435  * tab-width: 4
1436  * End:
1437  * vim:noexpandtab:sw=4:ts=4:
1438  */