24dabea1ebfdad86861b50d304c3d66f504d447f
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf8.c - utf8 string functions
2
3    Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6    J. Wenninger, Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23    02110-1301, USA.
24
25    Contact: cacao@cacaojvm.org
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32                         Edwin Steiner
33
34    $Id: utf8.c 4921 2006-05-15 14:24:36Z twisti $
35
36 */
37
38
39 #include "config.h"
40
41 #include <string.h>
42 #include <assert.h>
43
44 #include "vm/types.h"
45
46 #include "mm/memory.h"
47
48 #if defined(ENABLE_THREADS)
49 # include "threads/native/threads.h"
50 #endif
51
52 #include "vm/builtin.h"
53 #include "vm/exceptions.h"
54 #include "vm/hashtable.h"
55 #include "vm/options.h"
56 #include "vm/statistics.h"
57 #include "vm/stringlocal.h"
58 #include "vm/utf8.h"
59
60
61 /* global variables ***********************************************************/
62
63 /* hashsize must be power of 2 */
64
65 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
66
67 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
68
69
70 /* utf-symbols for pointer comparison of frequently used strings **************/
71
72 utf *utf_java_lang_Object;
73
74 utf *utf_java_lang_Class;
75 utf *utf_java_lang_ClassLoader;
76 utf *utf_java_lang_Cloneable;
77 utf *utf_java_lang_SecurityManager;
78 utf *utf_java_lang_String;
79 utf *utf_java_lang_System;
80 utf *utf_java_lang_ThreadGroup;
81 utf *utf_java_io_Serializable;
82
83 utf *utf_java_lang_Throwable;
84 utf *utf_java_lang_VMThrowable;
85 utf *utf_java_lang_Error;
86 utf *utf_java_lang_NoClassDefFoundError;
87 utf *utf_java_lang_LinkageError;
88 utf *utf_java_lang_NoSuchMethodError;
89 utf *utf_java_lang_OutOfMemoryError;
90
91 utf *utf_java_lang_Exception;
92 utf *utf_java_lang_ClassNotFoundException;
93 utf *utf_java_lang_IllegalArgumentException;
94 utf *utf_java_lang_IllegalMonitorStateException;
95
96 utf *utf_java_lang_NullPointerException;
97
98 utf* utf_java_lang_Void;
99 utf* utf_java_lang_Boolean;
100 utf* utf_java_lang_Byte;
101 utf* utf_java_lang_Character;
102 utf* utf_java_lang_Short;
103 utf* utf_java_lang_Integer;
104 utf* utf_java_lang_Long;
105 utf* utf_java_lang_Float;
106 utf* utf_java_lang_Double;
107
108 utf *utf_java_lang_StackTraceElement;
109 utf *utf_java_lang_reflect_Constructor;
110 utf *utf_java_lang_reflect_Field;
111 utf *utf_java_lang_reflect_Method;
112 utf *utf_java_util_Vector;
113
114 utf *utf_InnerClasses;                  /* InnerClasses                       */
115 utf *utf_ConstantValue;                 /* ConstantValue                      */
116 utf *utf_Code;                          /* Code                               */
117 utf *utf_Exceptions;                    /* Exceptions                         */
118 utf *utf_LineNumberTable;               /* LineNumberTable                    */
119 utf *utf_SourceFile;                    /* SourceFile                         */
120
121 utf *utf_init;                          /* <init>                             */
122 utf *utf_clinit;                        /* <clinit>                           */
123 utf *utf_clone;                         /* clone                              */
124 utf *utf_finalize;                      /* finalize                           */
125 utf *utf_run;                           /* run                                */
126
127 utf *utf_add;                           /* add                                */
128 utf *utf_remove;                        /* remove                             */
129 utf *utf_put;                           /* put                                */
130 utf *utf_get;                           /* get                                */
131 utf *utf_value;                         /* value                              */
132
133 utf *utf_fillInStackTrace;
134 utf *utf_getSystemClassLoader;
135 utf *utf_loadClass;
136 utf *utf_printStackTrace;
137
138 utf *utf_Z;                             /* Z                                  */
139 utf *utf_B;                             /* B                                  */
140 utf *utf_C;                             /* C                                  */
141 utf *utf_S;                             /* S                                  */
142 utf *utf_I;                             /* I                                  */
143 utf *utf_J;                             /* J                                  */
144 utf *utf_F;                             /* F                                  */
145 utf *utf_D;                             /* D                                  */
146
147 utf *utf_void__void;                    /* ()V                                */
148 utf *utf_boolean__void;                 /* (Z)V                               */
149 utf *utf_byte__void;                    /* (B)V                               */
150 utf *utf_char__void;                    /* (C)V                               */
151 utf *utf_short__void;                   /* (S)V                               */
152 utf *utf_int__void;                     /* (I)V                               */
153 utf *utf_long__void;                    /* (J)V                               */
154 utf *utf_float__void;                   /* (F)V                               */
155 utf *utf_double__void;                  /* (D)V                               */
156
157 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
158 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
159 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
160 utf *utf_java_lang_Object__java_lang_Object;
161 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
162 utf *utf_java_lang_String__java_lang_Class;
163 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
164
165 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
166
167 utf *array_packagename;
168
169
170 /* utf_init ********************************************************************
171
172    Initializes the utf8 subsystem.
173
174 *******************************************************************************/
175
176 bool utf8_init(void)
177 {
178         /* create utf8 hashtable */
179
180         hashtable_utf = NEW(hashtable);
181
182         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
183
184 #if defined(ENABLE_STATISTICS)
185         if (opt_stat)
186                 count_utf_len += sizeof(utf*) * hashtable_utf.size;
187 #endif
188
189         /* create utf-symbols for pointer comparison of frequently used strings */
190
191         utf_java_lang_Object           = utf_new_char("java/lang/Object");
192
193         utf_java_lang_Class            = utf_new_char("java/lang/Class");
194         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
195         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
196         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
197         utf_java_lang_String           = utf_new_char("java/lang/String");
198         utf_java_lang_System           = utf_new_char("java/lang/System");
199         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
200         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
201
202         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
203         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
204         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
205
206         utf_java_lang_NoClassDefFoundError =
207                 utf_new_char(string_java_lang_NoClassDefFoundError);
208
209         utf_java_lang_LinkageError =
210                 utf_new_char(string_java_lang_LinkageError);
211
212         utf_java_lang_NoSuchMethodError =
213                 utf_new_char(string_java_lang_NoSuchMethodError);
214
215         utf_java_lang_OutOfMemoryError =
216                 utf_new_char(string_java_lang_OutOfMemoryError);
217
218         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
219
220         utf_java_lang_ClassNotFoundException =
221                 utf_new_char(string_java_lang_ClassNotFoundException);
222
223         utf_java_lang_IllegalArgumentException =
224                 utf_new_char(string_java_lang_IllegalArgumentException);
225
226         utf_java_lang_IllegalMonitorStateException =
227                 utf_new_char(string_java_lang_IllegalMonitorStateException);
228
229         utf_java_lang_NullPointerException =
230                 utf_new_char(string_java_lang_NullPointerException);
231
232         utf_java_lang_Void             = utf_new_char("java/lang/Void");
233         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
234         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
235         utf_java_lang_Character        = utf_new_char("java/lang/Character");
236         utf_java_lang_Short            = utf_new_char("java/lang/Short");
237         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
238         utf_java_lang_Long             = utf_new_char("java/lang/Long");
239         utf_java_lang_Float            = utf_new_char("java/lang/Float");
240         utf_java_lang_Double           = utf_new_char("java/lang/Double");
241
242         utf_java_lang_StackTraceElement =
243                 utf_new_char("java/lang/StackTraceElement");
244
245         utf_java_lang_reflect_Constructor =
246                 utf_new_char("java/lang/reflect/Constructor");
247
248         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
249         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
250         utf_java_util_Vector           = utf_new_char("java/util/Vector");
251
252         utf_InnerClasses               = utf_new_char("InnerClasses");
253         utf_ConstantValue              = utf_new_char("ConstantValue");
254         utf_Code                       = utf_new_char("Code");
255         utf_Exceptions                 = utf_new_char("Exceptions");
256         utf_LineNumberTable            = utf_new_char("LineNumberTable");
257         utf_SourceFile                 = utf_new_char("SourceFile");
258
259         utf_init                           = utf_new_char("<init>");
260         utf_clinit                         = utf_new_char("<clinit>");
261         utf_clone                      = utf_new_char("clone");
262         utf_finalize                   = utf_new_char("finalize");
263         utf_run                        = utf_new_char("run");
264
265         utf_add                        = utf_new_char("add");
266         utf_remove                     = utf_new_char("remove");
267         utf_put                        = utf_new_char("put");
268         utf_get                        = utf_new_char("get");
269         utf_value                      = utf_new_char("value");
270
271         utf_printStackTrace            = utf_new_char("printStackTrace");
272         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
273         utf_loadClass                  = utf_new_char("loadClass");
274         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
275
276         utf_Z                          = utf_new_char("Z");
277         utf_B                          = utf_new_char("B");
278         utf_C                          = utf_new_char("C");
279         utf_S                          = utf_new_char("S");
280         utf_I                          = utf_new_char("I");
281         utf_J                          = utf_new_char("J");
282         utf_F                          = utf_new_char("F");
283         utf_D                          = utf_new_char("D");
284
285         utf_void__void                 = utf_new_char("()V");
286         utf_boolean__void              = utf_new_char("(Z)V");
287         utf_byte__void                 = utf_new_char("(B)V");
288         utf_char__void                 = utf_new_char("(C)V");
289         utf_short__void                = utf_new_char("(S)V");
290         utf_int__void                  = utf_new_char("(I)V");
291         utf_long__void                 = utf_new_char("(J)V");
292         utf_float__void                = utf_new_char("(F)V");
293         utf_double__void               = utf_new_char("(D)V");
294         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
295         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
296
297         utf_void__java_lang_ClassLoader =
298                 utf_new_char("()Ljava/lang/ClassLoader;");
299
300         utf_java_lang_Object__java_lang_Object =
301                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
302
303         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
304
305         utf_java_lang_String__java_lang_Class =
306                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
307
308         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
309
310         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
311
312         array_packagename              = utf_new_char("\t<the array package>");
313
314         /* everything's ok */
315
316         return true;
317 }
318
319
320 /* utf_hashkey *****************************************************************
321
322    The hashkey is computed from the utf-text by using up to 8
323    characters.  For utf-symbols longer than 15 characters 3 characters
324    are taken from the beginning and the end, 2 characters are taken
325    from the middle.
326
327 *******************************************************************************/
328
329 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
330 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
331
332 u4 utf_hashkey(const char *text, u4 length)
333 {
334         const char *start_pos = text;       /* pointer to utf text                */
335         u4 a;
336
337         switch (length) {
338         case 0: /* empty string */
339                 return 0;
340
341         case 1: return fbs(0);
342         case 2: return fbs(0) ^ nbs(3);
343         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
344         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
345         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
346         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
347         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
348         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
349
350         case 9:
351                 a = fbs(0);
352                 a ^= nbs(1);
353                 a ^= nbs(2);
354                 text++;
355                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
356
357         case 10:
358                 a = fbs(0);
359                 text++;
360                 a ^= nbs(2);
361                 a ^= nbs(3);
362                 a ^= nbs(4);
363                 text++;
364                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
365
366         case 11:
367                 a = fbs(0);
368                 text++;
369                 a ^= nbs(2);
370                 a ^= nbs(3);
371                 a ^= nbs(4);
372                 text++;
373                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
374
375         case 12:
376                 a = fbs(0);
377                 text += 2;
378                 a ^= nbs(2);
379                 a ^= nbs(3);
380                 text++;
381                 a ^= nbs(5);
382                 a ^= nbs(6);
383                 a ^= nbs(7);
384                 text++;
385                 return a ^ nbs(9) ^ nbs(10);
386
387         case 13:
388                 a = fbs(0);
389                 a ^= nbs(1);
390                 text++;
391                 a ^= nbs(3);
392                 a ^= nbs(4);
393                 text += 2;      
394                 a ^= nbs(7);
395                 a ^= nbs(8);
396                 text += 2;
397                 return a ^ nbs(9) ^ nbs(10);
398
399         case 14:
400                 a = fbs(0);
401                 text += 2;      
402                 a ^= nbs(3);
403                 a ^= nbs(4);
404                 text += 2;      
405                 a ^= nbs(7);
406                 a ^= nbs(8);
407                 text += 2;
408                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
409
410         case 15:
411                 a = fbs(0);
412                 text += 2;      
413                 a ^= nbs(3);
414                 a ^= nbs(4);
415                 text += 2;      
416                 a ^= nbs(7);
417                 a ^= nbs(8);
418                 text += 2;
419                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
420
421         default:  /* 3 characters from beginning */
422                 a = fbs(0);
423                 text += 2;
424                 a ^= nbs(3);
425                 a ^= nbs(4);
426
427                 /* 2 characters from middle */
428                 text = start_pos + (length / 2);
429                 a ^= fbs(5);
430                 text += 2;
431                 a ^= nbs(6);    
432
433                 /* 3 characters from end */
434                 text = start_pos + length - 4;
435
436                 a ^= fbs(7);
437                 text++;
438
439                 return a ^ nbs(10) ^ nbs(11);
440     }
441 }
442
443 /* utf_full_hashkey ************************************************************
444
445    This function computes a hash value using all bytes in the string.
446
447    The algorithm is the "One-at-a-time" algorithm as published
448    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
449
450 *******************************************************************************/
451
452 u4 utf_full_hashkey(const char *text, u4 length)
453 {
454         register const unsigned char *p = (const unsigned char *) text;
455         register u4 hash;
456         register u4 i;
457
458         hash = 0;
459         for (i=length; i--;)
460         {
461             hash += *p++;
462             hash += (hash << 10);
463             hash ^= (hash >> 6);
464         }
465         hash += (hash << 3);
466         hash ^= (hash >> 11);
467         hash += (hash << 15);
468
469         return hash;
470 }
471
472 /* unicode_hashkey *************************************************************
473
474    Compute the hashkey of a unicode string.
475
476 *******************************************************************************/
477
478 u4 unicode_hashkey(u2 *text, u2 len)
479 {
480         return utf_hashkey((char *) text, len);
481 }
482
483
484 /* utf_new *********************************************************************
485
486    Creates a new utf-symbol, the text of the symbol is passed as a
487    u1-array. The function searches the utf-hashtable for a utf-symbol
488    with this text. On success the element returned, otherwise a new
489    hashtable element is created.
490
491    If the number of entries in the hashtable exceeds twice the size of
492    the hashtable slots a reorganization of the hashtable is done and
493    the utf symbols are copied to a new hashtable with doubled size.
494
495 *******************************************************************************/
496
497 utf *utf_new(const char *text, u2 length)
498 {
499         u4 key;                             /* hashkey computed from utf-text     */
500         u4 slot;                            /* slot in hashtable                  */
501         utf *u;                             /* hashtable element                  */
502         u2 i;
503
504 #if defined(ENABLE_THREADS)
505         builtin_monitorenter(hashtable_utf->header);
506 #endif
507
508 #if defined(ENABLE_STATISTICS)
509         if (opt_stat)
510                 count_utf_new++;
511 #endif
512
513         key  = utf_hashkey(text, length);
514         slot = key & (hashtable_utf->size - 1);
515         u    = hashtable_utf->ptr[slot];
516
517         /* search external hash chain for utf-symbol */
518
519         while (u) {
520                 if (u->blength == length) {
521                         /* compare text of hashtable elements */
522
523                         for (i = 0; i < length; i++)
524                                 if (text[i] != u->text[i])
525                                         goto nomatch;
526                         
527 #if defined(ENABLE_STATISTICS)
528                         if (opt_stat)
529                                 count_utf_new_found++;
530 #endif
531
532                         /* symbol found in hashtable */
533
534 #if defined(ENABLE_THREADS)
535                         builtin_monitorexit(hashtable_utf->header);
536 #endif
537
538                         return u;
539                 }
540
541         nomatch:
542                 u = u->hashlink; /* next element in external chain */
543         }
544
545 #if defined(ENABLE_STATISTICS)
546         if (opt_stat)
547                 count_utf_len += sizeof(utf) + length + 1;
548 #endif
549
550         /* location in hashtable found, create new utf element */
551         u = NEW(utf);
552         u->blength  = length;               /* length in bytes of utfstring       */
553         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
554         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
555
556         memcpy(u->text, text, length);      /* copy utf-text                      */
557         u->text[length] = '\0';
558
559         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
560         hashtable_utf->entries++;           /* update number of entries           */
561
562         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
563
564         /* reorganization of hashtable, average length of the external
565            chains is approx. 2 */
566
567                 hashtable *newhash;                              /* the new hashtable */
568                 u4         i;
569                 utf       *u;
570                 utf       *nextu;
571                 u4         slot;
572
573                 /* create new hashtable, double the size */
574
575                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
576
577 #if defined(ENABLE_STATISTICS)
578                 if (opt_stat)
579                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
580 #endif
581
582                 /* transfer elements to new hashtable */
583
584                 for (i = 0; i < hashtable_utf->size; i++) {
585                         u = hashtable_utf->ptr[i];
586
587                         while (u) {
588                                 nextu = u->hashlink;
589                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
590                                                 
591                                 u->hashlink = (utf *) newhash->ptr[slot];
592                                 newhash->ptr[slot] = u;
593
594                                 /* follow link in external hash chain */
595
596                                 u = nextu;
597                         }
598                 }
599         
600                 /* dispose old table */
601
602                 hashtable_free(hashtable_utf);
603
604                 hashtable_utf = newhash;
605         }
606
607 #if defined(ENABLE_THREADS)
608         builtin_monitorexit(hashtable_utf->header);
609 #endif
610
611         return u;
612 }
613
614
615 /* utf_new_u2 ******************************************************************
616
617    Make utf symbol from u2 array, if isclassname is true '.' is
618    replaced by '/'.
619
620 *******************************************************************************/
621
622 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
623 {
624         char *buffer;                   /* memory buffer for  unicode characters  */
625         char *pos;                      /* pointer to current position in buffer  */
626         u4 left;                        /* unicode characters left                */
627         u4 buflength;                   /* utf length in bytes of the u2 array    */
628         utf *result;                    /* resulting utf-string                   */
629         int i;          
630
631         /* determine utf length in bytes and allocate memory */
632
633         buflength = u2_utflength(unicode_pos, unicode_length); 
634         buffer    = MNEW(char, buflength);
635  
636         left = buflength;
637         pos  = buffer;
638
639         for (i = 0; i++ < unicode_length; unicode_pos++) {
640                 /* next unicode character */
641                 u2 c = *unicode_pos;
642                 
643                 if ((c != 0) && (c < 0x80)) {
644                         /* 1 character */       
645                         left--;
646                 if ((int) left < 0) break;
647                         /* convert classname */
648                         if (isclassname && c == '.')
649                                 *pos++ = '/';
650                         else
651                                 *pos++ = (char) c;
652
653                 } else if (c < 0x800) {             
654                         /* 2 characters */                              
655                 unsigned char high = c >> 6;
656                 unsigned char low  = c & 0x3F;
657                         left = left - 2;
658                 if ((int) left < 0) break;
659                 *pos++ = high | 0xC0; 
660                 *pos++ = low  | 0x80;     
661
662                 } else {         
663                 /* 3 characters */                              
664                 char low  = c & 0x3f;
665                 char mid  = (c >> 6) & 0x3F;
666                 char high = c >> 12;
667                         left = left - 3;
668                 if ((int) left < 0) break;
669                 *pos++ = high | 0xE0; 
670                 *pos++ = mid  | 0x80;  
671                 *pos++ = low  | 0x80;   
672                 }
673         }
674         
675         /* insert utf-string into symbol-table */
676         result = utf_new(buffer,buflength);
677
678         MFREE(buffer, char, buflength);
679
680         return result;
681 }
682
683
684 /* utf_new_char ****************************************************************
685
686    Creates a new utf symbol, the text for this symbol is passed as a
687    c-string ( = char* ).
688
689 *******************************************************************************/
690
691 utf *utf_new_char(const char *text)
692 {
693         return utf_new(text, strlen(text));
694 }
695
696
697 /* utf_new_char_classname ******************************************************
698
699    Creates a new utf symbol, the text for this symbol is passed as a
700    c-string ( = char* ) "." characters are going to be replaced by
701    "/". Since the above function is used often, this is a separte
702    function, instead of an if.
703
704 *******************************************************************************/
705
706 utf *utf_new_char_classname(const char *text)
707 {
708         if (strchr(text, '.')) {
709                 char *txt = strdup(text);
710                 char *end = txt + strlen(txt);
711                 char *c;
712                 utf *tmpRes;
713
714                 for (c = txt; c < end; c++)
715                         if (*c == '.') *c = '/';
716
717                 tmpRes = utf_new(txt, strlen(txt));
718                 FREE(txt, 0);
719
720                 return tmpRes;
721
722         } else
723                 return utf_new(text, strlen(text));
724 }
725
726
727 /* utf_nextu2 ******************************************************************
728
729    Read the next unicode character from the utf string and increment
730    the utf-string pointer accordingly.
731
732 *******************************************************************************/
733
734 u2 utf_nextu2(char **utf_ptr)
735 {
736     /* uncompressed unicode character */
737     u2 unicode_char = 0;
738     /* current position in utf text */  
739     unsigned char *utf = (unsigned char *) (*utf_ptr);
740     /* bytes representing the unicode character */
741     unsigned char ch1, ch2, ch3;
742     /* number of bytes used to represent the unicode character */
743     int len = 0;
744         
745     switch ((ch1 = utf[0]) >> 4) {
746         default: /* 1 byte */
747                 (*utf_ptr)++;
748                 return (u2) ch1;
749         case 0xC: 
750         case 0xD: /* 2 bytes */
751                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
752                         unsigned char high = ch1 & 0x1F;
753                         unsigned char low  = ch2 & 0x3F;
754                         unicode_char = (high << 6) + low;
755                         len = 2;
756                 }
757                 break;
758
759         case 0xE: /* 2 or 3 bytes */
760                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
761                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
762                                 unsigned char low  = ch3 & 0x3f;
763                                 unsigned char mid  = ch2 & 0x3f;
764                                 unsigned char high = ch1 & 0x0f;
765                                 unicode_char = (((high << 6) + mid) << 6) + low;
766                                 len = 3;
767                         } else
768                                 len = 2;                                           
769                 }
770                 break;
771     }
772
773     /* update position in utf-text */
774     *utf_ptr = (char *) (utf + len);
775
776     return unicode_char;
777 }
778
779
780 /* utf_bytes *******************************************************************
781
782    Determine number of bytes (aka. octets) in the utf string.
783
784    IN:
785       u............utf string
786
787    OUT:
788       The number of octets of this utf string.
789           There is _no_ terminating zero included in this count.
790
791 *******************************************************************************/
792
793 u4 utf_bytes(utf *u)
794 {
795         return u->blength;
796 }
797
798 /* utf_get_number_of_u2s_for_buffer ********************************************
799
800    Determine number of UTF-16 u2s in the given UTF-8 buffer
801
802    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
803    to an array of u2s (UTF-16) and want to know how many of them you will get.
804    All other uses of this function are probably wrong.
805
806    IN:
807       buffer........points to first char in buffer
808           blength.......number of _bytes_ in the buffer
809
810    OUT:
811       the number of u2s needed to hold this string in UTF-16 encoding.
812           There is _no_ terminating zero included in this count.
813
814    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
815    exception.
816
817 *******************************************************************************/
818
819 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
820 {
821         const char *endpos;                 /* points behind utf string           */
822         const char *utf_ptr;                /* current position in utf text       */
823         u4 len = 0;                         /* number of unicode characters       */
824
825         utf_ptr = buffer;
826         endpos = utf_ptr + blength;
827
828         while (utf_ptr < endpos) {
829                 len++;
830                 /* next unicode character */
831                 utf_nextu2((char **)&utf_ptr);
832         }
833
834         assert(utf_ptr == endpos);
835
836         return len;
837 }
838
839
840 /* utf_get_number_of_u2s *******************************************************
841
842    Determine number of UTF-16 u2s in the utf string.
843
844    CAUTION: Use this function *only* when you want to convert a utf string
845    to an array of u2s and want to know how many of them you will get.
846    All other uses of this function are probably wrong.
847
848    IN:
849       u............utf string
850
851    OUT:
852       the number of u2s needed to hold this string in UTF-16 encoding.
853           There is _no_ terminating zero included in this count.
854           XXX 0 if a NullPointerException has been thrown (see below)
855
856 *******************************************************************************/
857
858 u4 utf_get_number_of_u2s(utf *u)
859 {
860         char *endpos;                       /* points behind utf string           */
861         char *utf_ptr;                      /* current position in utf text       */
862         u4 len = 0;                         /* number of unicode characters       */
863
864         /* XXX this is probably not checked by most callers! Review this after */
865         /* the invalid uses of this function have been eliminated */
866         if (!u) {
867                 exceptions_throw_nullpointerexception();
868                 return 0;
869         }
870
871         endpos = UTF_END(u);
872         utf_ptr = u->text;
873
874         while (utf_ptr < endpos) {
875                 len++;
876                 /* next unicode character */
877                 utf_nextu2(&utf_ptr);
878         }
879
880         if (utf_ptr != endpos)
881                 /* string ended abruptly */
882                 throw_cacao_exception_exit(string_java_lang_InternalError,
883                                                                    "Illegal utf8 string");
884
885         return len;
886 }
887
888
889 /* u2_utflength ****************************************************************
890
891    Returns the utf length in bytes of a u2 array.
892
893 *******************************************************************************/
894
895 u4 u2_utflength(u2 *text, u4 u2_length)
896 {
897         u4 result_len = 0;                  /* utf length in bytes                */
898         u2 ch;                              /* current unicode character          */
899         u4 len;
900         
901         for (len = 0; len < u2_length; len++) {
902                 /* next unicode character */
903                 ch = *text++;
904           
905                 /* determine bytes required to store unicode character as utf */
906                 if (ch && (ch < 0x80)) 
907                         result_len++;
908                 else if (ch < 0x800)
909                         result_len += 2;        
910                 else 
911                         result_len += 3;        
912         }
913
914     return result_len;
915 }
916
917
918 /* utf_copy ********************************************************************
919
920    Copy the given utf string byte-for-byte to a buffer.
921
922    IN:
923       buffer.......the buffer
924           u............the utf string
925
926 *******************************************************************************/
927
928 void utf_copy(char *buffer, utf *u)
929 {
930         /* our utf strings are zero-terminated (done by utf_new) */
931         MCOPY(buffer, u->text, char, u->blength + 1);
932 }
933
934
935 /* utf_cat *********************************************************************
936
937    Append the given utf string byte-for-byte to a buffer.
938
939    IN:
940       buffer.......the buffer
941           u............the utf string
942
943 *******************************************************************************/
944
945 void utf_cat(char *buffer, utf *u)
946 {
947         /* our utf strings are zero-terminated (done by utf_new) */
948         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
949 }
950
951
952 /* utf_copy_classname **********************************************************
953
954    Copy the given utf classname byte-for-byte to a buffer.
955    '/' is replaced by '.'
956
957    IN:
958       buffer.......the buffer
959           u............the utf string
960
961 *******************************************************************************/
962
963 void utf_copy_classname(char *buffer, utf *u)
964 {
965         char *bufptr;
966         char *srcptr;
967         char *endptr;
968         char ch;
969
970         bufptr = buffer;
971         srcptr = u->text;
972         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
973
974         while (srcptr != endptr) {
975                 ch = *srcptr++;
976                 if (ch == '/')
977                         ch = '.';
978                 *bufptr++ = ch;
979         }
980 }
981
982
983 /* utf_cat *********************************************************************
984
985    Append the given utf classname byte-for-byte to a buffer.
986    '/' is replaced by '.'
987
988    IN:
989       buffer.......the buffer
990           u............the utf string
991
992 *******************************************************************************/
993
994 void utf_cat_classname(char *buffer, utf *u)
995 {
996         utf_copy_classname(buffer + strlen(buffer), u);
997 }
998
999 /* utf_display_printable_ascii *************************************************
1000
1001    Write utf symbol to stdout (for debugging purposes).
1002    Non-printable and non-ASCII characters are printed as '?'.
1003
1004 *******************************************************************************/
1005
1006 void utf_display_printable_ascii(utf *u)
1007 {
1008         char *endpos;                       /* points behind utf string           */
1009         char *utf_ptr;                      /* current position in utf text       */
1010
1011         if (u == NULL) {
1012                 printf("NULL");
1013                 fflush(stdout);
1014                 return;
1015         }
1016
1017         endpos = UTF_END(u);
1018         utf_ptr = u->text;
1019
1020         while (utf_ptr < endpos) {
1021                 /* read next unicode character */
1022
1023                 u2 c = utf_nextu2(&utf_ptr);
1024
1025                 if ((c >= 32) && (c <= 127))
1026                         printf("%c", c);
1027                 else
1028                         printf("?");
1029         }
1030
1031         fflush(stdout);
1032 }
1033
1034
1035 /* utf_display_printable_ascii_classname ***************************************
1036
1037    Write utf symbol to stdout with `/' converted to `.' (for debugging
1038    purposes).
1039    Non-printable and non-ASCII characters are printed as '?'.
1040
1041 *******************************************************************************/
1042
1043 void utf_display_printable_ascii_classname(utf *u)
1044 {
1045         char *endpos;                       /* points behind utf string           */
1046         char *utf_ptr;                      /* current position in utf text       */
1047
1048         if (u == NULL) {
1049                 printf("NULL");
1050                 fflush(stdout);
1051                 return;
1052         }
1053
1054         endpos = UTF_END(u);
1055         utf_ptr = u->text;
1056
1057         while (utf_ptr < endpos) {
1058                 /* read next unicode character */
1059
1060                 u2 c = utf_nextu2(&utf_ptr);
1061
1062                 if (c == '/')
1063                         c = '.';
1064
1065                 if ((c >= 32) && (c <= 127))
1066                         printf("%c", c);
1067                 else
1068                         printf("?");
1069         }
1070
1071         fflush(stdout);
1072 }
1073
1074
1075 /* utf_sprint_convert_to_latin1 ************************************************
1076         
1077    Write utf symbol into c-string (for debugging purposes).
1078    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1079    invalid results.
1080
1081 *******************************************************************************/
1082
1083 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1084 {
1085         char *endpos;                       /* points behind utf string           */
1086         char *utf_ptr;                      /* current position in utf text       */
1087         u2 pos = 0;                         /* position in c-string               */
1088
1089         if (!u) {
1090                 strcpy(buffer, "NULL");
1091                 return;
1092         }
1093
1094         endpos = UTF_END(u);
1095         utf_ptr = u->text;
1096
1097         while (utf_ptr < endpos) 
1098                 /* copy next unicode character */       
1099                 buffer[pos++] = utf_nextu2(&utf_ptr);
1100
1101         /* terminate string */
1102         buffer[pos] = '\0';
1103 }
1104
1105
1106 /* utf_sprint_convert_to_latin1_classname **************************************
1107         
1108    Write utf symbol into c-string with `/' converted to `.' (for debugging
1109    purposes).
1110    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1111    invalid results.
1112
1113 *******************************************************************************/
1114
1115 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1116 {
1117         char *endpos;                       /* points behind utf string           */
1118         char *utf_ptr;                      /* current position in utf text       */
1119         u2 pos = 0;                         /* position in c-string               */
1120
1121         if (!u) {
1122                 strcpy(buffer, "NULL");
1123                 return;
1124         }
1125
1126         endpos = UTF_END(u);
1127         utf_ptr = u->text;
1128
1129         while (utf_ptr < endpos) {
1130                 /* copy next unicode character */       
1131                 u2 c = utf_nextu2(&utf_ptr);
1132                 if (c == '/') c = '.';
1133                 buffer[pos++] = c;
1134         }
1135
1136         /* terminate string */
1137         buffer[pos] = '\0';
1138 }
1139
1140
1141 /* utf_strcat_convert_to_latin1 ************************************************
1142         
1143    Like libc strcat, but uses an utf8 string.
1144    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1145    invalid results.
1146
1147 *******************************************************************************/
1148
1149 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1150 {
1151         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1152 }
1153
1154
1155 /* utf_strcat_convert_to_latin1_classname **************************************
1156         
1157    Like libc strcat, but uses an utf8 string.
1158    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1159    invalid results.
1160
1161 *******************************************************************************/
1162
1163 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1164 {
1165         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1166 }
1167
1168
1169 /* utf_fprint_printable_ascii **************************************************
1170         
1171    Write utf symbol into file.
1172    Non-printable and non-ASCII characters are printed as '?'.
1173
1174 *******************************************************************************/
1175
1176 void utf_fprint_printable_ascii(FILE *file, utf *u)
1177 {
1178         char *endpos;                       /* points behind utf string           */
1179         char *utf_ptr;                      /* current position in utf text       */
1180
1181         if (!u)
1182                 return;
1183
1184         endpos = UTF_END(u);
1185         utf_ptr = u->text;
1186
1187         while (utf_ptr < endpos) { 
1188                 /* read next unicode character */                
1189                 u2 c = utf_nextu2(&utf_ptr);                            
1190
1191                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1192                 else fprintf(file, "?");
1193         }
1194 }
1195
1196
1197 /* utf_fprint_printable_ascii_classname ****************************************
1198         
1199    Write utf symbol into file with `/' converted to `.'.
1200    Non-printable and non-ASCII characters are printed as '?'.
1201
1202 *******************************************************************************/
1203
1204 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1205 {
1206         char *endpos;                       /* points behind utf string           */
1207         char *utf_ptr;                      /* current position in utf text       */
1208
1209     if (!u)
1210                 return;
1211
1212         endpos = UTF_END(u);
1213         utf_ptr = u->text;
1214
1215         while (utf_ptr < endpos) { 
1216                 /* read next unicode character */                
1217                 u2 c = utf_nextu2(&utf_ptr);                            
1218                 if (c == '/') c = '.';
1219
1220                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1221                 else fprintf(file, "?");
1222         }
1223 }
1224
1225
1226 /* is_valid_utf ****************************************************************
1227
1228    Return true if the given string is a valid UTF-8 string.
1229
1230    utf_ptr...points to first character
1231    end_pos...points after last character
1232
1233 *******************************************************************************/
1234
1235 /*  static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */
1236
1237 bool is_valid_utf(char *utf_ptr, char *end_pos)
1238 {
1239         int bytes;
1240         int len,i;
1241         char c;
1242         unsigned long v;
1243
1244         if (end_pos < utf_ptr) return false;
1245         bytes = end_pos - utf_ptr;
1246         while (bytes--) {
1247                 c = *utf_ptr++;
1248
1249                 if (!c) return false;                     /* 0x00 is not allowed */
1250                 if ((c & 0x80) == 0) continue;            /* ASCII */
1251
1252                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
1253                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
1254                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
1255                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
1256                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
1257                 else return false;                        /* invalid leading byte */
1258
1259                 if (len > 2) return false;                /* Java limitation */
1260
1261                 v = (unsigned long)c & (0x3f >> len);
1262                 
1263                 if ((bytes -= len) < 0) return false;     /* missing bytes */
1264
1265                 for (i = len; i--; ) {
1266                         c = *utf_ptr++;
1267                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
1268                                 return false;
1269                         v = (v << 6) | (c & 0x3f);
1270                 }
1271
1272                 if (v == 0) {
1273                         if (len != 1) return false;           /* Java special */
1274
1275                 } else {
1276                         /* Sun Java seems to allow overlong UTF-8 encodings */
1277                         
1278                         /* if (v < min_codepoint[len]) */
1279                                 /* XXX throw exception? */
1280                 }
1281
1282                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
1283                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
1284
1285                 /* even these seem to be allowed */
1286                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
1287         }
1288
1289         return true;
1290 }
1291
1292
1293 /* is_valid_name ***************************************************************
1294
1295    Return true if the given string may be used as a class/field/method
1296    name. (Currently this only disallows empty strings and control
1297    characters.)
1298
1299    NOTE: The string is assumed to have passed is_valid_utf!
1300
1301    utf_ptr...points to first character
1302    end_pos...points after last character
1303
1304 *******************************************************************************/
1305
1306 bool is_valid_name(char *utf_ptr, char *end_pos)
1307 {
1308         if (end_pos <= utf_ptr) return false; /* disallow empty names */
1309
1310         while (utf_ptr < end_pos) {
1311                 unsigned char c = *utf_ptr++;
1312
1313                 if (c < 0x20) return false; /* disallow control characters */
1314                 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80)  /* disallow zero */
1315                         return false;
1316         }
1317
1318         return true;
1319 }
1320
1321 bool is_valid_name_utf(utf *u)
1322 {
1323         return is_valid_name(u->text, UTF_END(u));
1324 }
1325
1326
1327 /* utf_show ********************************************************************
1328
1329    Writes the utf symbols in the utfhash to stdout and displays the
1330    number of external hash chains grouped according to the chainlength
1331    (for debugging purposes).
1332
1333 *******************************************************************************/
1334
1335 #if !defined(NDEBUG)
1336 void utf_show(void)
1337 {
1338
1339 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1340
1341         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1342         u4 max_chainlength = 0;      /* maximum length of the chains */
1343         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1344         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1345         u4 i;
1346
1347         printf("UTF-HASH:\n");
1348
1349         /* show element of utf-hashtable */
1350
1351         for (i = 0; i < hashtable_utf->size; i++) {
1352                 utf *u = hashtable_utf->ptr[i];
1353
1354                 if (u) {
1355                         printf("SLOT %d: ", (int) i);
1356
1357                         while (u) {
1358                                 printf("'");
1359                                 utf_display_printable_ascii(u);
1360                                 printf("' ");
1361                                 u = u->hashlink;
1362                         }       
1363                         printf("\n");
1364                 }
1365         }
1366
1367         printf("UTF-HASH: %d slots for %d entries\n", 
1368                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1369
1370         if (hashtable_utf->entries == 0)
1371                 return;
1372
1373         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1374
1375         for (i=0;i<CHAIN_LIMIT;i++)
1376                 chain_count[i]=0;
1377
1378         /* count numbers of hashchains according to their length */
1379         for (i=0; i<hashtable_utf->size; i++) {
1380                   
1381                 utf *u = (utf*) hashtable_utf->ptr[i];
1382                 u4 chain_length = 0;
1383
1384                 /* determine chainlength */
1385                 while (u) {
1386                         u = u->hashlink;
1387                         chain_length++;
1388                 }
1389
1390                 /* update sum of all chainlengths */
1391                 sum_chainlength+=chain_length;
1392
1393                 /* determine the maximum length of the chains */
1394                 if (chain_length>max_chainlength)
1395                         max_chainlength = chain_length;
1396
1397                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1398                 if (chain_length>=CHAIN_LIMIT) {
1399                         beyond_limit+=chain_length;
1400                         chain_length=CHAIN_LIMIT-1;
1401                 }
1402
1403                 /* update number of hashchains of current length */
1404                 chain_count[chain_length]++;
1405         }
1406
1407         /* display results */  
1408         for (i=1;i<CHAIN_LIMIT-1;i++) 
1409                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1410           
1411         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1412
1413
1414         printf("max. chainlength:%5d\n",max_chainlength);
1415
1416         /* avg. chainlength = sum of chainlengths / number of chains */
1417         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1418 }
1419 #endif /* !defined(NDEBUG) */
1420
1421
1422 /*
1423  * These are local overrides for various environment variables in Emacs.
1424  * Please do not remove this and leave it at the end of the file, where
1425  * Emacs will automagically detect them.
1426  * ---------------------------------------------------------------------
1427  * Local variables:
1428  * mode: c
1429  * indent-tabs-mode: t
1430  * c-basic-offset: 4
1431  * tab-width: 4
1432  * End:
1433  * vim:noexpandtab:sw=4:ts=4:
1434  */