* src/vm/class.c (vm/suck.h): Added.
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf8.c - utf8 string functions
2
3    Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6    J. Wenninger, Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23    02110-1301, USA.
24
25    Contact: cacao@cacaojvm.org
26
27    Authors: Reinhard Grafl
28             Mark Probst
29             Andreas Krall
30             Christian Thalinger
31             Edwin Steiner
32
33    $Id: utf8.c 6216 2006-12-18 18:21:37Z twisti $
34
35 */
36
37
38 #include "config.h"
39
40 #include <string.h>
41 #include <assert.h>
42
43 #include "vm/types.h"
44
45 #include "mm/memory.h"
46
47 #if defined(ENABLE_THREADS)
48 # include "threads/native/lock.h"
49 #else
50 # include "threads/none/lock.h"
51 #endif
52
53 #include "vm/builtin.h"
54 #include "vm/exceptions.h"
55 #include "vm/hashtable.h"
56 #include "vm/options.h"
57 #include "vm/statistics.h"
58 #include "vm/stringlocal.h"
59 #include "vm/utf8.h"
60
61
62 /* global variables ***********************************************************/
63
64 /* hashsize must be power of 2 */
65
66 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
67
68 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
69
70
71 /* utf-symbols for pointer comparison of frequently used strings **************/
72
73 utf *utf_java_lang_Object;
74
75 utf *utf_java_lang_Class;
76 utf *utf_java_lang_ClassLoader;
77 utf *utf_java_lang_Cloneable;
78 utf *utf_java_lang_SecurityManager;
79 utf *utf_java_lang_String;
80 utf *utf_java_lang_System;
81 utf *utf_java_lang_ThreadGroup;
82 utf *utf_java_io_Serializable;
83
84 utf *utf_java_lang_Throwable;
85 utf *utf_java_lang_VMThrowable;
86 utf *utf_java_lang_Error;
87 utf *utf_java_lang_AbstractMethodError;
88 utf *utf_java_lang_LinkageError;
89 utf *utf_java_lang_NoClassDefFoundError;
90 utf *utf_java_lang_NoSuchMethodError;
91 utf *utf_java_lang_OutOfMemoryError;
92
93 utf *utf_java_lang_Exception;
94 utf *utf_java_lang_ClassCastException;
95 utf *utf_java_lang_ClassNotFoundException;
96 utf *utf_java_lang_IllegalArgumentException;
97 utf *utf_java_lang_IllegalMonitorStateException;
98
99 utf *utf_java_lang_NullPointerException;
100
101 utf* utf_java_lang_Void;
102 utf* utf_java_lang_Boolean;
103 utf* utf_java_lang_Byte;
104 utf* utf_java_lang_Character;
105 utf* utf_java_lang_Short;
106 utf* utf_java_lang_Integer;
107 utf* utf_java_lang_Long;
108 utf* utf_java_lang_Float;
109 utf* utf_java_lang_Double;
110
111 utf *utf_java_lang_StackTraceElement;
112 utf *utf_java_lang_reflect_Constructor;
113 utf *utf_java_lang_reflect_Field;
114 utf *utf_java_lang_reflect_Method;
115 utf *utf_java_util_Vector;
116
117 utf *utf_InnerClasses;                  /* InnerClasses                       */
118 utf *utf_ConstantValue;                 /* ConstantValue                      */
119 utf *utf_Code;                          /* Code                               */
120 utf *utf_Exceptions;                    /* Exceptions                         */
121 utf *utf_LineNumberTable;               /* LineNumberTable                    */
122 utf *utf_SourceFile;                    /* SourceFile                         */
123
124 #if defined(ENABLE_JAVASE)
125 utf *utf_EnclosingMethod;
126 utf *utf_Signature;
127 utf *utf_RuntimeVisibleAnnotations;
128 utf *utf_StackMapTable;
129 #endif
130
131 utf *utf_init;                          /* <init>                             */
132 utf *utf_clinit;                        /* <clinit>                           */
133 utf *utf_clone;                         /* clone                              */
134 utf *utf_finalize;                      /* finalize                           */
135 utf *utf_run;                           /* run                                */
136
137 utf *utf_add;
138 utf *utf_remove;
139 utf *utf_removeThread;
140 utf *utf_put;
141 utf *utf_get;
142 utf *utf_value;
143
144 utf *utf_fillInStackTrace;
145 utf *utf_getSystemClassLoader;
146 utf *utf_loadClass;
147 utf *utf_printStackTrace;
148
149 utf *utf_Z;                             /* Z                                  */
150 utf *utf_B;                             /* B                                  */
151 utf *utf_C;                             /* C                                  */
152 utf *utf_S;                             /* S                                  */
153 utf *utf_I;                             /* I                                  */
154 utf *utf_J;                             /* J                                  */
155 utf *utf_F;                             /* F                                  */
156 utf *utf_D;                             /* D                                  */
157
158 utf *utf_void__void;                    /* ()V                                */
159 utf *utf_boolean__void;                 /* (Z)V                               */
160 utf *utf_byte__void;                    /* (B)V                               */
161 utf *utf_char__void;                    /* (C)V                               */
162 utf *utf_short__void;                   /* (S)V                               */
163 utf *utf_int__void;                     /* (I)V                               */
164 utf *utf_long__void;                    /* (J)V                               */
165 utf *utf_float__void;                   /* (F)V                               */
166 utf *utf_double__void;                  /* (D)V                               */
167
168 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
169 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
170 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
171 utf *utf_java_lang_Object__java_lang_Object;
172 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
173 utf *utf_java_lang_String__java_lang_Class;
174 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
175 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
176
177 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
178 utf *utf_null;
179 utf *array_packagename;
180
181
182 /* utf_init ********************************************************************
183
184    Initializes the utf8 subsystem.
185
186 *******************************************************************************/
187
188 bool utf8_init(void)
189 {
190         /* create utf8 hashtable */
191
192         hashtable_utf = NEW(hashtable);
193
194         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
195
196 #if defined(ENABLE_STATISTICS)
197         if (opt_stat)
198                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
199 #endif
200
201         /* create utf-symbols for pointer comparison of frequently used strings */
202
203         utf_java_lang_Object           = utf_new_char("java/lang/Object");
204
205         utf_java_lang_Class            = utf_new_char("java/lang/Class");
206         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
207         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
208         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
209         utf_java_lang_String           = utf_new_char("java/lang/String");
210         utf_java_lang_System           = utf_new_char("java/lang/System");
211         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
212         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
213
214         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
215         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
216         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
217
218         utf_java_lang_AbstractMethodError =
219                 utf_new_char(string_java_lang_AbstractMethodError);
220
221         utf_java_lang_LinkageError =
222                 utf_new_char(string_java_lang_LinkageError);
223
224         utf_java_lang_NoClassDefFoundError =
225                 utf_new_char(string_java_lang_NoClassDefFoundError);
226
227         utf_java_lang_NoSuchMethodError =
228                 utf_new_char(string_java_lang_NoSuchMethodError);
229
230         utf_java_lang_OutOfMemoryError =
231                 utf_new_char(string_java_lang_OutOfMemoryError);
232
233         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
234
235         utf_java_lang_ClassCastException =
236                 utf_new_char(string_java_lang_ClassCastException);
237
238         utf_java_lang_ClassNotFoundException =
239                 utf_new_char(string_java_lang_ClassNotFoundException);
240
241         utf_java_lang_IllegalArgumentException =
242                 utf_new_char(string_java_lang_IllegalArgumentException);
243
244         utf_java_lang_IllegalMonitorStateException =
245                 utf_new_char(string_java_lang_IllegalMonitorStateException);
246
247         utf_java_lang_NullPointerException =
248                 utf_new_char(string_java_lang_NullPointerException);
249
250         utf_java_lang_Void             = utf_new_char("java/lang/Void");
251         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
252         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
253         utf_java_lang_Character        = utf_new_char("java/lang/Character");
254         utf_java_lang_Short            = utf_new_char("java/lang/Short");
255         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
256         utf_java_lang_Long             = utf_new_char("java/lang/Long");
257         utf_java_lang_Float            = utf_new_char("java/lang/Float");
258         utf_java_lang_Double           = utf_new_char("java/lang/Double");
259
260         utf_java_lang_StackTraceElement =
261                 utf_new_char("java/lang/StackTraceElement");
262
263         utf_java_lang_reflect_Constructor =
264                 utf_new_char("java/lang/reflect/Constructor");
265
266         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
267         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
268         utf_java_util_Vector           = utf_new_char("java/util/Vector");
269
270         utf_InnerClasses               = utf_new_char("InnerClasses");
271         utf_ConstantValue              = utf_new_char("ConstantValue");
272         utf_Code                       = utf_new_char("Code");
273         utf_Exceptions                 = utf_new_char("Exceptions");
274         utf_LineNumberTable            = utf_new_char("LineNumberTable");
275         utf_SourceFile                 = utf_new_char("SourceFile");
276
277 #if defined(ENABLE_JAVASE)
278         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
279         utf_Signature                  = utf_new_char("Signature");
280         utf_RuntimeVisibleAnnotations  = utf_new_char("RuntimeVisibleAnnotations");
281         utf_StackMapTable              = utf_new_char("StackMapTable");
282 #endif
283
284         utf_init                           = utf_new_char("<init>");
285         utf_clinit                         = utf_new_char("<clinit>");
286         utf_clone                      = utf_new_char("clone");
287         utf_finalize                   = utf_new_char("finalize");
288         utf_run                        = utf_new_char("run");
289
290         utf_add                        = utf_new_char("add");
291         utf_remove                     = utf_new_char("remove");
292         utf_removeThread               = utf_new_char("removeThread");
293         utf_put                        = utf_new_char("put");
294         utf_get                        = utf_new_char("get");
295         utf_value                      = utf_new_char("value");
296
297         utf_printStackTrace            = utf_new_char("printStackTrace");
298         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
299         utf_loadClass                  = utf_new_char("loadClass");
300         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
301
302         utf_Z                          = utf_new_char("Z");
303         utf_B                          = utf_new_char("B");
304         utf_C                          = utf_new_char("C");
305         utf_S                          = utf_new_char("S");
306         utf_I                          = utf_new_char("I");
307         utf_J                          = utf_new_char("J");
308         utf_F                          = utf_new_char("F");
309         utf_D                          = utf_new_char("D");
310
311         utf_void__void                 = utf_new_char("()V");
312         utf_boolean__void              = utf_new_char("(Z)V");
313         utf_byte__void                 = utf_new_char("(B)V");
314         utf_char__void                 = utf_new_char("(C)V");
315         utf_short__void                = utf_new_char("(S)V");
316         utf_int__void                  = utf_new_char("(I)V");
317         utf_long__void                 = utf_new_char("(J)V");
318         utf_float__void                = utf_new_char("(F)V");
319         utf_double__void               = utf_new_char("(D)V");
320         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
321         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
322
323         utf_void__java_lang_ClassLoader =
324                 utf_new_char("()Ljava/lang/ClassLoader;");
325
326         utf_java_lang_Object__java_lang_Object =
327                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
328
329         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
330
331         utf_java_lang_String__java_lang_Class =
332                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
333
334         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
335         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
336
337         utf_null                       = utf_new_char("null");
338         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
339         array_packagename              = utf_new_char("\t<the array package>");
340
341         /* everything's ok */
342
343         return true;
344 }
345
346
/* utf_hashkey *****************************************************************

   Computes a cheap hashkey from a fixed selection of bytes of the
   utf-text.  Texts of up to 8 bytes contribute every byte; longer
   texts contribute a handful of bytes only (see the individual cases).
   For texts longer than 15 bytes 3 bytes are taken from the beginning,
   2 bytes from the middle and 3 bytes from the end.

   IN:
      text.........points to the first byte of the utf-text
      length.......number of bytes of the utf-text

   RETURN VALUE:
      the hashkey, 0 for the empty text

   NOTE: Every byte is read through an explicit index.  The former
   implementation combined several `++text' side effects in a single
   expression, which has no defined evaluation order in C.  The indices
   and shift counts used here are the ones a left-to-right evaluation
   of those expressions yields.

   NOTE: The bytes are converted from plain `char' to u4, so on
   platforms where `char' is signed a byte >= 0x80 is sign-extended.
   This is kept as it is, since changing it would change the hashkeys.

*******************************************************************************/

/* These two macros are no longer used by utf_hashkey.  They are kept
   unchanged in case code outside of this function still refers to
   them.  nbs modifies `text', so never use it more than once within a
   single expression. */

#define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
#define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */

/* byte number idx of text, left shifted by shift (free of side effects) */

#define UTF_HASHKEY_BYTE(idx, shift)    ((u4) text[(idx)] << (shift))

u4 utf_hashkey(const char *text, u4 length)
{
	u4 mid;                             /* index of the first "middle" byte   */
	u4 end;                             /* index of the first "end" byte      */

	switch (length) {
	case 0: /* empty string */
		return 0;

	case 1:
		return UTF_HASHKEY_BYTE(0, 0);

	case 2:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 3);

	case 3:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 3)
			 ^ UTF_HASHKEY_BYTE(2, 5);

	case 4:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 2)
			 ^ UTF_HASHKEY_BYTE(2, 4) ^ UTF_HASHKEY_BYTE(3, 6);

	case 5:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 2)
			 ^ UTF_HASHKEY_BYTE(2, 3) ^ UTF_HASHKEY_BYTE(3, 4)
			 ^ UTF_HASHKEY_BYTE(4, 6);

	case 6:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
			 ^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(3, 3)
			 ^ UTF_HASHKEY_BYTE(4, 5) ^ UTF_HASHKEY_BYTE(5, 6);

	case 7:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
			 ^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(3, 3)
			 ^ UTF_HASHKEY_BYTE(4, 4) ^ UTF_HASHKEY_BYTE(5, 5)
			 ^ UTF_HASHKEY_BYTE(6, 6);

	case 8:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
			 ^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(3, 3)
			 ^ UTF_HASHKEY_BYTE(4, 4) ^ UTF_HASHKEY_BYTE(5, 5)
			 ^ UTF_HASHKEY_BYTE(6, 6) ^ UTF_HASHKEY_BYTE(7, 7);

	case 9: /* byte 3 is skipped */
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
			 ^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(4, 4)
			 ^ UTF_HASHKEY_BYTE(5, 5) ^ UTF_HASHKEY_BYTE(6, 6)
			 ^ UTF_HASHKEY_BYTE(7, 7) ^ UTF_HASHKEY_BYTE(8, 8);

	case 10: /* bytes 1 and 5 are skipped */
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(2, 2)
			 ^ UTF_HASHKEY_BYTE(3, 3) ^ UTF_HASHKEY_BYTE(4, 4)
			 ^ UTF_HASHKEY_BYTE(6, 6) ^ UTF_HASHKEY_BYTE(7, 7)
			 ^ UTF_HASHKEY_BYTE(8, 8) ^ UTF_HASHKEY_BYTE(9, 9);

	case 11: /* bytes 1 and 5 are skipped */
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(2, 2)
			 ^ UTF_HASHKEY_BYTE(3, 3) ^ UTF_HASHKEY_BYTE(4, 4)
			 ^ UTF_HASHKEY_BYTE(6, 6) ^ UTF_HASHKEY_BYTE(7, 7)
			 ^ UTF_HASHKEY_BYTE(8, 8) ^ UTF_HASHKEY_BYTE(9, 9)
			 ^ UTF_HASHKEY_BYTE(10, 10);

	case 12: /* here the shift counts differ from the byte indices */
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(3, 2)
			 ^ UTF_HASHKEY_BYTE(4, 3) ^ UTF_HASHKEY_BYTE(6, 5)
			 ^ UTF_HASHKEY_BYTE(7, 6) ^ UTF_HASHKEY_BYTE(8, 7)
			 ^ UTF_HASHKEY_BYTE(10, 9) ^ UTF_HASHKEY_BYTE(11, 10);

	case 13:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
			 ^ UTF_HASHKEY_BYTE(3, 3) ^ UTF_HASHKEY_BYTE(4, 4)
			 ^ UTF_HASHKEY_BYTE(7, 7) ^ UTF_HASHKEY_BYTE(8, 8)
			 ^ UTF_HASHKEY_BYTE(11, 9) ^ UTF_HASHKEY_BYTE(12, 10);

	case 14:
	case 15: /* same selection as for 14, the 15th byte is not used */
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(3, 3)
			 ^ UTF_HASHKEY_BYTE(4, 4) ^ UTF_HASHKEY_BYTE(7, 7)
			 ^ UTF_HASHKEY_BYTE(8, 8) ^ UTF_HASHKEY_BYTE(11, 9)
			 ^ UTF_HASHKEY_BYTE(12, 10) ^ UTF_HASHKEY_BYTE(13, 11);

	default:
		mid = length / 2;
		end = length - 4;

		/* 3 characters from beginning */
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(3, 3)
			 ^ UTF_HASHKEY_BYTE(4, 4)

			 /* 2 characters from middle */
			 ^ UTF_HASHKEY_BYTE(mid, 5) ^ UTF_HASHKEY_BYTE(mid + 3, 6)

			 /* 3 characters from end */
			 ^ UTF_HASHKEY_BYTE(end, 7) ^ UTF_HASHKEY_BYTE(end + 2, 10)
			 ^ UTF_HASHKEY_BYTE(end + 3, 11);
	}
}

#undef UTF_HASHKEY_BYTE
469
/* utf_full_hashkey ************************************************************

   Computes a hash value over every byte of the given text.

   The algorithm is the "One-at-a-time" algorithm as published
   by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.

   IN:
      text.........points to the first byte to hash
      length.......number of bytes to hash

   RETURN VALUE:
      the hash value, 0 for an empty text

*******************************************************************************/

u4 utf_full_hashkey(const char *text, u4 length)
{
	const unsigned char *cur  = (const unsigned char *) text;
	const unsigned char *stop = cur + length;
	u4                   h    = 0;

	/* mix in one byte at a time */

	while (cur != stop) {
		h += *cur++;
		h += h << 10;
		h ^= h >> 6;
	}

	/* final avalanche */

	h += h << 3;
	h ^= h >> 11;
	h += h << 15;

	return h;
}
498
499 /* unicode_hashkey *************************************************************
500
501    Compute the hashkey of a unicode string.
502
503 *******************************************************************************/
504
505 u4 unicode_hashkey(u2 *text, u2 len)
506 {
507         return utf_hashkey((char *) text, len);
508 }
509
510
511 /* utf_new *********************************************************************
512
513    Creates a new utf-symbol, the text of the symbol is passed as a
514    u1-array. The function searches the utf-hashtable for a utf-symbol
515    with this text. On success the element returned, otherwise a new
516    hashtable element is created.
517
518    If the number of entries in the hashtable exceeds twice the size of
519    the hashtable slots a reorganization of the hashtable is done and
520    the utf symbols are copied to a new hashtable with doubled size.
521
522 *******************************************************************************/
523
524 utf *utf_new(const char *text, u2 length)
525 {
526         u4 key;                             /* hashkey computed from utf-text     */
527         u4 slot;                            /* slot in hashtable                  */
528         utf *u;                             /* hashtable element                  */
529         u2 i;
530
531         LOCK_MONITOR_ENTER(hashtable_utf->header);
532
533 #if defined(ENABLE_STATISTICS)
534         if (opt_stat)
535                 count_utf_new++;
536 #endif
537
538         key  = utf_hashkey(text, length);
539         slot = key & (hashtable_utf->size - 1);
540         u    = hashtable_utf->ptr[slot];
541
542         /* search external hash chain for utf-symbol */
543
544         while (u) {
545                 if (u->blength == length) {
546                         /* compare text of hashtable elements */
547
548                         for (i = 0; i < length; i++)
549                                 if (text[i] != u->text[i])
550                                         goto nomatch;
551                         
552 #if defined(ENABLE_STATISTICS)
553                         if (opt_stat)
554                                 count_utf_new_found++;
555 #endif
556
557                         /* symbol found in hashtable */
558
559                         LOCK_MONITOR_EXIT(hashtable_utf->header);
560
561                         return u;
562                 }
563
564         nomatch:
565                 u = u->hashlink; /* next element in external chain */
566         }
567
568 #if defined(ENABLE_STATISTICS)
569         if (opt_stat)
570                 count_utf_len += sizeof(utf) + length + 1;
571 #endif
572
573         /* location in hashtable found, create new utf element */
574         u = NEW(utf);
575         u->blength  = length;               /* length in bytes of utfstring       */
576         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
577         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
578
579         memcpy(u->text, text, length);      /* copy utf-text                      */
580         u->text[length] = '\0';
581
582         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
583         hashtable_utf->entries++;           /* update number of entries           */
584
585         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
586
587         /* reorganization of hashtable, average length of the external
588            chains is approx. 2 */
589
590                 hashtable *newhash;                              /* the new hashtable */
591                 u4         i;
592                 utf       *u;
593                 utf       *nextu;
594                 u4         slot;
595
596                 /* create new hashtable, double the size */
597
598                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
599
600 #if defined(ENABLE_STATISTICS)
601                 if (opt_stat)
602                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
603 #endif
604
605                 /* transfer elements to new hashtable */
606
607                 for (i = 0; i < hashtable_utf->size; i++) {
608                         u = hashtable_utf->ptr[i];
609
610                         while (u) {
611                                 nextu = u->hashlink;
612                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
613                                                 
614                                 u->hashlink = (utf *) newhash->ptr[slot];
615                                 newhash->ptr[slot] = u;
616
617                                 /* follow link in external hash chain */
618
619                                 u = nextu;
620                         }
621                 }
622         
623                 /* dispose old table */
624
625                 hashtable_free(hashtable_utf);
626
627                 hashtable_utf = newhash;
628         }
629
630         LOCK_MONITOR_EXIT(hashtable_utf->header);
631
632         return u;
633 }
634
635
636 /* utf_new_u2 ******************************************************************
637
638    Make utf symbol from u2 array, if isclassname is true '.' is
639    replaced by '/'.
640
641 *******************************************************************************/
642
643 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
644 {
645         char *buffer;                   /* memory buffer for  unicode characters  */
646         char *pos;                      /* pointer to current position in buffer  */
647         u4 left;                        /* unicode characters left                */
648         u4 buflength;                   /* utf length in bytes of the u2 array    */
649         utf *result;                    /* resulting utf-string                   */
650         int i;          
651
652         /* determine utf length in bytes and allocate memory */
653
654         buflength = u2_utflength(unicode_pos, unicode_length); 
655         buffer    = MNEW(char, buflength);
656  
657         left = buflength;
658         pos  = buffer;
659
660         for (i = 0; i++ < unicode_length; unicode_pos++) {
661                 /* next unicode character */
662                 u2 c = *unicode_pos;
663                 
664                 if ((c != 0) && (c < 0x80)) {
665                         /* 1 character */       
666                         left--;
667                 if ((int) left < 0) break;
668                         /* convert classname */
669                         if (isclassname && c == '.')
670                                 *pos++ = '/';
671                         else
672                                 *pos++ = (char) c;
673
674                 } else if (c < 0x800) {             
675                         /* 2 characters */                              
676                 unsigned char high = c >> 6;
677                 unsigned char low  = c & 0x3F;
678                         left = left - 2;
679                 if ((int) left < 0) break;
680                 *pos++ = high | 0xC0; 
681                 *pos++ = low  | 0x80;     
682
683                 } else {         
684                 /* 3 characters */                              
685                 char low  = c & 0x3f;
686                 char mid  = (c >> 6) & 0x3F;
687                 char high = c >> 12;
688                         left = left - 3;
689                 if ((int) left < 0) break;
690                 *pos++ = high | 0xE0; 
691                 *pos++ = mid  | 0x80;  
692                 *pos++ = low  | 0x80;   
693                 }
694         }
695         
696         /* insert utf-string into symbol-table */
697         result = utf_new(buffer,buflength);
698
699         MFREE(buffer, char, buflength);
700
701         return result;
702 }
703
704
705 /* utf_new_char ****************************************************************
706
707    Creates a new utf symbol, the text for this symbol is passed as a
708    c-string ( = char* ).
709
710 *******************************************************************************/
711
712 utf *utf_new_char(const char *text)
713 {
714         return utf_new(text, strlen(text));
715 }
716
717
718 /* utf_new_char_classname ******************************************************
719
720    Creates a new utf symbol, the text for this symbol is passed as a
721    c-string ( = char* ) "." characters are going to be replaced by
722    "/". Since the above function is used often, this is a separte
723    function, instead of an if.
724
725 *******************************************************************************/
726
727 utf *utf_new_char_classname(const char *text)
728 {
729         if (strchr(text, '.')) {
730                 char *txt = strdup(text);
731                 char *end = txt + strlen(txt);
732                 char *c;
733                 utf *tmpRes;
734
735                 for (c = txt; c < end; c++)
736                         if (*c == '.') *c = '/';
737
738                 tmpRes = utf_new(txt, strlen(txt));
739                 FREE(txt, 0);
740
741                 return tmpRes;
742
743         } else
744                 return utf_new(text, strlen(text));
745 }
746
747
/* utf_nextu2 ******************************************************************

   Read the next unicode character from the utf string and increment
   the utf-string pointer accordingly.

   The upper 4 bits of the first byte select the encoding:

      0xC, 0xD.....2 byte sequence
      0xE..........3 byte sequence
      otherwise....the byte itself is returned and the pointer is
                   advanced by 1

   For a 2 or 3 byte sequence with a broken continuation byte 0 is
   returned.  The pointer is not moved at all if the second byte is
   broken, and it is advanced by 2 if only the third byte is broken.

   CAUTION: This function is unsafe for input that was not checked
            by is_valid_utf!

*******************************************************************************/

u2 utf_nextu2(char **utf_ptr)
{
	unsigned char *src   = (unsigned char *) (*utf_ptr);
	unsigned char  lead  = src[0];      /* first byte of the sequence         */
	unsigned int   kind  = lead >> 4;   /* upper 4 bits select the encoding   */
	u2             value = 0;           /* decoded unicode character          */
	int            used  = 0;           /* bytes consumed by the sequence     */

	/* single byte */

	if ((kind != 0xC) && (kind != 0xD) && (kind != 0xE)) {
		(*utf_ptr)++;

		return (u2) lead;
	}

	/* multi byte; the third byte is only looked at if the second one is
	   a valid continuation byte */

	if ((src[1] & 0xC0) == 0x80) {
		if (kind != 0xE) {
			/* 110xxxxx 10xxxxxx */

			value = ((lead & 0x1F) << 6) + (src[1] & 0x3F);
			used  = 2;
		}
		else if ((src[2] & 0xC0) == 0x80) {
			/* 1110xxxx 10xxxxxx 10xxxxxx */

			value = ((((lead & 0x0f) << 6) + (src[1] & 0x3f)) << 6)
				  + (src[2] & 0x3f);
			used  = 3;
		}
		else {
			/* broken third byte: skip the first two bytes */

			used = 2;
		}
	}

	/* update position in utf-text */

	*utf_ptr = (char *) (src + used);

	return value;
}
802
803
804 /* utf_bytes *******************************************************************
805
806    Determine number of bytes (aka. octets) in the utf string.
807
808    IN:
809       u............utf string
810
811    OUT:
812       The number of octets of this utf string.
813           There is _no_ terminating zero included in this count.
814
815 *******************************************************************************/
816
817 u4 utf_bytes(utf *u)
818 {
819         return u->blength;
820 }
821
822 /* utf_get_number_of_u2s_for_buffer ********************************************
823
824    Determine number of UTF-16 u2s in the given UTF-8 buffer
825
826    CAUTION: This function is unsafe for input that was not checked 
827             by is_valid_utf!
828
829    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
830    to an array of u2s (UTF-16) and want to know how many of them you will get.
831    All other uses of this function are probably wrong.
832
833    IN:
834       buffer........points to first char in buffer
835           blength.......number of _bytes_ in the buffer
836
837    OUT:
838       the number of u2s needed to hold this string in UTF-16 encoding.
839           There is _no_ terminating zero included in this count.
840
841    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
842    exception.
843
844 *******************************************************************************/
845
846 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
847 {
848         const char *endpos;                 /* points behind utf string           */
849         const char *utf_ptr;                /* current position in utf text       */
850         u4 len = 0;                         /* number of unicode characters       */
851
852         utf_ptr = buffer;
853         endpos = utf_ptr + blength;
854
855         while (utf_ptr < endpos) {
856                 len++;
857                 /* next unicode character */
858                 utf_nextu2((char **)&utf_ptr);
859         }
860
861         assert(utf_ptr == endpos);
862
863         return len;
864 }
865
866
867 /* utf_get_number_of_u2s *******************************************************
868
869    Determine number of UTF-16 u2s in the utf string.
870
871    CAUTION: This function is unsafe for input that was not checked 
872             by is_valid_utf!
873
874    CAUTION: Use this function *only* when you want to convert a utf string
875    to an array of u2s and want to know how many of them you will get.
876    All other uses of this function are probably wrong.
877
878    IN:
879       u............utf string
880
881    OUT:
882       the number of u2s needed to hold this string in UTF-16 encoding.
883           There is _no_ terminating zero included in this count.
884           XXX 0 if a NullPointerException has been thrown (see below)
885
886 *******************************************************************************/
887
888 u4 utf_get_number_of_u2s(utf *u)
889 {
890         char *endpos;                       /* points behind utf string           */
891         char *utf_ptr;                      /* current position in utf text       */
892         u4 len = 0;                         /* number of unicode characters       */
893
894         /* XXX this is probably not checked by most callers! Review this after */
895         /* the invalid uses of this function have been eliminated */
896         if (!u) {
897                 exceptions_throw_nullpointerexception();
898                 return 0;
899         }
900
901         endpos = UTF_END(u);
902         utf_ptr = u->text;
903
904         while (utf_ptr < endpos) {
905                 len++;
906                 /* next unicode character */
907                 utf_nextu2(&utf_ptr);
908         }
909
910         if (utf_ptr != endpos)
911                 /* string ended abruptly */
912                 throw_cacao_exception_exit(string_java_lang_InternalError,
913                                                                    "Illegal utf8 string");
914
915         return len;
916 }
917
918
919 /* utf8_safe_number_of_u2s *****************************************************
920
921    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
922    (For invalid UTF-8 the U+fffd replacement character will be counted.)
923
924    This function is safe even for invalid UTF-8 strings.
925
926    IN:
927       text..........zero-terminated(!) UTF-8 string (may be invalid)
928                         must NOT be NULL
929           nbytes........strlen(text). (This is needed to completely emulate
930                         the RI).
931
932    OUT:
933       the number of u2s needed to hold this string in UTF-16 encoding.
934           There is _no_ terminating zero included in this count.
935
936 *******************************************************************************/
937
938 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
939         register const unsigned char *t;
940         register s4 byte;
941         register s4 len;
942         register const unsigned char *tlimit;
943         s4 byte1;
944         s4 byte2;
945         s4 byte3;
946         s4 value;
947         s4 skip;
948
949         assert(text);
950         assert(nbytes >= 0);
951
952         len = 0;
953         t = (const unsigned char *) text;
954         tlimit = t + nbytes;
955
956         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
957
958         while (1) {
959                 byte = *t++;
960
961                 if (byte & 0x80) {
962                         /* highest bit set, non-ASCII character */
963
964                         if ((byte & 0xe0) == 0xc0) {
965                                 /* 2-byte: should be 110..... 10...... ? */
966
967                                 if ((*t++ & 0xc0) == 0x80)
968                                         ; /* valid 2-byte */
969                                 else
970                                         t--; /* invalid */
971                         }
972                         else if ((byte & 0xf0) == 0xe0) {
973                                 /* 3-byte: should be 1110.... 10...... 10...... */
974                                 /*                            ^t                */
975
976                                 if (t + 2 > tlimit)
977                                         return len + 1; /* invalid, stop here */
978
979                                 if ((*t++ & 0xc0) == 0x80) {
980                                         if ((*t++ & 0xc0) == 0x80)
981                                                 ; /* valid 3-byte */
982                                         else
983                                                 t--; /* invalid */
984                                 }
985                                 else
986                                         t--; /* invalid */
987                         }
988                         else if ((byte & 0xf8) == 0xf0) {
989                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
990                                 /*                            ^t                         */
991
992                                 if (t + 3 > tlimit)
993                                         return len + 1; /* invalid, stop here */
994
995                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
996                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
997                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
998                                                         /* valid 4-byte UTF-8? */
999                                                         value = ((byte  & 0x07) << 18)
1000                                                                   | ((byte1 & 0x3f) << 12)
1001                                                                   | ((byte2 & 0x3f) <<  6)
1002                                                                   | ((byte3 & 0x3f)      );
1003
1004                                                         if (value > 0x10FFFF)
1005                                                                 ; /* invalid */
1006                                                         else if (value > 0xFFFF)
1007                                                                 len += 1; /* we need surrogates */
1008                                                         else
1009                                                                 ; /* 16bit suffice */
1010                                                 }
1011                                                 else
1012                                                         t--; /* invalid */
1013                                         }
1014                                         else
1015                                                 t--; /* invalid */
1016                                 }
1017                                 else
1018                                         t--; /* invalid */
1019                         }
1020                         else if ((byte & 0xfc) == 0xf8) {
1021                                 /* invalid 5-byte */
1022                                 if (t + 4 > tlimit)
1023                                         return len + 1; /* invalid, stop here */
1024
1025                                 skip = 4;
1026                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1027                                         t++;
1028                         }
1029                         else if ((byte & 0xfe) == 0xfc) {
1030                                 /* invalid 6-byte */
1031                                 if (t + 5 > tlimit)
1032                                         return len + 1; /* invalid, stop here */
1033
1034                                 skip = 5;
1035                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1036                                         t++;
1037                         }
1038                         else
1039                                 ; /* invalid */
1040                 }
1041                 else {
1042                         /* NUL */
1043
1044                         if (byte == 0)
1045                                 break;
1046
1047                         /* ASCII character, common case */
1048                 }
1049
1050                 len++;
1051         }
1052
1053         return len;
1054 }
1055
1056
1057 /* utf8_safe_convert_to_u2s ****************************************************
1058
1059    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1060    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1061    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1062
1063    This function is safe even for invalid UTF-8 strings.
1064
1065    IN:
1066       text..........zero-terminated(!) UTF-8 string (may be invalid)
1067                         must NOT be NULL
1068           nbytes........strlen(text). (This is needed to completely emulate
1069                                         the RI).
1070           buffer........a preallocated array of u2s to receive the decoded
1071                         string. Use utf8_safe_number_of_u2s to get the
1072                                         required number of u2s for allocating this.
1073
1074 *******************************************************************************/
1075
1076 #define UNICODE_REPLACEMENT  0xfffd
1077
1078 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1079         register const unsigned char *t;
1080         register s4 byte;
1081         register const unsigned char *tlimit;
1082         s4 byte1;
1083         s4 byte2;
1084         s4 byte3;
1085         s4 value;
1086         s4 skip;
1087
1088         assert(text);
1089         assert(nbytes >= 0);
1090
1091         t = (const unsigned char *) text;
1092         tlimit = t + nbytes;
1093
1094         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1095
1096         while (1) {
1097                 byte = *t++;
1098
1099                 if (byte & 0x80) {
1100                         /* highest bit set, non-ASCII character */
1101
1102                         if ((byte & 0xe0) == 0xc0) {
1103                                 /* 2-byte: should be 110..... 10...... */
1104
1105                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1106                                         /* valid 2-byte UTF-8 */
1107                                         *buffer++ = ((byte  & 0x1f) << 6)
1108                                                           | ((byte1 & 0x3f)     );
1109                                 }
1110                                 else {
1111                                         *buffer++ = UNICODE_REPLACEMENT;
1112                                         t--;
1113                                 }
1114                         }
1115                         else if ((byte & 0xf0) == 0xe0) {
1116                                 /* 3-byte: should be 1110.... 10...... 10...... */
1117
1118                                 if (t + 2 > tlimit) {
1119                                         *buffer++ = UNICODE_REPLACEMENT;
1120                                         return;
1121                                 }
1122
1123                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1124                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1125                                                 /* valid 3-byte UTF-8 */
1126                                                 *buffer++ = ((byte  & 0x0f) << 12)
1127                                                                   | ((byte1 & 0x3f) <<  6)
1128                                                                   | ((byte2 & 0x3f)      );
1129                                         }
1130                                         else {
1131                                                 *buffer++ = UNICODE_REPLACEMENT;
1132                                                 t--;
1133                                         }
1134                                 }
1135                                 else {
1136                                         *buffer++ = UNICODE_REPLACEMENT;
1137                                         t--;
1138                                 }
1139                         }
1140                         else if ((byte & 0xf8) == 0xf0) {
1141                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1142
1143                                 if (t + 3 > tlimit) {
1144                                         *buffer++ = UNICODE_REPLACEMENT;
1145                                         return;
1146                                 }
1147
1148                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1149                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1150                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1151                                                         /* valid 4-byte UTF-8? */
1152                                                         value = ((byte  & 0x07) << 18)
1153                                                                   | ((byte1 & 0x3f) << 12)
1154                                                                   | ((byte2 & 0x3f) <<  6)
1155                                                                   | ((byte3 & 0x3f)      );
1156
1157                                                         if (value > 0x10FFFF) {
1158                                                                 *buffer++ = UNICODE_REPLACEMENT;
1159                                                         }
1160                                                         else if (value > 0xFFFF) {
1161                                                                 /* we need surrogates */
1162                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1163                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1164                                                         }
1165                                                         else
1166                                                                 *buffer++ = value; /* 16bit suffice */
1167                                                 }
1168                                                 else {
1169                                                         *buffer++ = UNICODE_REPLACEMENT;
1170                                                         t--;
1171                                                 }
1172                                         }
1173                                         else {
1174                                                 *buffer++ = UNICODE_REPLACEMENT;
1175                                                 t--;
1176                                         }
1177                                 }
1178                                 else {
1179                                         *buffer++ = UNICODE_REPLACEMENT;
1180                                         t--;
1181                                 }
1182                         }
1183                         else if ((byte & 0xfc) == 0xf8) {
1184                                 if (t + 4 > tlimit) {
1185                                         *buffer++ = UNICODE_REPLACEMENT;
1186                                         return;
1187                                 }
1188
1189                                 skip = 4;
1190                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1191                                         t++;
1192                                 *buffer++ = UNICODE_REPLACEMENT;
1193                         }
1194                         else if ((byte & 0xfe) == 0xfc) {
1195                                 if (t + 5 > tlimit) {
1196                                         *buffer++ = UNICODE_REPLACEMENT;
1197                                         return;
1198                                 }
1199
1200                                 skip = 5;
1201                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1202                                         t++;
1203                                 *buffer++ = UNICODE_REPLACEMENT;
1204                         }
1205                         else
1206                                 *buffer++ = UNICODE_REPLACEMENT;
1207                 }
1208                 else {
1209                         /* NUL */
1210
1211                         if (byte == 0)
1212                                 break;
1213
1214                         /* ASCII character, common case */
1215
1216                         *buffer++ = byte;
1217                 }
1218         }
1219 }
1220
1221
1222 /* u2_utflength ****************************************************************
1223
1224    Returns the utf length in bytes of a u2 array.
1225
1226 *******************************************************************************/
1227
1228 u4 u2_utflength(u2 *text, u4 u2_length)
1229 {
1230         u4 result_len = 0;                  /* utf length in bytes                */
1231         u2 ch;                              /* current unicode character          */
1232         u4 len;
1233         
1234         for (len = 0; len < u2_length; len++) {
1235                 /* next unicode character */
1236                 ch = *text++;
1237           
1238                 /* determine bytes required to store unicode character as utf */
1239                 if (ch && (ch < 0x80)) 
1240                         result_len++;
1241                 else if (ch < 0x800)
1242                         result_len += 2;        
1243                 else 
1244                         result_len += 3;        
1245         }
1246
1247     return result_len;
1248 }
1249
1250
1251 /* utf_copy ********************************************************************
1252
1253    Copy the given utf string byte-for-byte to a buffer.
1254
1255    IN:
1256       buffer.......the buffer
1257           u............the utf string
1258
1259 *******************************************************************************/
1260
1261 void utf_copy(char *buffer, utf *u)
1262 {
1263         /* our utf strings are zero-terminated (done by utf_new) */
1264         MCOPY(buffer, u->text, char, u->blength + 1);
1265 }
1266
1267
1268 /* utf_cat *********************************************************************
1269
1270    Append the given utf string byte-for-byte to a buffer.
1271
1272    IN:
1273       buffer.......the buffer
1274           u............the utf string
1275
1276 *******************************************************************************/
1277
1278 void utf_cat(char *buffer, utf *u)
1279 {
1280         /* our utf strings are zero-terminated (done by utf_new) */
1281         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1282 }
1283
1284
1285 /* utf_copy_classname **********************************************************
1286
1287    Copy the given utf classname byte-for-byte to a buffer.
1288    '/' is replaced by '.'
1289
1290    IN:
1291       buffer.......the buffer
1292           u............the utf string
1293
1294 *******************************************************************************/
1295
1296 void utf_copy_classname(char *buffer, utf *u)
1297 {
1298         char *bufptr;
1299         char *srcptr;
1300         char *endptr;
1301         char ch;
1302
1303         bufptr = buffer;
1304         srcptr = u->text;
1305         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1306
1307         while (srcptr != endptr) {
1308                 ch = *srcptr++;
1309                 if (ch == '/')
1310                         ch = '.';
1311                 *bufptr++ = ch;
1312         }
1313 }
1314
1315
1316 /* utf_cat *********************************************************************
1317
1318    Append the given utf classname byte-for-byte to a buffer.
1319    '/' is replaced by '.'
1320
1321    IN:
1322       buffer.......the buffer
1323           u............the utf string
1324
1325 *******************************************************************************/
1326
1327 void utf_cat_classname(char *buffer, utf *u)
1328 {
1329         utf_copy_classname(buffer + strlen(buffer), u);
1330 }
1331
1332 /* utf_display_printable_ascii *************************************************
1333
1334    Write utf symbol to stdout (for debugging purposes).
1335    Non-printable and non-ASCII characters are printed as '?'.
1336
1337 *******************************************************************************/
1338
1339 void utf_display_printable_ascii(utf *u)
1340 {
1341         char *endpos;                       /* points behind utf string           */
1342         char *utf_ptr;                      /* current position in utf text       */
1343
1344         if (u == NULL) {
1345                 printf("NULL");
1346                 fflush(stdout);
1347                 return;
1348         }
1349
1350         endpos = UTF_END(u);
1351         utf_ptr = u->text;
1352
1353         while (utf_ptr < endpos) {
1354                 /* read next unicode character */
1355
1356                 u2 c = utf_nextu2(&utf_ptr);
1357
1358                 if ((c >= 32) && (c <= 127))
1359                         printf("%c", c);
1360                 else
1361                         printf("?");
1362         }
1363
1364         fflush(stdout);
1365 }
1366
1367
1368 /* utf_display_printable_ascii_classname ***************************************
1369
1370    Write utf symbol to stdout with `/' converted to `.' (for debugging
1371    purposes).
1372    Non-printable and non-ASCII characters are printed as '?'.
1373
1374 *******************************************************************************/
1375
1376 void utf_display_printable_ascii_classname(utf *u)
1377 {
1378         char *endpos;                       /* points behind utf string           */
1379         char *utf_ptr;                      /* current position in utf text       */
1380
1381         if (u == NULL) {
1382                 printf("NULL");
1383                 fflush(stdout);
1384                 return;
1385         }
1386
1387         endpos = UTF_END(u);
1388         utf_ptr = u->text;
1389
1390         while (utf_ptr < endpos) {
1391                 /* read next unicode character */
1392
1393                 u2 c = utf_nextu2(&utf_ptr);
1394
1395                 if (c == '/')
1396                         c = '.';
1397
1398                 if ((c >= 32) && (c <= 127))
1399                         printf("%c", c);
1400                 else
1401                         printf("?");
1402         }
1403
1404         fflush(stdout);
1405 }
1406
1407
1408 /* utf_sprint_convert_to_latin1 ************************************************
1409         
1410    Write utf symbol into c-string (for debugging purposes).
1411    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1412    invalid results.
1413
1414 *******************************************************************************/
1415
1416 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1417 {
1418         char *endpos;                       /* points behind utf string           */
1419         char *utf_ptr;                      /* current position in utf text       */
1420         u2 pos = 0;                         /* position in c-string               */
1421
1422         if (!u) {
1423                 strcpy(buffer, "NULL");
1424                 return;
1425         }
1426
1427         endpos = UTF_END(u);
1428         utf_ptr = u->text;
1429
1430         while (utf_ptr < endpos) 
1431                 /* copy next unicode character */       
1432                 buffer[pos++] = utf_nextu2(&utf_ptr);
1433
1434         /* terminate string */
1435         buffer[pos] = '\0';
1436 }
1437
1438
1439 /* utf_sprint_convert_to_latin1_classname **************************************
1440         
1441    Write utf symbol into c-string with `/' converted to `.' (for debugging
1442    purposes).
1443    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1444    invalid results.
1445
1446 *******************************************************************************/
1447
1448 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1449 {
1450         char *endpos;                       /* points behind utf string           */
1451         char *utf_ptr;                      /* current position in utf text       */
1452         u2 pos = 0;                         /* position in c-string               */
1453
1454         if (!u) {
1455                 strcpy(buffer, "NULL");
1456                 return;
1457         }
1458
1459         endpos = UTF_END(u);
1460         utf_ptr = u->text;
1461
1462         while (utf_ptr < endpos) {
1463                 /* copy next unicode character */       
1464                 u2 c = utf_nextu2(&utf_ptr);
1465                 if (c == '/') c = '.';
1466                 buffer[pos++] = c;
1467         }
1468
1469         /* terminate string */
1470         buffer[pos] = '\0';
1471 }
1472
1473
1474 /* utf_strcat_convert_to_latin1 ************************************************
1475         
1476    Like libc strcat, but uses an utf8 string.
1477    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1478    invalid results.
1479
1480 *******************************************************************************/
1481
1482 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1483 {
1484         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1485 }
1486
1487
1488 /* utf_strcat_convert_to_latin1_classname **************************************
1489         
1490    Like libc strcat, but uses an utf8 string.
1491    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1492    invalid results.
1493
1494 *******************************************************************************/
1495
1496 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1497 {
1498         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1499 }
1500
1501
1502 /* utf_fprint_printable_ascii **************************************************
1503         
1504    Write utf symbol into file.
1505    Non-printable and non-ASCII characters are printed as '?'.
1506
1507 *******************************************************************************/
1508
1509 void utf_fprint_printable_ascii(FILE *file, utf *u)
1510 {
1511         char *endpos;                       /* points behind utf string           */
1512         char *utf_ptr;                      /* current position in utf text       */
1513
1514         if (!u)
1515                 return;
1516
1517         endpos = UTF_END(u);
1518         utf_ptr = u->text;
1519
1520         while (utf_ptr < endpos) { 
1521                 /* read next unicode character */                
1522                 u2 c = utf_nextu2(&utf_ptr);                            
1523
1524                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1525                 else fprintf(file, "?");
1526         }
1527 }
1528
1529
1530 /* utf_fprint_printable_ascii_classname ****************************************
1531         
1532    Write utf symbol into file with `/' converted to `.'.
1533    Non-printable and non-ASCII characters are printed as '?'.
1534
1535 *******************************************************************************/
1536
1537 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1538 {
1539         char *endpos;                       /* points behind utf string           */
1540         char *utf_ptr;                      /* current position in utf text       */
1541
1542     if (!u)
1543                 return;
1544
1545         endpos = UTF_END(u);
1546         utf_ptr = u->text;
1547
1548         while (utf_ptr < endpos) { 
1549                 /* read next unicode character */                
1550                 u2 c = utf_nextu2(&utf_ptr);                            
1551                 if (c == '/') c = '.';
1552
1553                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1554                 else fprintf(file, "?");
1555         }
1556 }
1557
1558
/* is_valid_utf ****************************************************************

   Return true if the given byte range is a valid UTF-8 string in the
   restricted form accepted here.

   IN:
      utf_ptr......points to the first byte
      end_pos......points behind the last byte

   The range is rejected if
      - end_pos lies before utf_ptr,
      - it contains a 0x00 byte,
      - it contains a lead byte other than a 2-byte (110x xxxx) or
        3-byte (1110 xxxx) lead (Java limitation),
      - a multi-byte character is truncated or has a byte that is not a
        continuation byte (10xx xxxx),
      - the value zero is encoded with three bytes (only the 2-byte
        encoding of zero is allowed, Java special).

   Overlong encodings, surrogates and the codepoints 0xfffe/0xffff are
   deliberately accepted (Sun Java seems to allow them in classfiles).

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int           remaining;            /* bytes still to be examined         */
	int           ntrail;               /* continuation bytes required        */
	int           k;
	unsigned char octet;                /* byte currently under inspection    */
	unsigned long codepoint;            /* value of a multi-byte character    */

	if (end_pos < utf_ptr)
		return false;

	remaining = end_pos - utf_ptr;

	while (remaining-- != 0) {
		octet = (unsigned char) *utf_ptr++;

		if (octet == 0)
			return false;                   /* 0x00 is not allowed            */

		if (octet < 0x80)
			continue;                       /* ASCII                          */

		/* Only 2- and 3-byte characters are accepted; longer forms and */
		/* stray continuation bytes are invalid (Java limitation).      */

		if ((octet & 0xe0) == 0xc0)
			ntrail = 1;                     /* 110x xxxx                      */
		else if ((octet & 0xf0) == 0xe0)
			ntrail = 2;                     /* 1110 xxxx                      */
		else
			return false;

		/* payload bits of the lead byte */

		codepoint = octet & (0x3f >> ntrail);

		remaining -= ntrail;

		if (remaining < 0)
			return false;                   /* missing bytes                  */

		for (k = 0; k < ntrail; k++) {
			octet = (unsigned char) *utf_ptr++;

			if ((octet & 0xc0) != 0x80)     /* 10xx xxxx                      */
				return false;

			codepoint = (codepoint << 6) | (octet & 0x3f);
		}

		/* zero may only be written as the 2-byte sequence (Java special) */

		if ((codepoint == 0) && (ntrail != 1))
			return false;
	}

	return true;
}
1624
1625
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters, i.e. bytes below 0x20, and the two-byte encoding of
   zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   The function never reads at or behind end_pos: a 0xc0 byte that is
   the very last byte of the range is not treated as an encoded zero.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                       /* disallow empty names           */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;                   /* disallow control characters    */

		/* disallow zero (encoded as 0xc0 0x80); only look at the next
		   byte if it still belongs to the string */

		if (c == 0xc0 && utf_ptr < end_pos && (unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
1653
1654 bool is_valid_name_utf(utf *u)
1655 {
1656         return is_valid_name(u->text, UTF_END(u));
1657 }
1658
1659
1660 /* utf_show ********************************************************************
1661
1662    Writes the utf symbols in the utfhash to stdout and displays the
1663    number of external hash chains grouped according to the chainlength
1664    (for debugging purposes).
1665
1666 *******************************************************************************/
1667
1668 #if !defined(NDEBUG)
1669 void utf_show(void)
1670 {
1671
1672 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1673
1674         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1675         u4 max_chainlength = 0;      /* maximum length of the chains */
1676         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1677         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1678         u4 i;
1679
1680         printf("UTF-HASH:\n");
1681
1682         /* show element of utf-hashtable */
1683
1684         for (i = 0; i < hashtable_utf->size; i++) {
1685                 utf *u = hashtable_utf->ptr[i];
1686
1687                 if (u) {
1688                         printf("SLOT %d: ", (int) i);
1689
1690                         while (u) {
1691                                 printf("'");
1692                                 utf_display_printable_ascii(u);
1693                                 printf("' ");
1694                                 u = u->hashlink;
1695                         }       
1696                         printf("\n");
1697                 }
1698         }
1699
1700         printf("UTF-HASH: %d slots for %d entries\n", 
1701                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1702
1703         if (hashtable_utf->entries == 0)
1704                 return;
1705
1706         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1707
1708         for (i=0;i<CHAIN_LIMIT;i++)
1709                 chain_count[i]=0;
1710
1711         /* count numbers of hashchains according to their length */
1712         for (i=0; i<hashtable_utf->size; i++) {
1713                   
1714                 utf *u = (utf*) hashtable_utf->ptr[i];
1715                 u4 chain_length = 0;
1716
1717                 /* determine chainlength */
1718                 while (u) {
1719                         u = u->hashlink;
1720                         chain_length++;
1721                 }
1722
1723                 /* update sum of all chainlengths */
1724                 sum_chainlength+=chain_length;
1725
1726                 /* determine the maximum length of the chains */
1727                 if (chain_length>max_chainlength)
1728                         max_chainlength = chain_length;
1729
1730                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1731                 if (chain_length>=CHAIN_LIMIT) {
1732                         beyond_limit+=chain_length;
1733                         chain_length=CHAIN_LIMIT-1;
1734                 }
1735
1736                 /* update number of hashchains of current length */
1737                 chain_count[chain_length]++;
1738         }
1739
1740         /* display results */  
1741         for (i=1;i<CHAIN_LIMIT-1;i++) 
1742                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1743           
1744         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1745
1746
1747         printf("max. chainlength:%5d\n",max_chainlength);
1748
1749         /* avg. chainlength = sum of chainlengths / number of chains */
1750         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1751 }
1752 #endif /* !defined(NDEBUG) */
1753
1754
1755 /*
1756  * These are local overrides for various environment variables in Emacs.
1757  * Please do not remove this and leave it at the end of the file, where
1758  * Emacs will automagically detect them.
1759  * ---------------------------------------------------------------------
1760  * Local variables:
1761  * mode: c
1762  * indent-tabs-mode: t
1763  * c-basic-offset: 4
1764  * tab-width: 4
1765  * End:
1766  * vim:noexpandtab:sw=4:ts=4:
1767  */