* src/vm/utf8.h, src/vm/utf8.c (utf8_safe_number_of_u2s): Modified to
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf8.c - utf8 string functions
2
3    Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6    J. Wenninger, Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23    02110-1301, USA.
24
25    Contact: cacao@cacaojvm.org
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32                         Edwin Steiner
33
34    $Id: utf8.c 5823 2006-10-24 23:24:19Z edwin $
35
36 */
37
38
39 #include "config.h"
40
41 #include <string.h>
42 #include <assert.h>
43
44 #include "vm/types.h"
45
46 #include "mm/memory.h"
47
48 #if defined(ENABLE_THREADS)
49 # include "threads/native/lock.h"
50 #else
51 # include "threads/none/lock.h"
52 #endif
53
54 #include "vm/builtin.h"
55 #include "vm/exceptions.h"
56 #include "vm/hashtable.h"
57 #include "vm/options.h"
58 #include "vm/statistics.h"
59 #include "vm/stringlocal.h"
60 #include "vm/utf8.h"
61
62
63 /* global variables ***********************************************************/
64
65 /* hashsize must be power of 2: utf_new picks the slot with
      key & (size - 1), which only covers every slot for power-of-2 sizes */
66
67 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
68
69 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
70
71
72 /* utf-symbols for pointer comparison of frequently used strings **************/
   /* All of these are assigned in utf8_init.                                    */
73
74 utf *utf_java_lang_Object;
75
76 utf *utf_java_lang_Class;
77 utf *utf_java_lang_ClassLoader;
78 utf *utf_java_lang_Cloneable;
79 utf *utf_java_lang_SecurityManager;
80 utf *utf_java_lang_String;
81 utf *utf_java_lang_System;
82 utf *utf_java_lang_ThreadGroup;
83 utf *utf_java_io_Serializable;
84
85 utf *utf_java_lang_Throwable;
86 utf *utf_java_lang_VMThrowable;
87 utf *utf_java_lang_Error;
88 utf *utf_java_lang_AbstractMethodError;
89 utf *utf_java_lang_LinkageError;
90 utf *utf_java_lang_NoClassDefFoundError;
91 utf *utf_java_lang_NoSuchMethodError;
92 utf *utf_java_lang_OutOfMemoryError;
93
94 utf *utf_java_lang_Exception;
95 utf *utf_java_lang_ClassCastException;
96 utf *utf_java_lang_ClassNotFoundException;
97 utf *utf_java_lang_IllegalArgumentException;
98 utf *utf_java_lang_IllegalMonitorStateException;
99
100 utf *utf_java_lang_NullPointerException;
101
102 utf* utf_java_lang_Void;
103 utf* utf_java_lang_Boolean;
104 utf* utf_java_lang_Byte;
105 utf* utf_java_lang_Character;
106 utf* utf_java_lang_Short;
107 utf* utf_java_lang_Integer;
108 utf* utf_java_lang_Long;
109 utf* utf_java_lang_Float;
110 utf* utf_java_lang_Double;
111
112 utf *utf_java_lang_StackTraceElement;
113 utf *utf_java_lang_reflect_Constructor;
114 utf *utf_java_lang_reflect_Field;
115 utf *utf_java_lang_reflect_Method;
116 utf *utf_java_util_Vector;
117
118 utf *utf_InnerClasses;                  /* InnerClasses                       */
119 utf *utf_ConstantValue;                 /* ConstantValue                      */
120 utf *utf_Code;                          /* Code                               */
121 utf *utf_Exceptions;                    /* Exceptions                         */
122 utf *utf_LineNumberTable;               /* LineNumberTable                    */
123 utf *utf_SourceFile;                    /* SourceFile                         */
124
125 utf *utf_init;                          /* <init>                             */
126 utf *utf_clinit;                        /* <clinit>                           */
127 utf *utf_clone;                         /* clone                              */
128 utf *utf_finalize;                      /* finalize                           */
129 utf *utf_run;                           /* run                                */
130
131 utf *utf_add;                           /* add                                */
132 utf *utf_remove;                        /* remove                             */
133 utf *utf_removeThread;                  /* removeThread                       */
134 utf *utf_put;                           /* put                                */
135 utf *utf_get;                           /* get                                */
136 utf *utf_value;                         /* value                              */
137
138 utf *utf_fillInStackTrace;              /* fillInStackTrace                   */
139 utf *utf_getSystemClassLoader;          /* getSystemClassLoader               */
140 utf *utf_loadClass;                     /* loadClass                          */
141 utf *utf_printStackTrace;               /* printStackTrace                    */
142
143 utf *utf_Z;                             /* Z                                  */
144 utf *utf_B;                             /* B                                  */
145 utf *utf_C;                             /* C                                  */
146 utf *utf_S;                             /* S                                  */
147 utf *utf_I;                             /* I                                  */
148 utf *utf_J;                             /* J                                  */
149 utf *utf_F;                             /* F                                  */
150 utf *utf_D;                             /* D                                  */
151
152 utf *utf_void__void;                    /* ()V                                */
153 utf *utf_boolean__void;                 /* (Z)V                               */
154 utf *utf_byte__void;                    /* (B)V                               */
155 utf *utf_char__void;                    /* (C)V                               */
156 utf *utf_short__void;                   /* (S)V                               */
157 utf *utf_int__void;                     /* (I)V                               */
158 utf *utf_long__void;                    /* (J)V                               */
159 utf *utf_float__void;                   /* (F)V                               */
160 utf *utf_double__void;                  /* (D)V                               */
161
162 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
163 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
164 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
165 utf *utf_java_lang_Object__java_lang_Object;
                                           /* (Ljava/lang/Object;)Ljava/lang/Object; */
166 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
167 utf *utf_java_lang_String__java_lang_Class;
                                           /* (Ljava/lang/String;)Ljava/lang/Class;  */
168 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
169 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
170
171 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
172 utf *utf_null;                          /* null                               */
173 utf *array_packagename;                 /* "\t<the array package>"            */
174
175
176 /* utf8_init *******************************************************************
177
178    Initializes the utf8 subsystem.
       Allocates the global utf8 hashtable (hashtable_utf) with
       HASHTABLE_UTF_SIZE slots and then interns every frequently used
       symbol, storing the resulting utf pointers in the global utf_*
       variables so they can be compared by pointer later on.

       OUT:
          always true; this function contains no failure path.
179
180 *******************************************************************************/
181
182 bool utf8_init(void)
183 {
184         /* create utf8 hashtable */
185
186         hashtable_utf = NEW(hashtable);
187
188         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
189
190 #if defined(ENABLE_STATISTICS)
            /* account for the slot array of the freshly created table */
191         if (opt_stat)
192                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
193 #endif
194
195         /* create utf-symbols for pointer comparison of frequently used strings */
196
197         utf_java_lang_Object           = utf_new_char("java/lang/Object");
198
199         utf_java_lang_Class            = utf_new_char("java/lang/Class");
200         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
201         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
202         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
203         utf_java_lang_String           = utf_new_char("java/lang/String");
204         utf_java_lang_System           = utf_new_char("java/lang/System");
205         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
206         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
207
            /* the string_* names below are not defined in this part of the
               file (presumably vm/stringlocal.h -- TODO confirm) */

208         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
209         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
210         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
211
212         utf_java_lang_AbstractMethodError =
213                 utf_new_char(string_java_lang_AbstractMethodError);
214
215         utf_java_lang_LinkageError =
216                 utf_new_char(string_java_lang_LinkageError);
217
218         utf_java_lang_NoClassDefFoundError =
219                 utf_new_char(string_java_lang_NoClassDefFoundError);
220
221         utf_java_lang_NoSuchMethodError =
222                 utf_new_char(string_java_lang_NoSuchMethodError);
223
224         utf_java_lang_OutOfMemoryError =
225                 utf_new_char(string_java_lang_OutOfMemoryError);
226
227         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
228
229         utf_java_lang_ClassCastException =
230                 utf_new_char(string_java_lang_ClassCastException);
231
232         utf_java_lang_ClassNotFoundException =
233                 utf_new_char(string_java_lang_ClassNotFoundException);
234
235         utf_java_lang_IllegalArgumentException =
236                 utf_new_char(string_java_lang_IllegalArgumentException);
237
238         utf_java_lang_IllegalMonitorStateException =
239                 utf_new_char(string_java_lang_IllegalMonitorStateException);
240
241         utf_java_lang_NullPointerException =
242                 utf_new_char(string_java_lang_NullPointerException);
243
244         utf_java_lang_Void             = utf_new_char("java/lang/Void");
245         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
246         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
247         utf_java_lang_Character        = utf_new_char("java/lang/Character");
248         utf_java_lang_Short            = utf_new_char("java/lang/Short");
249         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
250         utf_java_lang_Long             = utf_new_char("java/lang/Long");
251         utf_java_lang_Float            = utf_new_char("java/lang/Float");
252         utf_java_lang_Double           = utf_new_char("java/lang/Double");
253
254         utf_java_lang_StackTraceElement =
255                 utf_new_char("java/lang/StackTraceElement");
256
257         utf_java_lang_reflect_Constructor =
258                 utf_new_char("java/lang/reflect/Constructor");
259
260         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
261         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
262         utf_java_util_Vector           = utf_new_char("java/util/Vector");
263
            /* attribute names */

264         utf_InnerClasses               = utf_new_char("InnerClasses");
265         utf_ConstantValue              = utf_new_char("ConstantValue");
266         utf_Code                       = utf_new_char("Code");
267         utf_Exceptions                 = utf_new_char("Exceptions");
268         utf_LineNumberTable            = utf_new_char("LineNumberTable");
269         utf_SourceFile                 = utf_new_char("SourceFile");
270
            /* method names */

271         utf_init                           = utf_new_char("<init>");
272         utf_clinit                         = utf_new_char("<clinit>");
273         utf_clone                      = utf_new_char("clone");
274         utf_finalize                   = utf_new_char("finalize");
275         utf_run                        = utf_new_char("run");
276
277         utf_add                        = utf_new_char("add");
278         utf_remove                     = utf_new_char("remove");
279         utf_removeThread               = utf_new_char("removeThread");
280         utf_put                        = utf_new_char("put");
281         utf_get                        = utf_new_char("get");
282         utf_value                      = utf_new_char("value");
283
284         utf_printStackTrace            = utf_new_char("printStackTrace");
285         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
286         utf_loadClass                  = utf_new_char("loadClass");
287         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
288
            /* single-letter type descriptors */

289         utf_Z                          = utf_new_char("Z");
290         utf_B                          = utf_new_char("B");
291         utf_C                          = utf_new_char("C");
292         utf_S                          = utf_new_char("S");
293         utf_I                          = utf_new_char("I");
294         utf_J                          = utf_new_char("J");
295         utf_F                          = utf_new_char("F");
296         utf_D                          = utf_new_char("D");
297
            /* method descriptors */

298         utf_void__void                 = utf_new_char("()V");
299         utf_boolean__void              = utf_new_char("(Z)V");
300         utf_byte__void                 = utf_new_char("(B)V");
301         utf_char__void                 = utf_new_char("(C)V");
302         utf_short__void                = utf_new_char("(S)V");
303         utf_int__void                  = utf_new_char("(I)V");
304         utf_long__void                 = utf_new_char("(J)V");
305         utf_float__void                = utf_new_char("(F)V");
306         utf_double__void               = utf_new_char("(D)V");
307         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
308         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
309
310         utf_void__java_lang_ClassLoader =
311                 utf_new_char("()Ljava/lang/ClassLoader;");
312
313         utf_java_lang_Object__java_lang_Object =
314                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
315
316         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
317
318         utf_java_lang_String__java_lang_Class =
319                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
320
321         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
322         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
323
            /* special marker symbols; two of them start with a tab character */

324         utf_null                       = utf_new_char("null");
325         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
326         array_packagename              = utf_new_char("\t<the array package>");
327
328         /* everything's ok */
329
330         return true;
331 }
332
333
334 /* utf_hashkey *****************************************************************
335
336    The hashkey is computed from the utf-text by using up to 8
337    characters.  For utf-symbols longer than 15 characters 3 characters
338    are taken from the beginning and the end, 2 characters are taken
339    from the middle.
340
341 *******************************************************************************/
342
343 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
344 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
345
346 u4 utf_hashkey(const char *text, u4 length)
347 {
348         const char *start_pos = text;       /* pointer to utf text                */
349         u4 a;
350
351         switch (length) {
352         case 0: /* empty string */
353                 return 0;
354
355         case 1: return fbs(0);
356         case 2: return fbs(0) ^ nbs(3);
357         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
358         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
359         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
360         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
361         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
362         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
363
364         case 9:
365                 a = fbs(0);
366                 a ^= nbs(1);
367                 a ^= nbs(2);
368                 text++;
369                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
370
371         case 10:
372                 a = fbs(0);
373                 text++;
374                 a ^= nbs(2);
375                 a ^= nbs(3);
376                 a ^= nbs(4);
377                 text++;
378                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
379
380         case 11:
381                 a = fbs(0);
382                 text++;
383                 a ^= nbs(2);
384                 a ^= nbs(3);
385                 a ^= nbs(4);
386                 text++;
387                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
388
389         case 12:
390                 a = fbs(0);
391                 text += 2;
392                 a ^= nbs(2);
393                 a ^= nbs(3);
394                 text++;
395                 a ^= nbs(5);
396                 a ^= nbs(6);
397                 a ^= nbs(7);
398                 text++;
399                 return a ^ nbs(9) ^ nbs(10);
400
401         case 13:
402                 a = fbs(0);
403                 a ^= nbs(1);
404                 text++;
405                 a ^= nbs(3);
406                 a ^= nbs(4);
407                 text += 2;      
408                 a ^= nbs(7);
409                 a ^= nbs(8);
410                 text += 2;
411                 return a ^ nbs(9) ^ nbs(10);
412
413         case 14:
414                 a = fbs(0);
415                 text += 2;      
416                 a ^= nbs(3);
417                 a ^= nbs(4);
418                 text += 2;      
419                 a ^= nbs(7);
420                 a ^= nbs(8);
421                 text += 2;
422                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
423
424         case 15:
425                 a = fbs(0);
426                 text += 2;      
427                 a ^= nbs(3);
428                 a ^= nbs(4);
429                 text += 2;      
430                 a ^= nbs(7);
431                 a ^= nbs(8);
432                 text += 2;
433                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
434
435         default:  /* 3 characters from beginning */
436                 a = fbs(0);
437                 text += 2;
438                 a ^= nbs(3);
439                 a ^= nbs(4);
440
441                 /* 2 characters from middle */
442                 text = start_pos + (length / 2);
443                 a ^= fbs(5);
444                 text += 2;
445                 a ^= nbs(6);    
446
447                 /* 3 characters from end */
448                 text = start_pos + length - 4;
449
450                 a ^= fbs(7);
451                 text++;
452
453                 return a ^ nbs(10) ^ nbs(11);
454     }
455 }
456
457 /* utf_full_hashkey ************************************************************
458
459    This function computes a hash value using all bytes in the string.
460
461    The algorithm is the "One-at-a-time" algorithm as published
462    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
463
464 *******************************************************************************/
465
466 u4 utf_full_hashkey(const char *text, u4 length)
467 {
468         register const unsigned char *p = (const unsigned char *) text;
469         register u4 hash;
470         register u4 i;
471
472         hash = 0;
473         for (i=length; i--;)
474         {
475             hash += *p++;
476             hash += (hash << 10);
477             hash ^= (hash >> 6);
478         }
479         hash += (hash << 3);
480         hash ^= (hash >> 11);
481         hash += (hash << 15);
482
483         return hash;
484 }
485
486 /* unicode_hashkey *************************************************************
487
488    Compute the hashkey of a unicode string.
489
490 *******************************************************************************/
491
492 u4 unicode_hashkey(u2 *text, u2 len)
493 {
494         return utf_hashkey((char *) text, len);
495 }
496
497
498 /* utf_new *********************************************************************
499
500    Creates a new utf-symbol, the text of the symbol is passed as a
501    u1-array. The function searches the utf-hashtable for a utf-symbol
502    with this text. On success the element returned, otherwise a new
503    hashtable element is created.
504
505    If the number of entries in the hashtable exceeds twice the size of
506    the hashtable slots a reorganization of the hashtable is done and
507    the utf symbols are copied to a new hashtable with doubled size.
508
509 *******************************************************************************/
510
511 utf *utf_new(const char *text, u2 length)
512 {
513         u4 key;                             /* hashkey computed from utf-text     */
514         u4 slot;                            /* slot in hashtable                  */
515         utf *u;                             /* hashtable element                  */
516         u2 i;
517
518         LOCK_MONITOR_ENTER(hashtable_utf->header);
519
520 #if defined(ENABLE_STATISTICS)
521         if (opt_stat)
522                 count_utf_new++;
523 #endif
524
525         key  = utf_hashkey(text, length);
526         slot = key & (hashtable_utf->size - 1);
527         u    = hashtable_utf->ptr[slot];
528
529         /* search external hash chain for utf-symbol */
530
531         while (u) {
532                 if (u->blength == length) {
533                         /* compare text of hashtable elements */
534
535                         for (i = 0; i < length; i++)
536                                 if (text[i] != u->text[i])
537                                         goto nomatch;
538                         
539 #if defined(ENABLE_STATISTICS)
540                         if (opt_stat)
541                                 count_utf_new_found++;
542 #endif
543
544                         /* symbol found in hashtable */
545
546                         LOCK_MONITOR_EXIT(hashtable_utf->header);
547
548                         return u;
549                 }
550
551         nomatch:
552                 u = u->hashlink; /* next element in external chain */
553         }
554
555 #if defined(ENABLE_STATISTICS)
556         if (opt_stat)
557                 count_utf_len += sizeof(utf) + length + 1;
558 #endif
559
560         /* location in hashtable found, create new utf element */
561         u = NEW(utf);
562         u->blength  = length;               /* length in bytes of utfstring       */
563         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
564         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
565
566         memcpy(u->text, text, length);      /* copy utf-text                      */
567         u->text[length] = '\0';
568
569         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
570         hashtable_utf->entries++;           /* update number of entries           */
571
572         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
573
574         /* reorganization of hashtable, average length of the external
575            chains is approx. 2 */
576
577                 hashtable *newhash;                              /* the new hashtable */
578                 u4         i;
579                 utf       *u;
580                 utf       *nextu;
581                 u4         slot;
582
583                 /* create new hashtable, double the size */
584
585                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
586
587 #if defined(ENABLE_STATISTICS)
588                 if (opt_stat)
589                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
590 #endif
591
592                 /* transfer elements to new hashtable */
593
594                 for (i = 0; i < hashtable_utf->size; i++) {
595                         u = hashtable_utf->ptr[i];
596
597                         while (u) {
598                                 nextu = u->hashlink;
599                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
600                                                 
601                                 u->hashlink = (utf *) newhash->ptr[slot];
602                                 newhash->ptr[slot] = u;
603
604                                 /* follow link in external hash chain */
605
606                                 u = nextu;
607                         }
608                 }
609         
610                 /* dispose old table */
611
612                 hashtable_free(hashtable_utf);
613
614                 hashtable_utf = newhash;
615         }
616
617         LOCK_MONITOR_EXIT(hashtable_utf->header);
618
619         return u;
620 }
621
622
623 /* utf_new_u2 ******************************************************************
624
625    Make utf symbol from u2 array, if isclassname is true '.' is
626    replaced by '/'.
627
628 *******************************************************************************/
629
630 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
631 {
632         char *buffer;                   /* memory buffer for  unicode characters  */
633         char *pos;                      /* pointer to current position in buffer  */
634         u4 left;                        /* unicode characters left                */
635         u4 buflength;                   /* utf length in bytes of the u2 array    */
636         utf *result;                    /* resulting utf-string                   */
637         int i;          
638
639         /* determine utf length in bytes and allocate memory */
640
641         buflength = u2_utflength(unicode_pos, unicode_length); 
642         buffer    = MNEW(char, buflength);
643  
644         left = buflength;
645         pos  = buffer;
646
647         for (i = 0; i++ < unicode_length; unicode_pos++) {
648                 /* next unicode character */
649                 u2 c = *unicode_pos;
650                 
651                 if ((c != 0) && (c < 0x80)) {
652                         /* 1 character */       
653                         left--;
654                 if ((int) left < 0) break;
655                         /* convert classname */
656                         if (isclassname && c == '.')
657                                 *pos++ = '/';
658                         else
659                                 *pos++ = (char) c;
660
661                 } else if (c < 0x800) {             
662                         /* 2 characters */                              
663                 unsigned char high = c >> 6;
664                 unsigned char low  = c & 0x3F;
665                         left = left - 2;
666                 if ((int) left < 0) break;
667                 *pos++ = high | 0xC0; 
668                 *pos++ = low  | 0x80;     
669
670                 } else {         
671                 /* 3 characters */                              
672                 char low  = c & 0x3f;
673                 char mid  = (c >> 6) & 0x3F;
674                 char high = c >> 12;
675                         left = left - 3;
676                 if ((int) left < 0) break;
677                 *pos++ = high | 0xE0; 
678                 *pos++ = mid  | 0x80;  
679                 *pos++ = low  | 0x80;   
680                 }
681         }
682         
683         /* insert utf-string into symbol-table */
684         result = utf_new(buffer,buflength);
685
686         MFREE(buffer, char, buflength);
687
688         return result;
689 }
690
691
692 /* utf_new_char ****************************************************************
693
694    Creates a new utf symbol, the text for this symbol is passed as a
695    c-string ( = char* ).
696
697 *******************************************************************************/
698
699 utf *utf_new_char(const char *text)
700 {
701         return utf_new(text, strlen(text));
702 }
703
704
705 /* utf_new_char_classname ******************************************************
706
707    Creates a new utf symbol, the text for this symbol is passed as a
708    c-string ( = char* ) "." characters are going to be replaced by
709    "/". Since the above function is used often, this is a separte
710    function, instead of an if.
711
712 *******************************************************************************/
713
714 utf *utf_new_char_classname(const char *text)
715 {
716         if (strchr(text, '.')) {
717                 char *txt = strdup(text);
718                 char *end = txt + strlen(txt);
719                 char *c;
720                 utf *tmpRes;
721
722                 for (c = txt; c < end; c++)
723                         if (*c == '.') *c = '/';
724
725                 tmpRes = utf_new(txt, strlen(txt));
726                 FREE(txt, 0);
727
728                 return tmpRes;
729
730         } else
731                 return utf_new(text, strlen(text));
732 }
733
734
735 /* utf_nextu2 ******************************************************************
736
737    Read the next unicode character from the utf string and increment
738    the utf-string pointer accordingly.
739
740    CAUTION: This function is unsafe for input that was not checked 
741             by is_valid_utf!
742
743 *******************************************************************************/
744
745 u2 utf_nextu2(char **utf_ptr)
746 {
747     /* uncompressed unicode character */
748     u2 unicode_char = 0;
749     /* current position in utf text */  
750     unsigned char *utf = (unsigned char *) (*utf_ptr);
751     /* bytes representing the unicode character */
752     unsigned char ch1, ch2, ch3;
753     /* number of bytes used to represent the unicode character */
754     int len = 0;
755         
756     switch ((ch1 = utf[0]) >> 4) {
757         default: /* 1 byte */
758                 (*utf_ptr)++;
759                 return (u2) ch1;
760         case 0xC: 
761         case 0xD: /* 2 bytes */
762                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
763                         unsigned char high = ch1 & 0x1F;
764                         unsigned char low  = ch2 & 0x3F;
765                         unicode_char = (high << 6) + low;
766                         len = 2;
767                 }
768                 break;
769
770         case 0xE: /* 2 or 3 bytes */
771                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
772                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
773                                 unsigned char low  = ch3 & 0x3f;
774                                 unsigned char mid  = ch2 & 0x3f;
775                                 unsigned char high = ch1 & 0x0f;
776                                 unicode_char = (((high << 6) + mid) << 6) + low;
777                                 len = 3;
778                         } else
779                                 len = 2;                                           
780                 }
781                 break;
782     }
783
784     /* update position in utf-text */
785     *utf_ptr = (char *) (utf + len);
786
787     return unicode_char;
788 }
789
790
791 /* utf_bytes *******************************************************************
792
793    Determine number of bytes (aka. octets) in the utf string.
794
795    IN:
796       u............utf string
797
798    OUT:
799       The number of octets of this utf string.
800           There is _no_ terminating zero included in this count.
801
802 *******************************************************************************/
803
804 u4 utf_bytes(utf *u)
805 {
806         return u->blength;
807 }
808
809 /* utf_get_number_of_u2s_for_buffer ********************************************
810
811    Determine number of UTF-16 u2s in the given UTF-8 buffer
812
813    CAUTION: This function is unsafe for input that was not checked 
814             by is_valid_utf!
815
816    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
817    to an array of u2s (UTF-16) and want to know how many of them you will get.
818    All other uses of this function are probably wrong.
819
820    IN:
821       buffer........points to first char in buffer
822           blength.......number of _bytes_ in the buffer
823
824    OUT:
825       the number of u2s needed to hold this string in UTF-16 encoding.
826           There is _no_ terminating zero included in this count.
827
828    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
829    exception.
830
831 *******************************************************************************/
832
833 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
834 {
835         const char *endpos;                 /* points behind utf string           */
836         const char *utf_ptr;                /* current position in utf text       */
837         u4 len = 0;                         /* number of unicode characters       */
838
839         utf_ptr = buffer;
840         endpos = utf_ptr + blength;
841
842         while (utf_ptr < endpos) {
843                 len++;
844                 /* next unicode character */
845                 utf_nextu2((char **)&utf_ptr);
846         }
847
848         assert(utf_ptr == endpos);
849
850         return len;
851 }
852
853
854 /* utf_get_number_of_u2s *******************************************************
855
856    Determine number of UTF-16 u2s in the utf string.
857
858    CAUTION: This function is unsafe for input that was not checked 
859             by is_valid_utf!
860
861    CAUTION: Use this function *only* when you want to convert a utf string
862    to an array of u2s and want to know how many of them you will get.
863    All other uses of this function are probably wrong.
864
865    IN:
866       u............utf string
867
868    OUT:
869       the number of u2s needed to hold this string in UTF-16 encoding.
870           There is _no_ terminating zero included in this count.
871           XXX 0 if a NullPointerException has been thrown (see below)
872
873 *******************************************************************************/
874
875 u4 utf_get_number_of_u2s(utf *u)
876 {
877         char *endpos;                       /* points behind utf string           */
878         char *utf_ptr;                      /* current position in utf text       */
879         u4 len = 0;                         /* number of unicode characters       */
880
881         /* XXX this is probably not checked by most callers! Review this after */
882         /* the invalid uses of this function have been eliminated */
883         if (!u) {
884                 exceptions_throw_nullpointerexception();
885                 return 0;
886         }
887
888         endpos = UTF_END(u);
889         utf_ptr = u->text;
890
891         while (utf_ptr < endpos) {
892                 len++;
893                 /* next unicode character */
894                 utf_nextu2(&utf_ptr);
895         }
896
897         if (utf_ptr != endpos)
898                 /* string ended abruptly */
899                 throw_cacao_exception_exit(string_java_lang_InternalError,
900                                                                    "Illegal utf8 string");
901
902         return len;
903 }
904
905
#define UNICODE_REPLACEMENT  0xfffd


/* utf8_safe_decode_next *******************************************************

   Decode one step of a (possibly invalid) zero-terminated UTF-8 string.
   This is the single decoder shared by utf8_safe_number_of_u2s and
   utf8_safe_convert_to_u2s, so that counting and converting can never
   disagree.

   IN:
      cursor........in/out: points to the next undecoded byte; advanced
                    past everything this step consumed
      tlimit........text + nbytes, i.e. the position of the terminating zero
      units.........receives the decoded UTF-16 code unit(s)
      truncated.....set to non-zero if a 3- to 6-byte sequence was cut off
                    by the end of the string; the caller must stop decoding
                    after using the (replacement) unit of this step

   OUT:
      0.............the terminating zero was reached, nothing was decoded
      1.............units[0] holds one code unit
      2.............units[0] and units[1] hold a surrogate pair

   Invalid input yields U+fffd. After a malformed sequence, decoding
   resumes at the first byte that is not a continuation byte, or after
   the number of continuation bytes announced by the lead byte,
   whichever comes first.

*******************************************************************************/

static s4 utf8_safe_decode_next(const unsigned char **cursor,
								const unsigned char *tlimit,
								u2 units[2], int *truncated)
{
	const unsigned char *t;
	s4 lead;
	s4 ncont;                           /* continuation bytes announced       */
	s4 got;                             /* continuation bytes actually found  */
	s4 value;                           /* accumulated code point             */

	t = *cursor;
	*truncated = 0;

	lead = *t++;

	if ((lead & 0x80) == 0) {
		/* NUL terminates the string */

		if (lead == 0)
			return 0;

		/* ASCII character, common case */

		units[0] = (u2) lead;
		*cursor = t;
		return 1;
	}

	/* highest bit set, non-ASCII character: classify the lead byte */

	if ((lead & 0xe0) == 0xc0) {        /* 110..... : 2-byte sequence         */
		ncont = 1;
		value = lead & 0x1f;
	}
	else if ((lead & 0xf0) == 0xe0) {   /* 1110.... : 3-byte sequence         */
		ncont = 2;
		value = lead & 0x0f;
	}
	else if ((lead & 0xf8) == 0xf0) {   /* 11110... : 4-byte sequence         */
		ncont = 3;
		value = lead & 0x07;
	}
	else if ((lead & 0xfc) == 0xf8) {   /* 111110.. : 5-byte, always invalid  */
		ncont = 4;
		value = 0;
	}
	else if ((lead & 0xfe) == 0xfc) {   /* 1111110. : 6-byte, always invalid  */
		ncont = 5;
		value = 0;
	}
	else {                              /* stray continuation, 0xfe or 0xff   */
		ncont = 0;
		value = 0;
	}

	/* pessimistic default; overwritten below for well-formed sequences */

	units[0] = UNICODE_REPLACEMENT;

	if (ncont == 0) {
		*cursor = t;
		return 1;
	}

	/* Sequences of 3 or more bytes that do not fit before the end of the
	   string end the decoding. 2-byte sequences are deliberately not
	   checked here: their missing continuation byte is the terminating
	   zero, which is detected below. The remaining length is compared
	   instead of forming t + ncont, which could point more than one
	   element past the end of the string (undefined behaviour). */

	if ((ncont >= 2) && ((tlimit - t) < ncont)) {
		*cursor = t;
		*truncated = 1;
		return 1;
	}

	/* consume continuation bytes (10......) up to the announced number */

	for (got = 0; (got < ncont) && ((*t & 0xc0) == 0x80); got++)
		value = (value << 6) | (*t++ & 0x3f);

	*cursor = t;

	if ((got < ncont) || (ncont > 3) || (value > 0x10FFFF))
		return 1; /* malformed, 5-/6-byte form, or beyond Unicode */

	if (value > 0xFFFF) {
		/* we need surrogates */
		units[0] = (u2) (0xd800 | ((value >> 10) - 0x40));
		units[1] = (u2) (0xdc00 | (value & 0x03ff));
		return 2;
	}

	units[0] = (u2) value; /* 16bit suffice */
	return 1;
}


/* utf8_safe_number_of_u2s *****************************************************

   Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
   (For invalid UTF-8 the U+fffd replacement character will be counted.)

   This function is safe even for invalid UTF-8 strings.

   IN:
      text..........zero-terminated(!) UTF-8 string (may be invalid)
                    must NOT be NULL
      nbytes........strlen(text). (This is needed to completely emulate
                    the RI).

   OUT:
      the number of u2s needed to hold this string in UTF-16 encoding.
      There is _no_ terminating zero included in this count.

*******************************************************************************/

s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes)
{
	const unsigned char *t;
	const unsigned char *tlimit;
	u2  units[2];                       /* decoded units, only counted here   */
	int truncated;
	s4  n;
	s4  len;

	assert(text);
	assert(nbytes >= 0);

	len = 0;
	t = (const unsigned char *) text;
	tlimit = t + nbytes;

	while ((n = utf8_safe_decode_next(&t, tlimit, units, &truncated)) != 0) {
		len += n;

		if (truncated)
			break; /* sequence cut off by the end of the string, stop here */
	}

	return len;
}


/* utf8_safe_convert_to_u2s ****************************************************

   Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
   (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
   Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.

   This function is safe even for invalid UTF-8 strings.

   IN:
      text..........zero-terminated(!) UTF-8 string (may be invalid)
                    must NOT be NULL
      nbytes........strlen(text). (This is needed to completely emulate
                    the RI).
      buffer........a preallocated array of u2s to receive the decoded
                    string. Use utf8_safe_number_of_u2s to get the
                    required number of u2s for allocating this.
                    No terminating zero is written.

*******************************************************************************/

void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer)
{
	const unsigned char *t;
	const unsigned char *tlimit;
	u2  units[2];
	int truncated;
	s4  n;

	assert(text);
	assert(nbytes >= 0);

	t = (const unsigned char *) text;
	tlimit = t + nbytes;

	while ((n = utf8_safe_decode_next(&t, tlimit, units, &truncated)) != 0) {
		*buffer++ = units[0];

		if (n == 2)
			*buffer++ = units[1];

		if (truncated)
			return; /* sequence cut off by the end of the string, stop here */
	}
}
1207
1208
/* u2_utflength ****************************************************************

   Compute how many bytes the given u2 array occupies once encoded as a
   utf string.

   IN:
      text.........first u2 of the array
      u2_length....number of u2s in the array

   OUT:
      the encoded length in bytes:
        1 byte  for 0x0001..0x007f
        2 bytes for 0x0000 and 0x0080..0x07ff
        3 bytes for everything else

*******************************************************************************/

u4 u2_utflength(u2 *text, u4 u2_length)
{
	const u2 *cur  = text;              /* next u2 to look at                 */
	const u2 *stop = text + u2_length;  /* one past the last u2               */
	u4        nbytes = 0;               /* encoded length so far              */

	while (cur != stop) {
		const u2 unit = *cur++;

		if ((unit == 0) || (unit >= 0x80)) {
			/* zero is counted with 2 bytes, never with a single byte */
			nbytes += (unit < 0x800) ? 2 : 3;
		}
		else {
			nbytes += 1;
		}
	}

	return nbytes;
}
1236
1237
1238 /* utf_copy ********************************************************************
1239
1240    Copy the given utf string byte-for-byte to a buffer.
1241
1242    IN:
1243       buffer.......the buffer
1244           u............the utf string
1245
1246 *******************************************************************************/
1247
1248 void utf_copy(char *buffer, utf *u)
1249 {
1250         /* our utf strings are zero-terminated (done by utf_new) */
1251         MCOPY(buffer, u->text, char, u->blength + 1);
1252 }
1253
1254
1255 /* utf_cat *********************************************************************
1256
1257    Append the given utf string byte-for-byte to a buffer.
1258
1259    IN:
1260       buffer.......the buffer
1261           u............the utf string
1262
1263 *******************************************************************************/
1264
1265 void utf_cat(char *buffer, utf *u)
1266 {
1267         /* our utf strings are zero-terminated (done by utf_new) */
1268         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1269 }
1270
1271
1272 /* utf_copy_classname **********************************************************
1273
1274    Copy the given utf classname byte-for-byte to a buffer.
1275    '/' is replaced by '.'
1276
1277    IN:
1278       buffer.......the buffer
1279           u............the utf string
1280
1281 *******************************************************************************/
1282
1283 void utf_copy_classname(char *buffer, utf *u)
1284 {
1285         char *bufptr;
1286         char *srcptr;
1287         char *endptr;
1288         char ch;
1289
1290         bufptr = buffer;
1291         srcptr = u->text;
1292         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1293
1294         while (srcptr != endptr) {
1295                 ch = *srcptr++;
1296                 if (ch == '/')
1297                         ch = '.';
1298                 *bufptr++ = ch;
1299         }
1300 }
1301
1302
1303 /* utf_cat *********************************************************************
1304
1305    Append the given utf classname byte-for-byte to a buffer.
1306    '/' is replaced by '.'
1307
1308    IN:
1309       buffer.......the buffer
1310           u............the utf string
1311
1312 *******************************************************************************/
1313
1314 void utf_cat_classname(char *buffer, utf *u)
1315 {
1316         utf_copy_classname(buffer + strlen(buffer), u);
1317 }
1318
1319 /* utf_display_printable_ascii *************************************************
1320
1321    Write utf symbol to stdout (for debugging purposes).
1322    Non-printable and non-ASCII characters are printed as '?'.
1323
1324 *******************************************************************************/
1325
1326 void utf_display_printable_ascii(utf *u)
1327 {
1328         char *endpos;                       /* points behind utf string           */
1329         char *utf_ptr;                      /* current position in utf text       */
1330
1331         if (u == NULL) {
1332                 printf("NULL");
1333                 fflush(stdout);
1334                 return;
1335         }
1336
1337         endpos = UTF_END(u);
1338         utf_ptr = u->text;
1339
1340         while (utf_ptr < endpos) {
1341                 /* read next unicode character */
1342
1343                 u2 c = utf_nextu2(&utf_ptr);
1344
1345                 if ((c >= 32) && (c <= 127))
1346                         printf("%c", c);
1347                 else
1348                         printf("?");
1349         }
1350
1351         fflush(stdout);
1352 }
1353
1354
1355 /* utf_display_printable_ascii_classname ***************************************
1356
1357    Write utf symbol to stdout with `/' converted to `.' (for debugging
1358    purposes).
1359    Non-printable and non-ASCII characters are printed as '?'.
1360
1361 *******************************************************************************/
1362
1363 void utf_display_printable_ascii_classname(utf *u)
1364 {
1365         char *endpos;                       /* points behind utf string           */
1366         char *utf_ptr;                      /* current position in utf text       */
1367
1368         if (u == NULL) {
1369                 printf("NULL");
1370                 fflush(stdout);
1371                 return;
1372         }
1373
1374         endpos = UTF_END(u);
1375         utf_ptr = u->text;
1376
1377         while (utf_ptr < endpos) {
1378                 /* read next unicode character */
1379
1380                 u2 c = utf_nextu2(&utf_ptr);
1381
1382                 if (c == '/')
1383                         c = '.';
1384
1385                 if ((c >= 32) && (c <= 127))
1386                         printf("%c", c);
1387                 else
1388                         printf("?");
1389         }
1390
1391         fflush(stdout);
1392 }
1393
1394
1395 /* utf_sprint_convert_to_latin1 ************************************************
1396         
1397    Write utf symbol into c-string (for debugging purposes).
1398    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1399    invalid results.
1400
1401 *******************************************************************************/
1402
1403 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1404 {
1405         char *endpos;                       /* points behind utf string           */
1406         char *utf_ptr;                      /* current position in utf text       */
1407         u2 pos = 0;                         /* position in c-string               */
1408
1409         if (!u) {
1410                 strcpy(buffer, "NULL");
1411                 return;
1412         }
1413
1414         endpos = UTF_END(u);
1415         utf_ptr = u->text;
1416
1417         while (utf_ptr < endpos) 
1418                 /* copy next unicode character */       
1419                 buffer[pos++] = utf_nextu2(&utf_ptr);
1420
1421         /* terminate string */
1422         buffer[pos] = '\0';
1423 }
1424
1425
1426 /* utf_sprint_convert_to_latin1_classname **************************************
1427         
1428    Write utf symbol into c-string with `/' converted to `.' (for debugging
1429    purposes).
1430    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1431    invalid results.
1432
1433 *******************************************************************************/
1434
1435 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1436 {
1437         char *endpos;                       /* points behind utf string           */
1438         char *utf_ptr;                      /* current position in utf text       */
1439         u2 pos = 0;                         /* position in c-string               */
1440
1441         if (!u) {
1442                 strcpy(buffer, "NULL");
1443                 return;
1444         }
1445
1446         endpos = UTF_END(u);
1447         utf_ptr = u->text;
1448
1449         while (utf_ptr < endpos) {
1450                 /* copy next unicode character */       
1451                 u2 c = utf_nextu2(&utf_ptr);
1452                 if (c == '/') c = '.';
1453                 buffer[pos++] = c;
1454         }
1455
1456         /* terminate string */
1457         buffer[pos] = '\0';
1458 }
1459
1460
1461 /* utf_strcat_convert_to_latin1 ************************************************
1462         
1463    Like libc strcat, but uses an utf8 string.
1464    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1465    invalid results.
1466
1467 *******************************************************************************/
1468
1469 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1470 {
1471         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1472 }
1473
1474
1475 /* utf_strcat_convert_to_latin1_classname **************************************
1476         
1477    Like libc strcat, but uses an utf8 string.
1478    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1479    invalid results.
1480
1481 *******************************************************************************/
1482
1483 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1484 {
1485         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1486 }
1487
1488
1489 /* utf_fprint_printable_ascii **************************************************
1490         
1491    Write utf symbol into file.
1492    Non-printable and non-ASCII characters are printed as '?'.
1493
1494 *******************************************************************************/
1495
1496 void utf_fprint_printable_ascii(FILE *file, utf *u)
1497 {
1498         char *endpos;                       /* points behind utf string           */
1499         char *utf_ptr;                      /* current position in utf text       */
1500
1501         if (!u)
1502                 return;
1503
1504         endpos = UTF_END(u);
1505         utf_ptr = u->text;
1506
1507         while (utf_ptr < endpos) { 
1508                 /* read next unicode character */                
1509                 u2 c = utf_nextu2(&utf_ptr);                            
1510
1511                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1512                 else fprintf(file, "?");
1513         }
1514 }
1515
1516
1517 /* utf_fprint_printable_ascii_classname ****************************************
1518         
1519    Write utf symbol into file with `/' converted to `.'.
1520    Non-printable and non-ASCII characters are printed as '?'.
1521
1522 *******************************************************************************/
1523
1524 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1525 {
1526         char *endpos;                       /* points behind utf string           */
1527         char *utf_ptr;                      /* current position in utf text       */
1528
1529     if (!u)
1530                 return;
1531
1532         endpos = UTF_END(u);
1533         utf_ptr = u->text;
1534
1535         while (utf_ptr < endpos) { 
1536                 /* read next unicode character */                
1537                 u2 c = utf_nextu2(&utf_ptr);                            
1538                 if (c == '/') c = '.';
1539
1540                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1541                 else fprintf(file, "?");
1542         }
1543 }
1544
1545
/* is_valid_utf ****************************************************************

   Return true if the given byte range is acceptable as a utf string.

   utf_ptr...points to first character
   end_pos...points after last character

   Accepted are
     - single bytes 0x01..0x7f,
     - 2-byte sequences 110xxxxx 10xxxxxx (including the encoding of zero),
     - 3-byte sequences 1110xxxx 10xxxxxx 10xxxxxx that do not encode zero.

   Rejected are a raw 0x00 byte, lead bytes of longer sequences, stray
   continuation bytes, sequences cut off by end_pos, and end_pos < utf_ptr.

   Deliberately NOT rejected: overlong encodings, surrogates, and the
   code points 0xfffe/0xffff (Sun Java seems to allow all of them).

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	const unsigned char *p;             /* next byte to examine               */
	const unsigned char *limit;         /* one past the last byte             */

	if (end_pos < utf_ptr)
		return false;

	p     = (const unsigned char *) utf_ptr;
	limit = (const unsigned char *) end_pos;

	while (p < limit) {
		unsigned int  lead = *p++;
		unsigned long codepoint;
		int           ncont;            /* continuation bytes to expect       */
		int           k;

		if (lead == 0)
			return false;               /* 0x00 is not allowed */

		if (lead < 0x80)
			continue;                   /* ASCII */

		if ((lead & 0xe0) == 0xc0)
			ncont = 1;                  /* 110x xxxx */
		else if ((lead & 0xf0) == 0xe0)
			ncont = 2;                  /* 1110 xxxx */
		else
			return false;               /* longer forms (Java limitation) or
			                               invalid leading byte */

		if ((limit - p) < ncont)
			return false;               /* missing bytes */

		codepoint = lead & (0x3f >> ncont);

		for (k = 0; k < ncont; k++) {
			unsigned int cont = *p++;

			if ((cont & 0xc0) != 0x80)  /* 10xx xxxx */
				return false;

			codepoint = (codepoint << 6) | (cont & 0x3f);
		}

		/* Java special: zero may only be written in the 2-byte form */

		if ((codepoint == 0) && (ncont != 1))
			return false;
	}

	return true;
}
1611
1612
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters below 0x20, and the encoded zero 0xC0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf! This
   function does not check well-formedness itself. The result depends
   only on the bytes inside [utf_ptr, end_pos); the byte at end_pos is
   never read.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                         /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;                     /* disallow control characters */

		/* disallow zero (encoded as 0xC0 0x80); only look at the following
		   byte if it still belongs to the string */

		if (c == 0xc0 && utf_ptr < end_pos
			&& (unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
1640
1641 bool is_valid_name_utf(utf *u)
1642 {
1643         return is_valid_name(u->text, UTF_END(u));
1644 }
1645
1646
1647 /* utf_show ********************************************************************
1648
1649    Writes the utf symbols in the utfhash to stdout and displays the
1650    number of external hash chains grouped according to the chainlength
1651    (for debugging purposes).
1652
1653 *******************************************************************************/
1654
1655 #if !defined(NDEBUG)
1656 void utf_show(void)
1657 {
1658
1659 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1660
1661         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1662         u4 max_chainlength = 0;      /* maximum length of the chains */
1663         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1664         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1665         u4 i;
1666
1667         printf("UTF-HASH:\n");
1668
1669         /* show element of utf-hashtable */
1670
1671         for (i = 0; i < hashtable_utf->size; i++) {
1672                 utf *u = hashtable_utf->ptr[i];
1673
1674                 if (u) {
1675                         printf("SLOT %d: ", (int) i);
1676
1677                         while (u) {
1678                                 printf("'");
1679                                 utf_display_printable_ascii(u);
1680                                 printf("' ");
1681                                 u = u->hashlink;
1682                         }       
1683                         printf("\n");
1684                 }
1685         }
1686
1687         printf("UTF-HASH: %d slots for %d entries\n", 
1688                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1689
1690         if (hashtable_utf->entries == 0)
1691                 return;
1692
1693         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1694
1695         for (i=0;i<CHAIN_LIMIT;i++)
1696                 chain_count[i]=0;
1697
1698         /* count numbers of hashchains according to their length */
1699         for (i=0; i<hashtable_utf->size; i++) {
1700                   
1701                 utf *u = (utf*) hashtable_utf->ptr[i];
1702                 u4 chain_length = 0;
1703
1704                 /* determine chainlength */
1705                 while (u) {
1706                         u = u->hashlink;
1707                         chain_length++;
1708                 }
1709
1710                 /* update sum of all chainlengths */
1711                 sum_chainlength+=chain_length;
1712
1713                 /* determine the maximum length of the chains */
1714                 if (chain_length>max_chainlength)
1715                         max_chainlength = chain_length;
1716
1717                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1718                 if (chain_length>=CHAIN_LIMIT) {
1719                         beyond_limit+=chain_length;
1720                         chain_length=CHAIN_LIMIT-1;
1721                 }
1722
1723                 /* update number of hashchains of current length */
1724                 chain_count[chain_length]++;
1725         }
1726
1727         /* display results */  
1728         for (i=1;i<CHAIN_LIMIT-1;i++) 
1729                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1730           
1731         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1732
1733
1734         printf("max. chainlength:%5d\n",max_chainlength);
1735
1736         /* avg. chainlength = sum of chainlengths / number of chains */
1737         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1738 }
1739 #endif /* !defined(NDEBUG) */
1740
1741
1742 /*
1743  * These are local overrides for various environment variables in Emacs.
1744  * Please do not remove this and leave it at the end of the file, where
1745  * Emacs will automagically detect them.
1746  * ---------------------------------------------------------------------
1747  * Local variables:
1748  * mode: c
1749  * indent-tabs-mode: t
1750  * c-basic-offset: 4
1751  * tab-width: 4
1752  * End:
1753  * vim:noexpandtab:sw=4:ts=4:
1754  */