* src/threads/native/threads.c: Rewritten such that threadobject
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf8.c - utf8 string functions
2
3    Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
6    J. Wenninger, Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23    02110-1301, USA.
24
25    Contact: cacao@cacaojvm.org
26
27    Authors: Reinhard Grafl
28             Mark Probst
29             Andreas Krall
30             Christian Thalinger
31             Edwin Steiner
32
33    $Id: utf8.c 6228 2006-12-26 19:56:58Z twisti $
34
35 */
36
37
38 #include "config.h"
39
40 #include <string.h>
41 #include <assert.h>
42
43 #include "vm/types.h"
44
45 #include "mm/memory.h"
46
47 #if defined(ENABLE_THREADS)
48 # include "threads/native/lock.h"
49 #else
50 # include "threads/none/lock.h"
51 #endif
52
53 #include "vm/builtin.h"
54 #include "vm/exceptions.h"
55 #include "vm/hashtable.h"
56 #include "vm/options.h"
57 #include "vm/statistics.h"
58 #include "vm/stringlocal.h"
59 #include "vm/utf8.h"
60
61
62 /* global variables ***********************************************************/
63
64 /* hashsize must be power of 2 */
65
66 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
67
68 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
69
70
71 /* utf-symbols for pointer comparison of frequently used strings **************/
72
73 utf *utf_java_lang_Object;
74
75 utf *utf_java_lang_Class;
76 utf *utf_java_lang_ClassLoader;
77 utf *utf_java_lang_Cloneable;
78 utf *utf_java_lang_SecurityManager;
79 utf *utf_java_lang_String;
80 utf *utf_java_lang_System;
81 utf *utf_java_lang_ThreadGroup;
82 utf *utf_java_io_Serializable;
83
84 utf *utf_java_lang_Throwable;
85 utf *utf_java_lang_VMThrowable;
86 utf *utf_java_lang_Error;
87 utf *utf_java_lang_AbstractMethodError;
88 utf *utf_java_lang_LinkageError;
89 utf *utf_java_lang_NoClassDefFoundError;
90 utf *utf_java_lang_NoSuchMethodError;
91 utf *utf_java_lang_OutOfMemoryError;
92
93 utf *utf_java_lang_Exception;
94 utf *utf_java_lang_ClassCastException;
95 utf *utf_java_lang_ClassNotFoundException;
96 utf *utf_java_lang_IllegalArgumentException;
97 utf *utf_java_lang_IllegalMonitorStateException;
98
99 utf *utf_java_lang_NullPointerException;
100
101 utf* utf_java_lang_Void;
102 utf* utf_java_lang_Boolean;
103 utf* utf_java_lang_Byte;
104 utf* utf_java_lang_Character;
105 utf* utf_java_lang_Short;
106 utf* utf_java_lang_Integer;
107 utf* utf_java_lang_Long;
108 utf* utf_java_lang_Float;
109 utf* utf_java_lang_Double;
110
111 utf *utf_java_lang_StackTraceElement;
112 utf *utf_java_lang_reflect_Constructor;
113 utf *utf_java_lang_reflect_Field;
114 utf *utf_java_lang_reflect_Method;
115 utf *utf_java_util_Vector;
116
117 utf *utf_InnerClasses;                  /* InnerClasses                       */
118 utf *utf_ConstantValue;                 /* ConstantValue                      */
119 utf *utf_Code;                          /* Code                               */
120 utf *utf_Exceptions;                    /* Exceptions                         */
121 utf *utf_LineNumberTable;               /* LineNumberTable                    */
122 utf *utf_SourceFile;                    /* SourceFile                         */
123
124 #if defined(ENABLE_JAVASE)
125 utf *utf_EnclosingMethod;
126 utf *utf_Signature;
127 utf *utf_RuntimeVisibleAnnotations;
128 utf *utf_StackMapTable;
129 #endif
130
131 utf *utf_init;                          /* <init>                             */
132 utf *utf_clinit;                        /* <clinit>                           */
133 utf *utf_clone;                         /* clone                              */
134 utf *utf_finalize;                      /* finalize                           */
135 utf *utf_run;                           /* run                                */
136
137 utf *utf_add;
138 utf *utf_remove;
139 utf *utf_addThread;
140 utf *utf_removeThread;
141 utf *utf_put;
142 utf *utf_get;
143 utf *utf_value;
144
145 utf *utf_fillInStackTrace;
146 utf *utf_getSystemClassLoader;
147 utf *utf_loadClass;
148 utf *utf_printStackTrace;
149
150 utf *utf_Z;                             /* Z                                  */
151 utf *utf_B;                             /* B                                  */
152 utf *utf_C;                             /* C                                  */
153 utf *utf_S;                             /* S                                  */
154 utf *utf_I;                             /* I                                  */
155 utf *utf_J;                             /* J                                  */
156 utf *utf_F;                             /* F                                  */
157 utf *utf_D;                             /* D                                  */
158
159 utf *utf_void__void;                    /* ()V                                */
160 utf *utf_boolean__void;                 /* (Z)V                               */
161 utf *utf_byte__void;                    /* (B)V                               */
162 utf *utf_char__void;                    /* (C)V                               */
163 utf *utf_short__void;                   /* (S)V                               */
164 utf *utf_int__void;                     /* (I)V                               */
165 utf *utf_long__void;                    /* (J)V                               */
166 utf *utf_float__void;                   /* (F)V                               */
167 utf *utf_double__void;                  /* (D)V                               */
168
169 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
170 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
171 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
172 utf *utf_java_lang_Object__java_lang_Object;
173 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
174 utf *utf_java_lang_String__java_lang_Class;
175 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
176 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
177
178 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
179 utf *utf_null;
180 utf *array_packagename;
181
182
183 /* utf_init ********************************************************************
184
185    Initializes the utf8 subsystem.
186
187 *******************************************************************************/
188
189 bool utf8_init(void)
190 {
191         /* create utf8 hashtable */
192
193         hashtable_utf = NEW(hashtable);
194
195         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
196
197 #if defined(ENABLE_STATISTICS)
198         if (opt_stat)
199                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
200 #endif
201
202         /* create utf-symbols for pointer comparison of frequently used strings */
203
204         utf_java_lang_Object           = utf_new_char("java/lang/Object");
205
206         utf_java_lang_Class            = utf_new_char("java/lang/Class");
207         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
208         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
209         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
210         utf_java_lang_String           = utf_new_char("java/lang/String");
211         utf_java_lang_System           = utf_new_char("java/lang/System");
212         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
213         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
214
215         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
216         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
217         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
218
219         utf_java_lang_AbstractMethodError =
220                 utf_new_char(string_java_lang_AbstractMethodError);
221
222         utf_java_lang_LinkageError =
223                 utf_new_char(string_java_lang_LinkageError);
224
225         utf_java_lang_NoClassDefFoundError =
226                 utf_new_char(string_java_lang_NoClassDefFoundError);
227
228         utf_java_lang_NoSuchMethodError =
229                 utf_new_char(string_java_lang_NoSuchMethodError);
230
231         utf_java_lang_OutOfMemoryError =
232                 utf_new_char(string_java_lang_OutOfMemoryError);
233
234         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
235
236         utf_java_lang_ClassCastException =
237                 utf_new_char(string_java_lang_ClassCastException);
238
239         utf_java_lang_ClassNotFoundException =
240                 utf_new_char(string_java_lang_ClassNotFoundException);
241
242         utf_java_lang_IllegalArgumentException =
243                 utf_new_char(string_java_lang_IllegalArgumentException);
244
245         utf_java_lang_IllegalMonitorStateException =
246                 utf_new_char(string_java_lang_IllegalMonitorStateException);
247
248         utf_java_lang_NullPointerException =
249                 utf_new_char(string_java_lang_NullPointerException);
250
251         utf_java_lang_Void             = utf_new_char("java/lang/Void");
252         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
253         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
254         utf_java_lang_Character        = utf_new_char("java/lang/Character");
255         utf_java_lang_Short            = utf_new_char("java/lang/Short");
256         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
257         utf_java_lang_Long             = utf_new_char("java/lang/Long");
258         utf_java_lang_Float            = utf_new_char("java/lang/Float");
259         utf_java_lang_Double           = utf_new_char("java/lang/Double");
260
261         utf_java_lang_StackTraceElement =
262                 utf_new_char("java/lang/StackTraceElement");
263
264         utf_java_lang_reflect_Constructor =
265                 utf_new_char("java/lang/reflect/Constructor");
266
267         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
268         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
269         utf_java_util_Vector           = utf_new_char("java/util/Vector");
270
271         utf_InnerClasses               = utf_new_char("InnerClasses");
272         utf_ConstantValue              = utf_new_char("ConstantValue");
273         utf_Code                       = utf_new_char("Code");
274         utf_Exceptions                 = utf_new_char("Exceptions");
275         utf_LineNumberTable            = utf_new_char("LineNumberTable");
276         utf_SourceFile                 = utf_new_char("SourceFile");
277
278 #if defined(ENABLE_JAVASE)
279         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
280         utf_Signature                  = utf_new_char("Signature");
281         utf_RuntimeVisibleAnnotations  = utf_new_char("RuntimeVisibleAnnotations");
282         utf_StackMapTable              = utf_new_char("StackMapTable");
283 #endif
284
285         utf_init                           = utf_new_char("<init>");
286         utf_clinit                         = utf_new_char("<clinit>");
287         utf_clone                      = utf_new_char("clone");
288         utf_finalize                   = utf_new_char("finalize");
289         utf_run                        = utf_new_char("run");
290
291         utf_add                        = utf_new_char("add");
292         utf_remove                     = utf_new_char("remove");
293         utf_addThread                  = utf_new_char("addThread");
294         utf_removeThread               = utf_new_char("removeThread");
295         utf_put                        = utf_new_char("put");
296         utf_get                        = utf_new_char("get");
297         utf_value                      = utf_new_char("value");
298
299         utf_printStackTrace            = utf_new_char("printStackTrace");
300         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
301         utf_loadClass                  = utf_new_char("loadClass");
302         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
303
304         utf_Z                          = utf_new_char("Z");
305         utf_B                          = utf_new_char("B");
306         utf_C                          = utf_new_char("C");
307         utf_S                          = utf_new_char("S");
308         utf_I                          = utf_new_char("I");
309         utf_J                          = utf_new_char("J");
310         utf_F                          = utf_new_char("F");
311         utf_D                          = utf_new_char("D");
312
313         utf_void__void                 = utf_new_char("()V");
314         utf_boolean__void              = utf_new_char("(Z)V");
315         utf_byte__void                 = utf_new_char("(B)V");
316         utf_char__void                 = utf_new_char("(C)V");
317         utf_short__void                = utf_new_char("(S)V");
318         utf_int__void                  = utf_new_char("(I)V");
319         utf_long__void                 = utf_new_char("(J)V");
320         utf_float__void                = utf_new_char("(F)V");
321         utf_double__void               = utf_new_char("(D)V");
322         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
323         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
324
325         utf_void__java_lang_ClassLoader =
326                 utf_new_char("()Ljava/lang/ClassLoader;");
327
328         utf_java_lang_Object__java_lang_Object =
329                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
330
331         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
332
333         utf_java_lang_String__java_lang_Class =
334                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
335
336         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
337         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
338
339         utf_null                       = utf_new_char("null");
340         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
341         array_packagename              = utf_new_char("\t<the array package>");
342
343         /* everything's ok */
344
345         return true;
346 }
347
348
/* utf_hashkey *****************************************************************

   The hashkey is computed from the utf-text by using up to 8
   characters.  For utf-symbols longer than 15 characters 3 characters
   are taken from the beginning and the end, 2 characters are taken
   from the middle.

   Every selected byte is shifted left by a position-dependent amount
   and all shifted bytes are XORed together.

   IN:
      text.........pointer to the utf text (need not be zero-terminated)
      length.......number of bytes of the text

   RETURN VALUE:
      the hashkey; 0 for the empty string

   NOTE: Earlier revisions built the key from expressions such as
   `fbs(0) ^ nbs(3) ^ nbs(5)', which pre-increment `text' several
   times without an intervening sequence point.  That is undefined
   behaviour in C.  The bytes are now addressed by explicit index,
   using the byte/shift pairs of a left-to-right evaluation.

   NOTE: The bytes are read as plain `char', so for bytes >= 0x80 the
   key depends on the signedness of `char' on the target platform (as
   it did before).

*******************************************************************************/

/* Legacy helpers, retained unchanged in case other code in this file still
   uses them.  They modify `text'; never use more than one of them (or one
   together with any other access to `text') in a single expression. */

#define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
#define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */

/* byte at index idx of text, shifted left by shift bits; no side effects */

#define UTF_HASHKEY_BYTE(idx, shift)    ((u4) text[(idx)] << (shift))

u4 utf_hashkey(const char *text, u4 length)
{
	u4 mid;                             /* index of the first middle byte     */

	switch (length) {
	case 0: /* empty string */
		return 0;

	case 1:
		return UTF_HASHKEY_BYTE(0, 0);

	case 2:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 3);

	case 3:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 3)
			 ^ UTF_HASHKEY_BYTE(2, 5);

	case 4:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 2)
			 ^ UTF_HASHKEY_BYTE(2, 4) ^ UTF_HASHKEY_BYTE(3, 6);

	case 5:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 2)
			 ^ UTF_HASHKEY_BYTE(2, 3) ^ UTF_HASHKEY_BYTE(3, 4)
			 ^ UTF_HASHKEY_BYTE(4, 6);

	case 6:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
			 ^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(3, 3)
			 ^ UTF_HASHKEY_BYTE(4, 5) ^ UTF_HASHKEY_BYTE(5, 6);

	case 7:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
			 ^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(3, 3)
			 ^ UTF_HASHKEY_BYTE(4, 4) ^ UTF_HASHKEY_BYTE(5, 5)
			 ^ UTF_HASHKEY_BYTE(6, 6);

	case 8:
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
			 ^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(3, 3)
			 ^ UTF_HASHKEY_BYTE(4, 4) ^ UTF_HASHKEY_BYTE(5, 5)
			 ^ UTF_HASHKEY_BYTE(6, 6) ^ UTF_HASHKEY_BYTE(7, 7);

	case 9: /* byte 3 is skipped */
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
			 ^ UTF_HASHKEY_BYTE(2, 2) ^ UTF_HASHKEY_BYTE(4, 4)
			 ^ UTF_HASHKEY_BYTE(5, 5) ^ UTF_HASHKEY_BYTE(6, 6)
			 ^ UTF_HASHKEY_BYTE(7, 7) ^ UTF_HASHKEY_BYTE(8, 8);

	case 10: /* bytes 1 and 5 are skipped */
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(2, 2)
			 ^ UTF_HASHKEY_BYTE(3, 3) ^ UTF_HASHKEY_BYTE(4, 4)
			 ^ UTF_HASHKEY_BYTE(6, 6) ^ UTF_HASHKEY_BYTE(7, 7)
			 ^ UTF_HASHKEY_BYTE(8, 8) ^ UTF_HASHKEY_BYTE(9, 9);

	case 11: /* bytes 1 and 5 are skipped */
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(2, 2)
			 ^ UTF_HASHKEY_BYTE(3, 3) ^ UTF_HASHKEY_BYTE(4, 4)
			 ^ UTF_HASHKEY_BYTE(6, 6) ^ UTF_HASHKEY_BYTE(7, 7)
			 ^ UTF_HASHKEY_BYTE(8, 8) ^ UTF_HASHKEY_BYTE(9, 9)
			 ^ UTF_HASHKEY_BYTE(10, 10);

	case 12: /* bytes 1, 2, 5 and 9 are skipped */
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(3, 2)
			 ^ UTF_HASHKEY_BYTE(4, 3) ^ UTF_HASHKEY_BYTE(6, 5)
			 ^ UTF_HASHKEY_BYTE(7, 6) ^ UTF_HASHKEY_BYTE(8, 7)
			 ^ UTF_HASHKEY_BYTE(10, 9) ^ UTF_HASHKEY_BYTE(11, 10);

	case 13: /* bytes 2, 5, 6, 9 and 10 are skipped */
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(1, 1)
			 ^ UTF_HASHKEY_BYTE(3, 3) ^ UTF_HASHKEY_BYTE(4, 4)
			 ^ UTF_HASHKEY_BYTE(7, 7) ^ UTF_HASHKEY_BYTE(8, 8)
			 ^ UTF_HASHKEY_BYTE(11, 9) ^ UTF_HASHKEY_BYTE(12, 10);

	case 14:
	case 15: /* both lengths use the same byte/shift pairs */
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(3, 3)
			 ^ UTF_HASHKEY_BYTE(4, 4) ^ UTF_HASHKEY_BYTE(7, 7)
			 ^ UTF_HASHKEY_BYTE(8, 8) ^ UTF_HASHKEY_BYTE(11, 9)
			 ^ UTF_HASHKEY_BYTE(12, 10) ^ UTF_HASHKEY_BYTE(13, 11);

	default:
		mid = length / 2;

		/* 3 characters from beginning */
		return UTF_HASHKEY_BYTE(0, 0) ^ UTF_HASHKEY_BYTE(3, 3)
			 ^ UTF_HASHKEY_BYTE(4, 4)

			 /* 2 characters from middle */
			 ^ UTF_HASHKEY_BYTE(mid, 5) ^ UTF_HASHKEY_BYTE(mid + 3, 6)

			 /* 3 characters from end */
			 ^ UTF_HASHKEY_BYTE(length - 4, 7)
			 ^ UTF_HASHKEY_BYTE(length - 2, 10)
			 ^ UTF_HASHKEY_BYTE(length - 1, 11);
	}
}

#undef UTF_HASHKEY_BYTE
471
/* utf_full_hashkey ************************************************************

   Computes a hash value over every byte of the given text (unlike
   utf_hashkey, which only samples up to 8 bytes).

   The algorithm is the "One-at-a-time" algorithm as published
   by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.

   IN:
      text.........pointer to the bytes to hash
      length.......number of bytes to hash

   RETURN VALUE:
      the 32-bit hash value; 0 for length 0

*******************************************************************************/

u4 utf_full_hashkey(const char *text, u4 length)
{
	const unsigned char *bytes = (const unsigned char *) text;
	u4                   acc   = 0;
	u4                   pos;

	/* mixing step, once per input byte (bytes are taken as unsigned) */

	for (pos = 0; pos < length; pos++) {
		acc += bytes[pos];
		acc += (acc << 10);
		acc ^= (acc >> 6);
	}

	/* final avalanche */

	acc += (acc << 3);
	acc ^= (acc >> 11);
	acc += (acc << 15);

	return acc;
}
500
501 /* unicode_hashkey *************************************************************
502
503    Compute the hashkey of a unicode string.
504
505 *******************************************************************************/
506
507 u4 unicode_hashkey(u2 *text, u2 len)
508 {
509         return utf_hashkey((char *) text, len);
510 }
511
512
513 /* utf_new *********************************************************************
514
515    Creates a new utf-symbol, the text of the symbol is passed as a
516    u1-array. The function searches the utf-hashtable for a utf-symbol
517    with this text. On success the element returned, otherwise a new
518    hashtable element is created.
519
520    If the number of entries in the hashtable exceeds twice the size of
521    the hashtable slots a reorganization of the hashtable is done and
522    the utf symbols are copied to a new hashtable with doubled size.
523
524 *******************************************************************************/
525
526 utf *utf_new(const char *text, u2 length)
527 {
528         u4 key;                             /* hashkey computed from utf-text     */
529         u4 slot;                            /* slot in hashtable                  */
530         utf *u;                             /* hashtable element                  */
531         u2 i;
532
533         LOCK_MONITOR_ENTER(hashtable_utf->header);
534
535 #if defined(ENABLE_STATISTICS)
536         if (opt_stat)
537                 count_utf_new++;
538 #endif
539
540         key  = utf_hashkey(text, length);
541         slot = key & (hashtable_utf->size - 1);
542         u    = hashtable_utf->ptr[slot];
543
544         /* search external hash chain for utf-symbol */
545
546         while (u) {
547                 if (u->blength == length) {
548                         /* compare text of hashtable elements */
549
550                         for (i = 0; i < length; i++)
551                                 if (text[i] != u->text[i])
552                                         goto nomatch;
553                         
554 #if defined(ENABLE_STATISTICS)
555                         if (opt_stat)
556                                 count_utf_new_found++;
557 #endif
558
559                         /* symbol found in hashtable */
560
561                         LOCK_MONITOR_EXIT(hashtable_utf->header);
562
563                         return u;
564                 }
565
566         nomatch:
567                 u = u->hashlink; /* next element in external chain */
568         }
569
570 #if defined(ENABLE_STATISTICS)
571         if (opt_stat)
572                 count_utf_len += sizeof(utf) + length + 1;
573 #endif
574
575         /* location in hashtable found, create new utf element */
576         u = NEW(utf);
577         u->blength  = length;               /* length in bytes of utfstring       */
578         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
579         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
580
581         memcpy(u->text, text, length);      /* copy utf-text                      */
582         u->text[length] = '\0';
583
584         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
585         hashtable_utf->entries++;           /* update number of entries           */
586
587         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
588
589         /* reorganization of hashtable, average length of the external
590            chains is approx. 2 */
591
592                 hashtable *newhash;                              /* the new hashtable */
593                 u4         i;
594                 utf       *u;
595                 utf       *nextu;
596                 u4         slot;
597
598                 /* create new hashtable, double the size */
599
600                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
601
602 #if defined(ENABLE_STATISTICS)
603                 if (opt_stat)
604                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
605 #endif
606
607                 /* transfer elements to new hashtable */
608
609                 for (i = 0; i < hashtable_utf->size; i++) {
610                         u = hashtable_utf->ptr[i];
611
612                         while (u) {
613                                 nextu = u->hashlink;
614                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
615                                                 
616                                 u->hashlink = (utf *) newhash->ptr[slot];
617                                 newhash->ptr[slot] = u;
618
619                                 /* follow link in external hash chain */
620
621                                 u = nextu;
622                         }
623                 }
624         
625                 /* dispose old table */
626
627                 hashtable_free(hashtable_utf);
628
629                 hashtable_utf = newhash;
630         }
631
632         LOCK_MONITOR_EXIT(hashtable_utf->header);
633
634         return u;
635 }
636
637
638 /* utf_new_u2 ******************************************************************
639
640    Make utf symbol from u2 array, if isclassname is true '.' is
641    replaced by '/'.
642
643 *******************************************************************************/
644
645 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
646 {
647         char *buffer;                   /* memory buffer for  unicode characters  */
648         char *pos;                      /* pointer to current position in buffer  */
649         u4 left;                        /* unicode characters left                */
650         u4 buflength;                   /* utf length in bytes of the u2 array    */
651         utf *result;                    /* resulting utf-string                   */
652         int i;          
653
654         /* determine utf length in bytes and allocate memory */
655
656         buflength = u2_utflength(unicode_pos, unicode_length); 
657         buffer    = MNEW(char, buflength);
658  
659         left = buflength;
660         pos  = buffer;
661
662         for (i = 0; i++ < unicode_length; unicode_pos++) {
663                 /* next unicode character */
664                 u2 c = *unicode_pos;
665                 
666                 if ((c != 0) && (c < 0x80)) {
667                         /* 1 character */       
668                         left--;
669                 if ((int) left < 0) break;
670                         /* convert classname */
671                         if (isclassname && c == '.')
672                                 *pos++ = '/';
673                         else
674                                 *pos++ = (char) c;
675
676                 } else if (c < 0x800) {             
677                         /* 2 characters */                              
678                 unsigned char high = c >> 6;
679                 unsigned char low  = c & 0x3F;
680                         left = left - 2;
681                 if ((int) left < 0) break;
682                 *pos++ = high | 0xC0; 
683                 *pos++ = low  | 0x80;     
684
685                 } else {         
686                 /* 3 characters */                              
687                 char low  = c & 0x3f;
688                 char mid  = (c >> 6) & 0x3F;
689                 char high = c >> 12;
690                         left = left - 3;
691                 if ((int) left < 0) break;
692                 *pos++ = high | 0xE0; 
693                 *pos++ = mid  | 0x80;  
694                 *pos++ = low  | 0x80;   
695                 }
696         }
697         
698         /* insert utf-string into symbol-table */
699         result = utf_new(buffer,buflength);
700
701         MFREE(buffer, char, buflength);
702
703         return result;
704 }
705
706
707 /* utf_new_char ****************************************************************
708
709    Creates a new utf symbol, the text for this symbol is passed as a
710    c-string ( = char* ).
711
712 *******************************************************************************/
713
714 utf *utf_new_char(const char *text)
715 {
716         return utf_new(text, strlen(text));
717 }
718
719
720 /* utf_new_char_classname ******************************************************
721
722    Creates a new utf symbol, the text for this symbol is passed as a
723    c-string ( = char* ) "." characters are going to be replaced by
724    "/". Since the above function is used often, this is a separte
725    function, instead of an if.
726
727 *******************************************************************************/
728
729 utf *utf_new_char_classname(const char *text)
730 {
731         if (strchr(text, '.')) {
732                 char *txt = strdup(text);
733                 char *end = txt + strlen(txt);
734                 char *c;
735                 utf *tmpRes;
736
737                 for (c = txt; c < end; c++)
738                         if (*c == '.') *c = '/';
739
740                 tmpRes = utf_new(txt, strlen(txt));
741                 FREE(txt, 0);
742
743                 return tmpRes;
744
745         } else
746                 return utf_new(text, strlen(text));
747 }
748
749
/* utf_nextu2 ******************************************************************

   Read the next unicode character from the utf string and increment
   the utf-string pointer accordingly.

   Decoding depends on the high nibble of the first byte:
      0xC, 0xD.....2-byte sequence
      0xE..........3-byte sequence
      other........the byte itself is returned, pointer advances by 1

   For a malformed multi-byte sequence 0 is returned.  The pointer is
   then left unchanged if the second byte is not a continuation byte,
   and advanced by 2 if only the third byte of a 3-byte sequence is
   malformed.

   CAUTION: This function is unsafe for input that was not checked
            by is_valid_utf!

*******************************************************************************/

u2 utf_nextu2(char **utf_ptr)
{
	/* current position in utf text */
	unsigned char *cursor = (unsigned char *) (*utf_ptr);
	/* first byte of the sequence and its high nibble */
	unsigned char  lead   = cursor[0];
	unsigned int   kind   = lead >> 4;
	/* decoded character and number of bytes consumed */
	u2             value  = 0;
	int            used   = 0;

	if (kind == 0xC || kind == 0xD) {
		/* 2 bytes */
		unsigned char second = cursor[1];

		if ((second & 0xC0) == 0x80) {
			value = ((lead & 0x1F) << 6) + (second & 0x3F);
			used  = 2;
		}
	}
	else if (kind == 0xE) {
		/* 2 or 3 bytes */
		unsigned char second = cursor[1];

		if ((second & 0xC0) == 0x80) {
			unsigned char third = cursor[2];

			if ((third & 0xC0) == 0x80) {
				value = ((((lead & 0x0F) << 6) + (second & 0x3F)) << 6)
					+ (third & 0x3F);
				used  = 3;
			}
			else {
				used = 2;
			}
		}
	}
	else {
		/* 1 byte */
		(*utf_ptr)++;

		return (u2) lead;
	}

	/* update position in utf-text */
	*utf_ptr = (char *) (cursor + used);

	return value;
}
804
805
806 /* utf_bytes *******************************************************************
807
808    Determine number of bytes (aka. octets) in the utf string.
809
810    IN:
811       u............utf string
812
813    OUT:
814       The number of octets of this utf string.
815           There is _no_ terminating zero included in this count.
816
817 *******************************************************************************/
818
819 u4 utf_bytes(utf *u)
820 {
821         return u->blength;
822 }
823
824 /* utf_get_number_of_u2s_for_buffer ********************************************
825
826    Determine number of UTF-16 u2s in the given UTF-8 buffer
827
828    CAUTION: This function is unsafe for input that was not checked 
829             by is_valid_utf!
830
831    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
832    to an array of u2s (UTF-16) and want to know how many of them you will get.
833    All other uses of this function are probably wrong.
834
835    IN:
836       buffer........points to first char in buffer
837           blength.......number of _bytes_ in the buffer
838
839    OUT:
840       the number of u2s needed to hold this string in UTF-16 encoding.
841           There is _no_ terminating zero included in this count.
842
843    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
844    exception.
845
846 *******************************************************************************/
847
848 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
849 {
850         const char *endpos;                 /* points behind utf string           */
851         const char *utf_ptr;                /* current position in utf text       */
852         u4 len = 0;                         /* number of unicode characters       */
853
854         utf_ptr = buffer;
855         endpos = utf_ptr + blength;
856
857         while (utf_ptr < endpos) {
858                 len++;
859                 /* next unicode character */
860                 utf_nextu2((char **)&utf_ptr);
861         }
862
863         assert(utf_ptr == endpos);
864
865         return len;
866 }
867
868
869 /* utf_get_number_of_u2s *******************************************************
870
871    Determine number of UTF-16 u2s in the utf string.
872
873    CAUTION: This function is unsafe for input that was not checked 
874             by is_valid_utf!
875
876    CAUTION: Use this function *only* when you want to convert a utf string
877    to an array of u2s and want to know how many of them you will get.
878    All other uses of this function are probably wrong.
879
880    IN:
881       u............utf string
882
883    OUT:
884       the number of u2s needed to hold this string in UTF-16 encoding.
885           There is _no_ terminating zero included in this count.
886           XXX 0 if a NullPointerException has been thrown (see below)
887
888 *******************************************************************************/
889
890 u4 utf_get_number_of_u2s(utf *u)
891 {
892         char *endpos;                       /* points behind utf string           */
893         char *utf_ptr;                      /* current position in utf text       */
894         u4 len = 0;                         /* number of unicode characters       */
895
896         /* XXX this is probably not checked by most callers! Review this after */
897         /* the invalid uses of this function have been eliminated */
898         if (!u) {
899                 exceptions_throw_nullpointerexception();
900                 return 0;
901         }
902
903         endpos = UTF_END(u);
904         utf_ptr = u->text;
905
906         while (utf_ptr < endpos) {
907                 len++;
908                 /* next unicode character */
909                 utf_nextu2(&utf_ptr);
910         }
911
912         if (utf_ptr != endpos)
913                 /* string ended abruptly */
914                 throw_cacao_exception_exit(string_java_lang_InternalError,
915                                                                    "Illegal utf8 string");
916
917         return len;
918 }
919
920
921 /* utf8_safe_number_of_u2s *****************************************************
922
923    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
924    (For invalid UTF-8 the U+fffd replacement character will be counted.)
925
926    This function is safe even for invalid UTF-8 strings.
927
928    IN:
929       text..........zero-terminated(!) UTF-8 string (may be invalid)
930                         must NOT be NULL
931           nbytes........strlen(text). (This is needed to completely emulate
932                         the RI).
933
934    OUT:
935       the number of u2s needed to hold this string in UTF-16 encoding.
936           There is _no_ terminating zero included in this count.
937
938 *******************************************************************************/
939
940 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
941         register const unsigned char *t;
942         register s4 byte;
943         register s4 len;
944         register const unsigned char *tlimit;
945         s4 byte1;
946         s4 byte2;
947         s4 byte3;
948         s4 value;
949         s4 skip;
950
951         assert(text);
952         assert(nbytes >= 0);
953
954         len = 0;
955         t = (const unsigned char *) text;
956         tlimit = t + nbytes;
957
958         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
959
960         while (1) {
961                 byte = *t++;
962
963                 if (byte & 0x80) {
964                         /* highest bit set, non-ASCII character */
965
966                         if ((byte & 0xe0) == 0xc0) {
967                                 /* 2-byte: should be 110..... 10...... ? */
968
969                                 if ((*t++ & 0xc0) == 0x80)
970                                         ; /* valid 2-byte */
971                                 else
972                                         t--; /* invalid */
973                         }
974                         else if ((byte & 0xf0) == 0xe0) {
975                                 /* 3-byte: should be 1110.... 10...... 10...... */
976                                 /*                            ^t                */
977
978                                 if (t + 2 > tlimit)
979                                         return len + 1; /* invalid, stop here */
980
981                                 if ((*t++ & 0xc0) == 0x80) {
982                                         if ((*t++ & 0xc0) == 0x80)
983                                                 ; /* valid 3-byte */
984                                         else
985                                                 t--; /* invalid */
986                                 }
987                                 else
988                                         t--; /* invalid */
989                         }
990                         else if ((byte & 0xf8) == 0xf0) {
991                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
992                                 /*                            ^t                         */
993
994                                 if (t + 3 > tlimit)
995                                         return len + 1; /* invalid, stop here */
996
997                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
998                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
999                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1000                                                         /* valid 4-byte UTF-8? */
1001                                                         value = ((byte  & 0x07) << 18)
1002                                                                   | ((byte1 & 0x3f) << 12)
1003                                                                   | ((byte2 & 0x3f) <<  6)
1004                                                                   | ((byte3 & 0x3f)      );
1005
1006                                                         if (value > 0x10FFFF)
1007                                                                 ; /* invalid */
1008                                                         else if (value > 0xFFFF)
1009                                                                 len += 1; /* we need surrogates */
1010                                                         else
1011                                                                 ; /* 16bit suffice */
1012                                                 }
1013                                                 else
1014                                                         t--; /* invalid */
1015                                         }
1016                                         else
1017                                                 t--; /* invalid */
1018                                 }
1019                                 else
1020                                         t--; /* invalid */
1021                         }
1022                         else if ((byte & 0xfc) == 0xf8) {
1023                                 /* invalid 5-byte */
1024                                 if (t + 4 > tlimit)
1025                                         return len + 1; /* invalid, stop here */
1026
1027                                 skip = 4;
1028                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1029                                         t++;
1030                         }
1031                         else if ((byte & 0xfe) == 0xfc) {
1032                                 /* invalid 6-byte */
1033                                 if (t + 5 > tlimit)
1034                                         return len + 1; /* invalid, stop here */
1035
1036                                 skip = 5;
1037                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1038                                         t++;
1039                         }
1040                         else
1041                                 ; /* invalid */
1042                 }
1043                 else {
1044                         /* NUL */
1045
1046                         if (byte == 0)
1047                                 break;
1048
1049                         /* ASCII character, common case */
1050                 }
1051
1052                 len++;
1053         }
1054
1055         return len;
1056 }
1057
1058
1059 /* utf8_safe_convert_to_u2s ****************************************************
1060
1061    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1062    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1063    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1064
1065    This function is safe even for invalid UTF-8 strings.
1066
1067    IN:
1068       text..........zero-terminated(!) UTF-8 string (may be invalid)
1069                         must NOT be NULL
1070           nbytes........strlen(text). (This is needed to completely emulate
1071                                         the RI).
1072           buffer........a preallocated array of u2s to receive the decoded
1073                         string. Use utf8_safe_number_of_u2s to get the
1074                                         required number of u2s for allocating this.
1075
1076 *******************************************************************************/
1077
1078 #define UNICODE_REPLACEMENT  0xfffd
1079
1080 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1081         register const unsigned char *t;
1082         register s4 byte;
1083         register const unsigned char *tlimit;
1084         s4 byte1;
1085         s4 byte2;
1086         s4 byte3;
1087         s4 value;
1088         s4 skip;
1089
1090         assert(text);
1091         assert(nbytes >= 0);
1092
1093         t = (const unsigned char *) text;
1094         tlimit = t + nbytes;
1095
1096         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1097
1098         while (1) {
1099                 byte = *t++;
1100
1101                 if (byte & 0x80) {
1102                         /* highest bit set, non-ASCII character */
1103
1104                         if ((byte & 0xe0) == 0xc0) {
1105                                 /* 2-byte: should be 110..... 10...... */
1106
1107                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1108                                         /* valid 2-byte UTF-8 */
1109                                         *buffer++ = ((byte  & 0x1f) << 6)
1110                                                           | ((byte1 & 0x3f)     );
1111                                 }
1112                                 else {
1113                                         *buffer++ = UNICODE_REPLACEMENT;
1114                                         t--;
1115                                 }
1116                         }
1117                         else if ((byte & 0xf0) == 0xe0) {
1118                                 /* 3-byte: should be 1110.... 10...... 10...... */
1119
1120                                 if (t + 2 > tlimit) {
1121                                         *buffer++ = UNICODE_REPLACEMENT;
1122                                         return;
1123                                 }
1124
1125                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1126                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1127                                                 /* valid 3-byte UTF-8 */
1128                                                 *buffer++ = ((byte  & 0x0f) << 12)
1129                                                                   | ((byte1 & 0x3f) <<  6)
1130                                                                   | ((byte2 & 0x3f)      );
1131                                         }
1132                                         else {
1133                                                 *buffer++ = UNICODE_REPLACEMENT;
1134                                                 t--;
1135                                         }
1136                                 }
1137                                 else {
1138                                         *buffer++ = UNICODE_REPLACEMENT;
1139                                         t--;
1140                                 }
1141                         }
1142                         else if ((byte & 0xf8) == 0xf0) {
1143                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1144
1145                                 if (t + 3 > tlimit) {
1146                                         *buffer++ = UNICODE_REPLACEMENT;
1147                                         return;
1148                                 }
1149
1150                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1151                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1152                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1153                                                         /* valid 4-byte UTF-8? */
1154                                                         value = ((byte  & 0x07) << 18)
1155                                                                   | ((byte1 & 0x3f) << 12)
1156                                                                   | ((byte2 & 0x3f) <<  6)
1157                                                                   | ((byte3 & 0x3f)      );
1158
1159                                                         if (value > 0x10FFFF) {
1160                                                                 *buffer++ = UNICODE_REPLACEMENT;
1161                                                         }
1162                                                         else if (value > 0xFFFF) {
1163                                                                 /* we need surrogates */
1164                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1165                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1166                                                         }
1167                                                         else
1168                                                                 *buffer++ = value; /* 16bit suffice */
1169                                                 }
1170                                                 else {
1171                                                         *buffer++ = UNICODE_REPLACEMENT;
1172                                                         t--;
1173                                                 }
1174                                         }
1175                                         else {
1176                                                 *buffer++ = UNICODE_REPLACEMENT;
1177                                                 t--;
1178                                         }
1179                                 }
1180                                 else {
1181                                         *buffer++ = UNICODE_REPLACEMENT;
1182                                         t--;
1183                                 }
1184                         }
1185                         else if ((byte & 0xfc) == 0xf8) {
1186                                 if (t + 4 > tlimit) {
1187                                         *buffer++ = UNICODE_REPLACEMENT;
1188                                         return;
1189                                 }
1190
1191                                 skip = 4;
1192                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1193                                         t++;
1194                                 *buffer++ = UNICODE_REPLACEMENT;
1195                         }
1196                         else if ((byte & 0xfe) == 0xfc) {
1197                                 if (t + 5 > tlimit) {
1198                                         *buffer++ = UNICODE_REPLACEMENT;
1199                                         return;
1200                                 }
1201
1202                                 skip = 5;
1203                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1204                                         t++;
1205                                 *buffer++ = UNICODE_REPLACEMENT;
1206                         }
1207                         else
1208                                 *buffer++ = UNICODE_REPLACEMENT;
1209                 }
1210                 else {
1211                         /* NUL */
1212
1213                         if (byte == 0)
1214                                 break;
1215
1216                         /* ASCII character, common case */
1217
1218                         *buffer++ = byte;
1219                 }
1220         }
1221 }
1222
1223
1224 /* u2_utflength ****************************************************************
1225
1226    Returns the utf length in bytes of a u2 array.
1227
1228 *******************************************************************************/
1229
1230 u4 u2_utflength(u2 *text, u4 u2_length)
1231 {
1232         u4 result_len = 0;                  /* utf length in bytes                */
1233         u2 ch;                              /* current unicode character          */
1234         u4 len;
1235         
1236         for (len = 0; len < u2_length; len++) {
1237                 /* next unicode character */
1238                 ch = *text++;
1239           
1240                 /* determine bytes required to store unicode character as utf */
1241                 if (ch && (ch < 0x80)) 
1242                         result_len++;
1243                 else if (ch < 0x800)
1244                         result_len += 2;        
1245                 else 
1246                         result_len += 3;        
1247         }
1248
1249     return result_len;
1250 }
1251
1252
1253 /* utf_copy ********************************************************************
1254
1255    Copy the given utf string byte-for-byte to a buffer.
1256
1257    IN:
1258       buffer.......the buffer
1259           u............the utf string
1260
1261 *******************************************************************************/
1262
1263 void utf_copy(char *buffer, utf *u)
1264 {
1265         /* our utf strings are zero-terminated (done by utf_new) */
1266         MCOPY(buffer, u->text, char, u->blength + 1);
1267 }
1268
1269
1270 /* utf_cat *********************************************************************
1271
1272    Append the given utf string byte-for-byte to a buffer.
1273
1274    IN:
1275       buffer.......the buffer
1276           u............the utf string
1277
1278 *******************************************************************************/
1279
1280 void utf_cat(char *buffer, utf *u)
1281 {
1282         /* our utf strings are zero-terminated (done by utf_new) */
1283         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1284 }
1285
1286
1287 /* utf_copy_classname **********************************************************
1288
1289    Copy the given utf classname byte-for-byte to a buffer.
1290    '/' is replaced by '.'
1291
1292    IN:
1293       buffer.......the buffer
1294           u............the utf string
1295
1296 *******************************************************************************/
1297
1298 void utf_copy_classname(char *buffer, utf *u)
1299 {
1300         char *bufptr;
1301         char *srcptr;
1302         char *endptr;
1303         char ch;
1304
1305         bufptr = buffer;
1306         srcptr = u->text;
1307         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1308
1309         while (srcptr != endptr) {
1310                 ch = *srcptr++;
1311                 if (ch == '/')
1312                         ch = '.';
1313                 *bufptr++ = ch;
1314         }
1315 }
1316
1317
1318 /* utf_cat *********************************************************************
1319
1320    Append the given utf classname byte-for-byte to a buffer.
1321    '/' is replaced by '.'
1322
1323    IN:
1324       buffer.......the buffer
1325           u............the utf string
1326
1327 *******************************************************************************/
1328
1329 void utf_cat_classname(char *buffer, utf *u)
1330 {
1331         utf_copy_classname(buffer + strlen(buffer), u);
1332 }
1333
1334 /* utf_display_printable_ascii *************************************************
1335
1336    Write utf symbol to stdout (for debugging purposes).
1337    Non-printable and non-ASCII characters are printed as '?'.
1338
1339 *******************************************************************************/
1340
1341 void utf_display_printable_ascii(utf *u)
1342 {
1343         char *endpos;                       /* points behind utf string           */
1344         char *utf_ptr;                      /* current position in utf text       */
1345
1346         if (u == NULL) {
1347                 printf("NULL");
1348                 fflush(stdout);
1349                 return;
1350         }
1351
1352         endpos = UTF_END(u);
1353         utf_ptr = u->text;
1354
1355         while (utf_ptr < endpos) {
1356                 /* read next unicode character */
1357
1358                 u2 c = utf_nextu2(&utf_ptr);
1359
1360                 if ((c >= 32) && (c <= 127))
1361                         printf("%c", c);
1362                 else
1363                         printf("?");
1364         }
1365
1366         fflush(stdout);
1367 }
1368
1369
1370 /* utf_display_printable_ascii_classname ***************************************
1371
1372    Write utf symbol to stdout with `/' converted to `.' (for debugging
1373    purposes).
1374    Non-printable and non-ASCII characters are printed as '?'.
1375
1376 *******************************************************************************/
1377
1378 void utf_display_printable_ascii_classname(utf *u)
1379 {
1380         char *endpos;                       /* points behind utf string           */
1381         char *utf_ptr;                      /* current position in utf text       */
1382
1383         if (u == NULL) {
1384                 printf("NULL");
1385                 fflush(stdout);
1386                 return;
1387         }
1388
1389         endpos = UTF_END(u);
1390         utf_ptr = u->text;
1391
1392         while (utf_ptr < endpos) {
1393                 /* read next unicode character */
1394
1395                 u2 c = utf_nextu2(&utf_ptr);
1396
1397                 if (c == '/')
1398                         c = '.';
1399
1400                 if ((c >= 32) && (c <= 127))
1401                         printf("%c", c);
1402                 else
1403                         printf("?");
1404         }
1405
1406         fflush(stdout);
1407 }
1408
1409
1410 /* utf_sprint_convert_to_latin1 ************************************************
1411         
1412    Write utf symbol into c-string (for debugging purposes).
1413    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1414    invalid results.
1415
1416 *******************************************************************************/
1417
1418 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1419 {
1420         char *endpos;                       /* points behind utf string           */
1421         char *utf_ptr;                      /* current position in utf text       */
1422         u2 pos = 0;                         /* position in c-string               */
1423
1424         if (!u) {
1425                 strcpy(buffer, "NULL");
1426                 return;
1427         }
1428
1429         endpos = UTF_END(u);
1430         utf_ptr = u->text;
1431
1432         while (utf_ptr < endpos) 
1433                 /* copy next unicode character */       
1434                 buffer[pos++] = utf_nextu2(&utf_ptr);
1435
1436         /* terminate string */
1437         buffer[pos] = '\0';
1438 }
1439
1440
1441 /* utf_sprint_convert_to_latin1_classname **************************************
1442         
1443    Write utf symbol into c-string with `/' converted to `.' (for debugging
1444    purposes).
1445    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1446    invalid results.
1447
1448 *******************************************************************************/
1449
1450 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1451 {
1452         char *endpos;                       /* points behind utf string           */
1453         char *utf_ptr;                      /* current position in utf text       */
1454         u2 pos = 0;                         /* position in c-string               */
1455
1456         if (!u) {
1457                 strcpy(buffer, "NULL");
1458                 return;
1459         }
1460
1461         endpos = UTF_END(u);
1462         utf_ptr = u->text;
1463
1464         while (utf_ptr < endpos) {
1465                 /* copy next unicode character */       
1466                 u2 c = utf_nextu2(&utf_ptr);
1467                 if (c == '/') c = '.';
1468                 buffer[pos++] = c;
1469         }
1470
1471         /* terminate string */
1472         buffer[pos] = '\0';
1473 }
1474
1475
1476 /* utf_strcat_convert_to_latin1 ************************************************
1477         
1478    Like libc strcat, but uses an utf8 string.
1479    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1480    invalid results.
1481
1482 *******************************************************************************/
1483
1484 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1485 {
1486         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1487 }
1488
1489
1490 /* utf_strcat_convert_to_latin1_classname **************************************
1491         
1492    Like libc strcat, but uses an utf8 string.
1493    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1494    invalid results.
1495
1496 *******************************************************************************/
1497
1498 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1499 {
1500         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1501 }
1502
1503
1504 /* utf_fprint_printable_ascii **************************************************
1505         
1506    Write utf symbol into file.
1507    Non-printable and non-ASCII characters are printed as '?'.
1508
1509 *******************************************************************************/
1510
1511 void utf_fprint_printable_ascii(FILE *file, utf *u)
1512 {
1513         char *endpos;                       /* points behind utf string           */
1514         char *utf_ptr;                      /* current position in utf text       */
1515
1516         if (!u)
1517                 return;
1518
1519         endpos = UTF_END(u);
1520         utf_ptr = u->text;
1521
1522         while (utf_ptr < endpos) { 
1523                 /* read next unicode character */                
1524                 u2 c = utf_nextu2(&utf_ptr);                            
1525
1526                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1527                 else fprintf(file, "?");
1528         }
1529 }
1530
1531
1532 /* utf_fprint_printable_ascii_classname ****************************************
1533         
1534    Write utf symbol into file with `/' converted to `.'.
1535    Non-printable and non-ASCII characters are printed as '?'.
1536
1537 *******************************************************************************/
1538
1539 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1540 {
1541         char *endpos;                       /* points behind utf string           */
1542         char *utf_ptr;                      /* current position in utf text       */
1543
1544     if (!u)
1545                 return;
1546
1547         endpos = UTF_END(u);
1548         utf_ptr = u->text;
1549
1550         while (utf_ptr < endpos) { 
1551                 /* read next unicode character */                
1552                 u2 c = utf_nextu2(&utf_ptr);                            
1553                 if (c == '/') c = '.';
1554
1555                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1556                 else fprintf(file, "?");
1557         }
1558 }
1559
1560
/* is_valid_utf ****************************************************************

   Return true if the given byte range is a valid UTF-8 string in the
   restricted form accepted here.

   utf_ptr...points to first character
   end_pos...points after last character

   Accepted are
      - single bytes 0x01..0x7f (a 0x00 byte is rejected),
      - 2-byte sequences 110x xxxx 10xx xxxx, including the encoding of
        the value 0 (Java special),
      - 3-byte sequences 1110 xxxx 10xx xxxx 10xx xxxx, except the one
        encoding the value 0.

   Longer forms are rejected (Java limitation), as are stray continuation
   bytes and sequences cut off by end_pos. An empty range is valid; a
   range with end_pos < utf_ptr is not.

   Deliberately NOT rejected, because Sun Java seems to allow them in
   classfiles: overlong encodings of non-zero values, surrogates
   (0xd800..0xdfff), and the codepoints 0xfffe and 0xffff.

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	const unsigned char *cur;           /* read position                      */
	const unsigned char *limit;         /* points after last character        */
	unsigned long        value;         /* decoded value of current sequence  */
	unsigned char        octet;
	int                  trailing;      /* continuation bytes required        */
	int                  k;

	if (end_pos < utf_ptr)
		return false;

	cur   = (const unsigned char *) utf_ptr;
	limit = (const unsigned char *) end_pos;

	while (cur != limit) {
		octet = *cur++;

		if (octet == 0)
			return false;                         /* 0x00 is not allowed */

		if (octet < 0x80)
			continue;                             /* ASCII */

		if ((octet & 0xe0) == 0xc0) {             /* 110x xxxx */
			trailing = 1;
			value    = octet & 0x1f;
		}
		else if ((octet & 0xf0) == 0xe0) {        /* 1110 xxxx */
			trailing = 2;
			value    = octet & 0x0f;
		}
		else {
			/* 4-, 5- and 6-byte forms (Java limitation) as well as */
			/* bytes that cannot start a sequence                     */
			return false;
		}

		if ((limit - cur) < trailing)
			return false;                         /* missing bytes */

		for (k = 0; k < trailing; k++) {
			octet = *cur++;

			if ((octet & 0xc0) != 0x80)           /* 10xx xxxx */
				return false;

			value = (value << 6) | (octet & 0x3f);
		}

		/* zero may only be written as the 2-byte sequence (Java special) */

		if ((value == 0) && (trailing != 1))
			return false;
	}

	return true;
}
1626
1627
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the zero character in its two-byte form 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Only bytes inside [utf_ptr, end_pos) are inspected, even if the
   string ends with a lone 0xc0 byte.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	unsigned char c;

	/* disallow empty names */

	if (end_pos <= utf_ptr)
		return false;

	while (utf_ptr < end_pos) {
		c = (unsigned char) *utf_ptr++;

		/* disallow control characters */

		if (c < 0x20)
			return false;

		/* Disallow zero (0xc0 0x80). The second byte is only looked
		   at if it still belongs to the string, so that a trailing
		   0xc0 never causes a read behind end_pos. */

		if ((c == 0xc0) && (utf_ptr < end_pos) &&
			((unsigned char) *utf_ptr == 0x80))
			return false;
	}

	return true;
}
1655
1656 bool is_valid_name_utf(utf *u)
1657 {
1658         return is_valid_name(u->text, UTF_END(u));
1659 }
1660
1661
1662 /* utf_show ********************************************************************
1663
1664    Writes the utf symbols in the utfhash to stdout and displays the
1665    number of external hash chains grouped according to the chainlength
1666    (for debugging purposes).
1667
1668 *******************************************************************************/
1669
1670 #if !defined(NDEBUG)
1671 void utf_show(void)
1672 {
1673
1674 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1675
1676         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1677         u4 max_chainlength = 0;      /* maximum length of the chains */
1678         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1679         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1680         u4 i;
1681
1682         printf("UTF-HASH:\n");
1683
1684         /* show element of utf-hashtable */
1685
1686         for (i = 0; i < hashtable_utf->size; i++) {
1687                 utf *u = hashtable_utf->ptr[i];
1688
1689                 if (u) {
1690                         printf("SLOT %d: ", (int) i);
1691
1692                         while (u) {
1693                                 printf("'");
1694                                 utf_display_printable_ascii(u);
1695                                 printf("' ");
1696                                 u = u->hashlink;
1697                         }       
1698                         printf("\n");
1699                 }
1700         }
1701
1702         printf("UTF-HASH: %d slots for %d entries\n", 
1703                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1704
1705         if (hashtable_utf->entries == 0)
1706                 return;
1707
1708         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1709
1710         for (i=0;i<CHAIN_LIMIT;i++)
1711                 chain_count[i]=0;
1712
1713         /* count numbers of hashchains according to their length */
1714         for (i=0; i<hashtable_utf->size; i++) {
1715                   
1716                 utf *u = (utf*) hashtable_utf->ptr[i];
1717                 u4 chain_length = 0;
1718
1719                 /* determine chainlength */
1720                 while (u) {
1721                         u = u->hashlink;
1722                         chain_length++;
1723                 }
1724
1725                 /* update sum of all chainlengths */
1726                 sum_chainlength+=chain_length;
1727
1728                 /* determine the maximum length of the chains */
1729                 if (chain_length>max_chainlength)
1730                         max_chainlength = chain_length;
1731
1732                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1733                 if (chain_length>=CHAIN_LIMIT) {
1734                         beyond_limit+=chain_length;
1735                         chain_length=CHAIN_LIMIT-1;
1736                 }
1737
1738                 /* update number of hashchains of current length */
1739                 chain_count[chain_length]++;
1740         }
1741
1742         /* display results */  
1743         for (i=1;i<CHAIN_LIMIT-1;i++) 
1744                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1745           
1746         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1747
1748
1749         printf("max. chainlength:%5d\n",max_chainlength);
1750
1751         /* avg. chainlength = sum of chainlengths / number of chains */
1752         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1753 }
1754 #endif /* !defined(NDEBUG) */
1755
1756
1757 /*
1758  * These are local overrides for various environment variables in Emacs.
1759  * Please do not remove this and leave it at the end of the file, where
1760  * Emacs will automagically detect them.
1761  * ---------------------------------------------------------------------
1762  * Local variables:
1763  * mode: c
1764  * indent-tabs-mode: t
1765  * c-basic-offset: 4
1766  * tab-width: 4
1767  * End:
1768  * vim:noexpandtab:sw=4:ts=4:
1769  */