Added: utf_clone
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf.c - utf functions
2
3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
6    Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23    02111-1307, USA.
24
25    Contact: cacao@complang.tuwien.ac.at
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 2560 2005-06-06 15:20:41Z twisti $
34
35 */
36
37
38 #include <string.h>
39
40 #include "mm/memory.h"
41 #include "vm/exceptions.h"
42 #include "vm/options.h"
43 #include "vm/statistics.h"
44 #include "vm/stringlocal.h"
45 #include "vm/tables.h"
46 #include "vm/utf8.h"
47
48
49 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
50
51
52 /* utf-symbols for pointer comparison of frequently used strings **************/
53
54 utf *utf_java_lang_Object;              /* java/lang/Object                   */
55
56 utf *utf_java_lang_Class;
57 utf *utf_java_lang_ClassLoader;
58 utf *utf_java_lang_Cloneable;
59 utf *utf_java_lang_SecurityManager;
60 utf *utf_java_lang_String;
61 utf *utf_java_lang_System;
62 utf *utf_java_lang_ThreadGroup;
63 utf *utf_java_io_Serializable;
64
65 utf *utf_java_lang_Throwable;
66 utf *utf_java_lang_VMThrowable;
67 utf *utf_java_lang_Error;
68 utf *utf_java_lang_Exception;
69 utf *utf_java_lang_NoClassDefFoundError;
70 utf *utf_java_lang_OutOfMemoryError;
71 utf *utf_java_lang_ClassNotFoundException;
72
73 utf* utf_java_lang_Void;
74 utf* utf_java_lang_Boolean;
75 utf* utf_java_lang_Byte;
76 utf* utf_java_lang_Character;
77 utf* utf_java_lang_Short;
78 utf* utf_java_lang_Integer;
79 utf* utf_java_lang_Long;
80 utf* utf_java_lang_Float;
81 utf* utf_java_lang_Double;
82
83 utf *utf_java_util_Vector;
84 utf *utf_java_lang_reflect_Constructor;
85 utf *utf_java_lang_reflect_Method;
86
87
88 utf *utf_InnerClasses;                  /* InnerClasses                       */
89 utf *utf_ConstantValue;                 /* ConstantValue                      */
90 utf *utf_Code;                          /* Code                               */
91 utf *utf_Exceptions;                    /* Exceptions                         */
92 utf *utf_LineNumberTable;               /* LineNumberTable                    */
93 utf *utf_SourceFile;                    /* SourceFile                         */
94
95 utf *utf_init;                          /* <init>                             */
96 utf *utf_clinit;                        /* <clinit>                           */
97 utf *utf_clone;                         /* clone                              */
98 utf *utf_finalize;                      /* finalize                           */
99
100 utf *utf_printStackTrace;
101 utf *utf_fillInStackTrace;
102 utf *utf_loadClass;
103
104 utf *utf_void__void;                    /* ()V                                */
105 utf *utf_boolean__void;                 /* (Z)V                               */
106 utf *utf_byte__void;                    /* (B)V                               */
107 utf *utf_char__void;                    /* (C)V                               */
108 utf *utf_short__void;                   /* (S)V                               */
109 utf *utf_int__void;                     /* (I)V                               */
110 utf *utf_long__void;                    /* (J)V                               */
111 utf *utf_float__void;                   /* (F)V                               */
112 utf *utf_double__void;                  /* (D)V                               */
113 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
114 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
115 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
116 utf *utf_java_lang_String__java_lang_Class;
117 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
118
119 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
120
121 utf *array_packagename;
122
123
124 /* utf_init ********************************************************************
125
126    Initializes the utf8 subsystem.
127
128 *******************************************************************************/
129
130 void utf8_init(void)
131 {
132         /* create utf-symbols for pointer comparison of frequently used strings */
133
134         utf_java_lang_Object           = utf_new_char("java/lang/Object");
135
136         utf_java_lang_Class            = utf_new_char("java/lang/Class");
137         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
138         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
139         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
140         utf_java_lang_String           = utf_new_char("java/lang/String");
141         utf_java_lang_System           = utf_new_char("java/lang/System");
142         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
143         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
144
145         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
146         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
147         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
148         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
149
150         utf_java_lang_NoClassDefFoundError =
151                 utf_new_char(string_java_lang_NoClassDefFoundError);
152
153         utf_java_lang_OutOfMemoryError =
154                 utf_new_char(string_java_lang_OutOfMemoryError);
155
156         utf_java_lang_ClassNotFoundException =
157                 utf_new_char(string_java_lang_ClassNotFoundException);
158
159         utf_java_lang_Void             = utf_new_char("java/lang/Void");
160         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
161         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
162         utf_java_lang_Character        = utf_new_char("java/lang/Character");
163         utf_java_lang_Short            = utf_new_char("java/lang/Short");
164         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
165         utf_java_lang_Long             = utf_new_char("java/lang/Long");
166         utf_java_lang_Float            = utf_new_char("java/lang/Float");
167         utf_java_lang_Double           = utf_new_char("java/lang/Double");
168
169         utf_java_util_Vector           = utf_new_char("java/util/Vector");
170
171         utf_java_lang_reflect_Constructor =
172                 utf_new_char("java/lang/reflect/Constructor");
173
174         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
175
176         utf_InnerClasses               = utf_new_char("InnerClasses");
177         utf_ConstantValue              = utf_new_char("ConstantValue");
178         utf_Code                       = utf_new_char("Code");
179         utf_Exceptions                 = utf_new_char("Exceptions");
180         utf_LineNumberTable            = utf_new_char("LineNumberTable");
181         utf_SourceFile                 = utf_new_char("SourceFile");
182
183         utf_init                           = utf_new_char("<init>");
184         utf_clinit                         = utf_new_char("<clinit>");
185         utf_clone                      = utf_new_char("clone");
186         utf_finalize                   = utf_new_char("finalize");
187
188         utf_printStackTrace            = utf_new_char("printStackTrace");
189         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
190         utf_loadClass                  = utf_new_char("loadClass");
191
192         utf_void__void                 = utf_new_char("()V");
193         utf_boolean__void              = utf_new_char("(Z)V");
194         utf_byte__void                 = utf_new_char("(B)V");
195         utf_char__void                 = utf_new_char("(C)V");
196         utf_short__void                = utf_new_char("(S)V");
197         utf_int__void                  = utf_new_char("(I)V");
198         utf_long__void                 = utf_new_char("(J)V");
199         utf_float__void                = utf_new_char("(F)V");
200         utf_double__void               = utf_new_char("(D)V");
201         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
202         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
203         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
204
205         utf_java_lang_String__java_lang_Class =
206                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
207
208         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
209
210         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
211
212         array_packagename              = utf_new_char("\t<the array package>");
213 }
214
215
216 /* utf_hashkey *****************************************************************
217
218    The hashkey is computed from the utf-text by using up to 8
219    characters.  For utf-symbols longer than 15 characters 3 characters
220    are taken from the beginning and the end, 2 characters are taken
221    from the middle.
222
223 *******************************************************************************/
224
225 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
226 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
227
228 u4 utf_hashkey(const char *text, u4 length)
229 {
230         const char *start_pos = text;       /* pointer to utf text                */
231         u4 a;
232
233         switch (length) {
234         case 0: /* empty string */
235                 return 0;
236
237         case 1: return fbs(0);
238         case 2: return fbs(0) ^ nbs(3);
239         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
240         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
241         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
242         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
243         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
244         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
245
246         case 9:
247                 a = fbs(0);
248                 a ^= nbs(1);
249                 a ^= nbs(2);
250                 text++;
251                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
252
253         case 10:
254                 a = fbs(0);
255                 text++;
256                 a ^= nbs(2);
257                 a ^= nbs(3);
258                 a ^= nbs(4);
259                 text++;
260                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
261
262         case 11:
263                 a = fbs(0);
264                 text++;
265                 a ^= nbs(2);
266                 a ^= nbs(3);
267                 a ^= nbs(4);
268                 text++;
269                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
270
271         case 12:
272                 a = fbs(0);
273                 text += 2;
274                 a ^= nbs(2);
275                 a ^= nbs(3);
276                 text++;
277                 a ^= nbs(5);
278                 a ^= nbs(6);
279                 a ^= nbs(7);
280                 text++;
281                 return a ^ nbs(9) ^ nbs(10);
282
283         case 13:
284                 a = fbs(0);
285                 a ^= nbs(1);
286                 text++;
287                 a ^= nbs(3);
288                 a ^= nbs(4);
289                 text += 2;      
290                 a ^= nbs(7);
291                 a ^= nbs(8);
292                 text += 2;
293                 return a ^ nbs(9) ^ nbs(10);
294
295         case 14:
296                 a = fbs(0);
297                 text += 2;      
298                 a ^= nbs(3);
299                 a ^= nbs(4);
300                 text += 2;      
301                 a ^= nbs(7);
302                 a ^= nbs(8);
303                 text += 2;
304                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
305
306         case 15:
307                 a = fbs(0);
308                 text += 2;      
309                 a ^= nbs(3);
310                 a ^= nbs(4);
311                 text += 2;      
312                 a ^= nbs(7);
313                 a ^= nbs(8);
314                 text += 2;
315                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
316
317         default:  /* 3 characters from beginning */
318                 a = fbs(0);
319                 text += 2;
320                 a ^= nbs(3);
321                 a ^= nbs(4);
322
323                 /* 2 characters from middle */
324                 text = start_pos + (length / 2);
325                 a ^= fbs(5);
326                 text += 2;
327                 a ^= nbs(6);    
328
329                 /* 3 characters from end */
330                 text = start_pos + length - 4;
331
332                 a ^= fbs(7);
333                 text++;
334
335                 return a ^ nbs(10) ^ nbs(11);
336     }
337 }
338
339
340 /* utf_hashkey *****************************************************************
341
342    Compute the hashkey of a unicode string.
343
344 *******************************************************************************/
345
346 u4 unicode_hashkey(u2 *text, u2 len)
347 {
348         return utf_hashkey((char *) text, len);
349 }
350
351
352 /* utf_new *********************************************************************
353
354    Creates a new utf-symbol, the text of the symbol is passed as a
355    u1-array. The function searches the utf-hashtable for a utf-symbol
356    with this text. On success the element returned, otherwise a new
357    hashtable element is created.
358
359    If the number of entries in the hashtable exceeds twice the size of
360    the hashtable slots a reorganization of the hashtable is done and
361    the utf symbols are copied to a new hashtable with doubled size.
362
363 *******************************************************************************/
364
365 utf *utf_new_intern(const char *text, u2 length);
366
367 utf *utf_new(const char *text, u2 length)
368 {
369     utf *r;
370
371 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
372     tables_lock();
373 #endif
374
375     r = utf_new_intern(text, length);
376
377 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
378     tables_unlock();
379 #endif
380
381     return r;
382 }
383
384
385 utf *utf_new_intern(const char *text, u2 length)
386 {
387         u4 key;                             /* hashkey computed from utf-text     */
388         u4 slot;                            /* slot in hashtable                  */
389         utf *u;                             /* hashtable element                  */
390         u2 i;
391
392 #ifdef STATISTICS
393         if (opt_stat)
394                 count_utf_new++;
395 #endif
396
397         key  = utf_hashkey(text, length);
398         slot = key & (utf_hash.size - 1);
399         u    = utf_hash.ptr[slot];
400
401         /* search external hash chain for utf-symbol */
402         while (u) {
403                 if (u->blength == length) {
404
405                         /* compare text of hashtable elements */
406                         for (i = 0; i < length; i++)
407                                 if (text[i] != u->text[i]) goto nomatch;
408                         
409 #ifdef STATISTICS
410                         if (opt_stat)
411                                 count_utf_new_found++;
412 #endif
413
414                         /* symbol found in hashtable */
415                         return u;
416                 }
417         nomatch:
418                 u = u->hashlink; /* next element in external chain */
419         }
420
421 #ifdef STATISTICS
422         if (opt_stat)
423                 count_utf_len += sizeof(utf) + length;
424 #endif
425
426         /* location in hashtable found, create new utf element */
427         u = NEW(utf);
428         u->blength  = length;               /* length in bytes of utfstring       */
429         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
430         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
431         memcpy(u->text, text, length);      /* copy utf-text                      */
432         u->text[length] = '\0';
433         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
434
435         utf_hash.entries++;                 /* update number of entries           */
436
437         if (utf_hash.entries > (utf_hash.size * 2)) {
438
439         /* reorganization of hashtable, average length of 
440            the external chains is approx. 2                */  
441
442                 u4 i;
443                 utf *u;
444                 hashtable newhash; /* the new hashtable */
445
446                 /* create new hashtable, double the size */
447                 init_hashtable(&newhash, utf_hash.size * 2);
448                 newhash.entries = utf_hash.entries;
449
450 #ifdef STATISTICS
451                 if (opt_stat)
452                         count_utf_len += sizeof(utf*) * utf_hash.size;
453 #endif
454
455                 /* transfer elements to new hashtable */
456                 for (i = 0; i < utf_hash.size; i++) {
457                         u = (utf *) utf_hash.ptr[i];
458                         while (u) {
459                                 utf *nextu = u->hashlink;
460                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
461                                                 
462                                 u->hashlink = (utf *) newhash.ptr[slot];
463                                 newhash.ptr[slot] = u;
464
465                                 /* follow link in external hash chain */
466                                 u = nextu;
467                         }
468                 }
469         
470                 /* dispose old table */
471                 MFREE(utf_hash.ptr, void*, utf_hash.size);
472                 utf_hash = newhash;
473         }
474
475         return u;
476 }
477
478
479 /* utf_new_u2 ******************************************************************
480
481    Make utf symbol from u2 array, if isclassname is true '.' is
482    replaced by '/'.
483
484 *******************************************************************************/
485
486 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
487 {
488         char *buffer;                   /* memory buffer for  unicode characters  */
489         char *pos;                      /* pointer to current position in buffer  */
490         u4 left;                        /* unicode characters left                */
491         u4 buflength;                   /* utf length in bytes of the u2 array    */
492         utf *result;                    /* resulting utf-string                   */
493         int i;          
494
495         /* determine utf length in bytes and allocate memory */
496
497         buflength = u2_utflength(unicode_pos, unicode_length); 
498         buffer    = MNEW(char, buflength);
499  
500         left = buflength;
501         pos  = buffer;
502
503         for (i = 0; i++ < unicode_length; unicode_pos++) {
504                 /* next unicode character */
505                 u2 c = *unicode_pos;
506                 
507                 if ((c != 0) && (c < 0x80)) {
508                         /* 1 character */       
509                         left--;
510                 if ((int) left < 0) break;
511                         /* convert classname */
512                         if (isclassname && c == '.')
513                                 *pos++ = '/';
514                         else
515                                 *pos++ = (char) c;
516
517                 } else if (c < 0x800) {             
518                         /* 2 characters */                              
519                 unsigned char high = c >> 6;
520                 unsigned char low  = c & 0x3F;
521                         left = left - 2;
522                 if ((int) left < 0) break;
523                 *pos++ = high | 0xC0; 
524                 *pos++ = low  | 0x80;     
525
526                 } else {         
527                 /* 3 characters */                              
528                 char low  = c & 0x3f;
529                 char mid  = (c >> 6) & 0x3F;
530                 char high = c >> 12;
531                         left = left - 3;
532                 if ((int) left < 0) break;
533                 *pos++ = high | 0xE0; 
534                 *pos++ = mid  | 0x80;  
535                 *pos++ = low  | 0x80;   
536                 }
537         }
538         
539         /* insert utf-string into symbol-table */
540         result = utf_new(buffer,buflength);
541
542         MFREE(buffer, char, buflength);
543
544         return result;
545 }
546
547
548 /* utf_new_char ****************************************************************
549
550    Creates a new utf symbol, the text for this symbol is passed as a
551    c-string ( = char* ).
552
553 *******************************************************************************/
554
555 utf *utf_new_char(const char *text)
556 {
557         return utf_new(text, strlen(text));
558 }
559
560
561 /* utf_new_char_classname ******************************************************
562
563    Creates a new utf symbol, the text for this symbol is passed as a
564    c-string ( = char* ) "." characters are going to be replaced by
565    "/". Since the above function is used often, this is a separte
566    function, instead of an if.
567
568 *******************************************************************************/
569
570 utf *utf_new_char_classname(const char *text)
571 {
572         if (strchr(text, '.')) {
573                 char *txt = strdup(text);
574                 char *end = txt + strlen(txt);
575                 char *c;
576                 utf *tmpRes;
577
578                 for (c = txt; c < end; c++)
579                         if (*c == '.') *c = '/';
580
581                 tmpRes = utf_new(txt, strlen(txt));
582                 FREE(txt, 0);
583
584                 return tmpRes;
585
586         } else
587                 return utf_new(text, strlen(text));
588 }
589
590
591 /* utf_nextu2 ******************************************************************
592
593    Read the next unicode character from the utf string and increment
594    the utf-string pointer accordingly.
595
596 *******************************************************************************/
597
598 u2 utf_nextu2(char **utf_ptr)
599 {
600     /* uncompressed unicode character */
601     u2 unicode_char = 0;
602     /* current position in utf text */  
603     unsigned char *utf = (unsigned char *) (*utf_ptr);
604     /* bytes representing the unicode character */
605     unsigned char ch1, ch2, ch3;
606     /* number of bytes used to represent the unicode character */
607     int len = 0;
608         
609     switch ((ch1 = utf[0]) >> 4) {
610         default: /* 1 byte */
611                 (*utf_ptr)++;
612                 return (u2) ch1;
613         case 0xC: 
614         case 0xD: /* 2 bytes */
615                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
616                         unsigned char high = ch1 & 0x1F;
617                         unsigned char low  = ch2 & 0x3F;
618                         unicode_char = (high << 6) + low;
619                         len = 2;
620                 }
621                 break;
622
623         case 0xE: /* 2 or 3 bytes */
624                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
625                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
626                                 unsigned char low  = ch3 & 0x3f;
627                                 unsigned char mid  = ch2 & 0x3f;
628                                 unsigned char high = ch1 & 0x0f;
629                                 unicode_char = (((high << 6) + mid) << 6) + low;
630                                 len = 3;
631                         } else
632                                 len = 2;                                           
633                 }
634                 break;
635     }
636
637     /* update position in utf-text */
638     *utf_ptr = (char *) (utf + len);
639
640     return unicode_char;
641 }
642
643
644 /* utf_strlen ******************************************************************
645
646    Determine number of unicode characters in the utf string.
647
648 *******************************************************************************/
649
650 u4 utf_strlen(utf *u)
651 {
652         char *endpos;                       /* points behind utf string           */
653         char *utf_ptr;                      /* current position in utf text       */
654         u4 len = 0;                         /* number of unicode characters       */
655
656         if (!u) {
657                 *exceptionptr = new_nullpointerexception();
658                 return 0;
659         }
660
661         endpos = utf_end(u);
662         utf_ptr = u->text;
663
664         while (utf_ptr < endpos) {
665                 len++;
666                 /* next unicode character */
667                 utf_nextu2(&utf_ptr);
668         }
669
670         if (utf_ptr != endpos)
671                 /* string ended abruptly */
672                 throw_cacao_exception_exit(string_java_lang_InternalError,
673                                                                    "Illegal utf8 string");
674
675         return len;
676 }
677
678
679 /* u2_utflength ****************************************************************
680
681    Returns the utf length in bytes of a u2 array.
682
683 *******************************************************************************/
684
685 u4 u2_utflength(u2 *text, u4 u2_length)
686 {
687         u4 result_len = 0;                  /* utf length in bytes                */
688         u2 ch;                              /* current unicode character          */
689         u4 len;
690         
691         for (len = 0; len < u2_length; len++) {
692                 /* next unicode character */
693                 ch = *text++;
694           
695                 /* determine bytes required to store unicode character as utf */
696                 if (ch && (ch < 0x80)) 
697                         result_len++;
698                 else if (ch < 0x800)
699                         result_len += 2;        
700                 else 
701                         result_len += 3;        
702         }
703
704     return result_len;
705 }
706
707
708 /* utf_display *****************************************************************
709
710    Write utf symbol to stdout (for debugging purposes).
711
712 *******************************************************************************/
713
714 void utf_display(utf *u)
715 {
716         char *endpos;                       /* points behind utf string           */
717         char *utf_ptr;                      /* current position in utf text       */
718
719         if (!u) {
720                 printf("NULL");
721                 fflush(stdout);
722                 return;
723         }
724
725         endpos = utf_end(u);
726         utf_ptr = u->text;
727
728         while (utf_ptr < endpos) {
729                 /* read next unicode character */                
730                 u2 c = utf_nextu2(&utf_ptr);
731                 if (c >= 32 && c <= 127) printf("%c", c);
732                 else printf("?");
733         }
734
735         fflush(stdout);
736 }
737
738
739 /* utf_display_classname *******************************************************
740
741    Write utf symbol to stdout with `/' converted to `.' (for debugging
742    purposes).
743
744 *******************************************************************************/
745
746 void utf_display_classname(utf *u)
747 {
748         char *endpos;                       /* points behind utf string           */
749         char *utf_ptr;                      /* current position in utf text       */
750
751         if (!u) {
752                 printf("NULL");
753                 fflush(stdout);
754                 return;
755         }
756
757         endpos = utf_end(u);
758         utf_ptr = u->text;
759
760         while (utf_ptr < endpos) {
761                 /* read next unicode character */                
762                 u2 c = utf_nextu2(&utf_ptr);
763                 if (c == '/') c = '.';
764                 if (c >= 32 && c <= 127) printf("%c", c);
765                 else printf("?");
766         }
767
768         fflush(stdout);
769 }
770
771
772 /* utf_sprint ******************************************************************
773         
774    Write utf symbol into c-string (for debugging purposes).
775
776 *******************************************************************************/
777
778 void utf_sprint(char *buffer, utf *u)
779 {
780         char *endpos;                       /* points behind utf string           */
781         char *utf_ptr;                      /* current position in utf text       */
782         u2 pos = 0;                         /* position in c-string               */
783
784         if (!u) {
785                 strcpy(buffer, "NULL");
786                 return;
787         }
788
789         endpos = utf_end(u);
790         utf_ptr = u->text;
791
792         while (utf_ptr < endpos) 
793                 /* copy next unicode character */       
794                 buffer[pos++] = utf_nextu2(&utf_ptr);
795
796         /* terminate string */
797         buffer[pos] = '\0';
798 }
799
800
801 /* utf_sprint_classname ********************************************************
802         
803    Write utf symbol into c-string with `/' converted to `.' (for debugging
804    purposes).
805
806 *******************************************************************************/
807
808 void utf_sprint_classname(char *buffer, utf *u)
809 {
810         char *endpos;                       /* points behind utf string           */
811         char *utf_ptr;                      /* current position in utf text       */
812         u2 pos = 0;                         /* position in c-string               */
813
814         if (!u) {
815                 strcpy(buffer, "NULL");
816                 return;
817         }
818
819         endpos = utf_end(u);
820         utf_ptr = u->text;
821
822         while (utf_ptr < endpos) {
823                 /* copy next unicode character */       
824                 u2 c = utf_nextu2(&utf_ptr);
825                 if (c == '/') c = '.';
826                 buffer[pos++] = c;
827         }
828
829         /* terminate string */
830         buffer[pos] = '\0';
831 }
832
833
834 /* utf_strcat ******************************************************************
835         
836    Like libc strcat, but uses an utf8 string.
837
838 *******************************************************************************/
839
840 void utf_strcat(char *buffer, utf *u)
841 {
842         utf_sprint(buffer + strlen(buffer), u);
843 }
844
845
846 /* utf_strcat_classname ********************************************************
847         
848    Like libc strcat, but uses an utf8 string.
849
850 *******************************************************************************/
851
852 void utf_strcat_classname(char *buffer, utf *u)
853 {
854         utf_sprint_classname(buffer + strlen(buffer), u);
855 }
856
857
858 /* utf_fprint ******************************************************************
859         
860    Write utf symbol into file.
861
862 *******************************************************************************/
863
864 void utf_fprint(FILE *file, utf *u)
865 {
866         char *endpos;                       /* points behind utf string           */
867         char *utf_ptr;                      /* current position in utf text       */
868
869         if (!u)
870                 return;
871
872         endpos = utf_end(u);
873         utf_ptr = u->text;
874
875         while (utf_ptr < endpos) { 
876                 /* read next unicode character */                
877                 u2 c = utf_nextu2(&utf_ptr);                            
878
879                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
880                 else fprintf(file, "?");
881         }
882 }
883
884
885 /* utf_fprint_classname ********************************************************
886         
887    Write utf symbol into file with `/' converted to `.'.
888
889 *******************************************************************************/
890
891 void utf_fprint_classname(FILE *file, utf *u)
892 {
893         char *endpos;                       /* points behind utf string           */
894         char *utf_ptr;                      /* current position in utf text       */
895
896     if (!u)
897                 return;
898
899         endpos = utf_end(u);
900         utf_ptr = u->text;
901
902         while (utf_ptr < endpos) { 
903                 /* read next unicode character */                
904                 u2 c = utf_nextu2(&utf_ptr);                            
905                 if (c == '/') c = '.';
906
907                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
908                 else fprintf(file, "?");
909         }
910 }
911
912
913 /* is_valid_utf ****************************************************************
914
915    Return true if the given string is a valid UTF-8 string.
916
917    utf_ptr...points to first character
918    end_pos...points after last character
919
920 *******************************************************************************/
921
922 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
923
924 bool is_valid_utf(char *utf_ptr, char *end_pos)
925 {
926         int bytes;
927         int len,i;
928         char c;
929         unsigned long v;
930
931         if (end_pos < utf_ptr) return false;
932         bytes = end_pos - utf_ptr;
933         while (bytes--) {
934                 c = *utf_ptr++;
935
936                 if (!c) return false;                     /* 0x00 is not allowed */
937                 if ((c & 0x80) == 0) continue;            /* ASCII */
938
939                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
940                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
941                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
942                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
943                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
944                 else return false;                        /* invalid leading byte */
945
946                 if (len > 2) return false;                /* Java limitation */
947
948                 v = (unsigned long)c & (0x3f >> len);
949                 
950                 if ((bytes -= len) < 0) return false;     /* missing bytes */
951
952                 for (i = len; i--; ) {
953                         c = *utf_ptr++;
954                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
955                                 return false;
956                         v = (v << 6) | (c & 0x3f);
957                 }
958
959                 if (v == 0) {
960                         if (len != 1) return false;           /* Java special */
961
962                 } else {
963                         /* Sun Java seems to allow overlong UTF-8 encodings */
964                         
965                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
966                                 if (!opt_liberalutf)
967                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
968                                 /* XXX change this to exception? */
969                         }
970                 }
971
972                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
973                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
974
975                 /* even these seem to be allowed */
976                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
977         }
978
979         return true;
980 }
981
982
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control characters
   and the two byte encoding of zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

   Only bytes inside [utf_ptr, end_pos) are examined. In particular a 0xc0
   that is the very last byte of the range is not followed up beyond
   end_pos.

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
        if (end_pos <= utf_ptr)
                return false;                   /* disallow empty names */

        while (utf_ptr < end_pos) {
                unsigned char c = (unsigned char) *utf_ptr++;

                if (c < 0x20)
                        return false;               /* disallow control characters */

                /* Disallow the encoded zero. The second byte is only looked at  */
                /* while it still belongs to the string, so nothing behind       */
                /* end_pos is ever read.                                         */

                if (c == 0xc0 && utf_ptr < end_pos &&
                        (unsigned char) *utf_ptr == 0x80)
                        return false;
        }

        return true;
}
1010
1011 bool is_valid_name_utf(utf *u)
1012 {
1013         return is_valid_name(u->text,utf_end(u));
1014 }
1015
1016
1017 /* utf_show ********************************************************************
1018
1019    Writes the utf symbols in the utfhash to stdout and displays the
1020    number of external hash chains grouped according to the chainlength
1021    (for debugging purposes).
1022
1023 *******************************************************************************/
1024
1025 void utf_show(void)
1026 {
1027
1028 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1029
1030         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1031         u4 max_chainlength = 0;      /* maximum length of the chains */
1032         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1033         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1034         u4 i;
1035
1036         printf ("UTF-HASH:\n");
1037
1038         /* show element of utf-hashtable */
1039         for (i=0; i<utf_hash.size; i++) {
1040                 utf *u = utf_hash.ptr[i];
1041                 if (u) {
1042                         printf ("SLOT %d: ", (int) i);
1043                         while (u) {
1044                                 printf ("'");
1045                                 utf_display (u);
1046                                 printf ("' ");
1047                                 u = u->hashlink;
1048                         }       
1049                         printf ("\n");
1050                 }
1051                 
1052         }
1053
1054         printf ("UTF-HASH: %d slots for %d entries\n", 
1055                         (int) utf_hash.size, (int) utf_hash.entries );
1056
1057
1058         if (utf_hash.entries == 0)
1059                 return;
1060
1061         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1062
1063         for (i=0;i<CHAIN_LIMIT;i++)
1064                 chain_count[i]=0;
1065
1066         /* count numbers of hashchains according to their length */
1067         for (i=0; i<utf_hash.size; i++) {
1068                   
1069                 utf *u = (utf*) utf_hash.ptr[i];
1070                 u4 chain_length = 0;
1071
1072                 /* determine chainlength */
1073                 while (u) {
1074                         u = u->hashlink;
1075                         chain_length++;
1076                 }
1077
1078                 /* update sum of all chainlengths */
1079                 sum_chainlength+=chain_length;
1080
1081                 /* determine the maximum length of the chains */
1082                 if (chain_length>max_chainlength)
1083                         max_chainlength = chain_length;
1084
1085                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1086                 if (chain_length>=CHAIN_LIMIT) {
1087                         beyond_limit+=chain_length;
1088                         chain_length=CHAIN_LIMIT-1;
1089                 }
1090
1091                 /* update number of hashchains of current length */
1092                 chain_count[chain_length]++;
1093         }
1094
1095         /* display results */  
1096         for (i=1;i<CHAIN_LIMIT-1;i++) 
1097                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1098           
1099         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1100
1101
1102         printf("max. chainlength:%5d\n",max_chainlength);
1103
1104         /* avg. chainlength = sum of chainlengths / number of chains */
1105         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1106 }
1107
1108
1109 /*
1110  * These are local overrides for various environment variables in Emacs.
1111  * Please do not remove this and leave it at the end of the file, where
1112  * Emacs will automagically detect them.
1113  * ---------------------------------------------------------------------
1114  * Local variables:
1115  * mode: c
1116  * indent-tabs-mode: t
1117  * c-basic-offset: 4
1118  * tab-width: 4
1119  * End:
1120  */