FromReflectedMethod can be called for methods and constructors
[cacao.git] / src / vm / utf8.c
/* src/vm/utf8.c - utf functions
2
3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
6    Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23    02111-1307, USA.
24
25    Contact: cacao@complang.tuwien.ac.at
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 2427 2005-05-01 12:27:54Z jowenn $
34
35 */
36
37
38 #include <string.h>
39
40 #include "mm/memory.h"
41 #include "vm/exceptions.h"
42 #include "vm/options.h"
43 #include "vm/statistics.h"
44 #include "vm/tables.h"
45 #include "vm/utf8.h"
46
47
48 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
49
50
51 /* utf-symbols for pointer comparison of frequently used strings **************/
52
53 utf *utf_java_lang_Object;              /* java/lang/Object                   */
54
55 utf *utf_java_lang_Class;
56 utf *utf_java_lang_ClassLoader;
57 utf *utf_java_lang_Cloneable;
58 utf *utf_java_lang_SecurityManager;
59 utf *utf_java_lang_String;
60 utf *utf_java_lang_System;
61 utf *utf_java_lang_ThreadGroup;
62 utf *utf_java_io_Serializable;
63
64 utf *utf_java_lang_Throwable;
65 utf *utf_java_lang_VMThrowable;
66 utf *utf_java_lang_Error;
67 utf *utf_java_lang_Exception;
68 utf *utf_java_lang_NoClassDefFoundError;
69 utf *utf_java_lang_OutOfMemoryError;
70 utf *utf_java_lang_ClassNotFoundException;
71
72 utf* utf_java_lang_Void;
73 utf* utf_java_lang_Boolean;
74 utf* utf_java_lang_Byte;
75 utf* utf_java_lang_Character;
76 utf* utf_java_lang_Short;
77 utf* utf_java_lang_Integer;
78 utf* utf_java_lang_Long;
79 utf* utf_java_lang_Float;
80 utf* utf_java_lang_Double;
81
82 utf *utf_java_util_Vector;
83 utf *utf_java_lang_reflect_Constructor;
84 utf *utf_java_lang_reflect_Method;
85
86
87 utf *utf_InnerClasses;                  /* InnerClasses                       */
88 utf *utf_ConstantValue;                 /* ConstantValue                      */
89 utf *utf_Code;                          /* Code                               */
90 utf *utf_Exceptions;                    /* Exceptions                         */
91 utf *utf_LineNumberTable;               /* LineNumberTable                    */
92 utf *utf_SourceFile;                    /* SourceFile                         */
93
94 utf *utf_init;                          /* <init>                             */
95 utf *utf_clinit;                        /* <clinit>                           */
96 utf *utf_finalize;                      /* finalize                           */
97
98 utf *utf_printStackTrace;
99 utf *utf_fillInStackTrace;
100 utf *utf_loadClass;
101
102 utf *utf_void__void;                    /* ()V                                */
103 utf *utf_boolean__void;                 /* (Z)V                               */
104 utf *utf_byte__void;                    /* (B)V                               */
105 utf *utf_char__void;                    /* (C)V                               */
106 utf *utf_short__void;                   /* (S)V                               */
107 utf *utf_int__void;                     /* (I)V                               */
108 utf *utf_long__void;                    /* (J)V                               */
109 utf *utf_float__void;                   /* (F)V                               */
110 utf *utf_double__void;                  /* (D)V                               */
111 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
112 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
113 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
114 utf *utf_java_lang_String__java_lang_Class;
115 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
116
117 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
118
119 utf *array_packagename;
120
121
122 /* utf_init ********************************************************************
123
124    Initializes the utf8 subsystem.
125
126 *******************************************************************************/
127
128 void utf8_init(void)
129 {
130         /* create utf-symbols for pointer comparison of frequently used strings */
131
132         utf_java_lang_Object           = utf_new_char("java/lang/Object");
133
134         utf_java_lang_Class            = utf_new_char("java/lang/Class");
135         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
136         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
137         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
138         utf_java_lang_String           = utf_new_char("java/lang/String");
139         utf_java_lang_System           = utf_new_char("java/lang/System");
140         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
141         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
142
143         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
144         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
145         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
146         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
147
148         utf_java_lang_NoClassDefFoundError =
149                 utf_new_char(string_java_lang_NoClassDefFoundError);
150
151         utf_java_lang_OutOfMemoryError =
152                 utf_new_char(string_java_lang_OutOfMemoryError);
153
154         utf_java_lang_ClassNotFoundException =
155                 utf_new_char(string_java_lang_ClassNotFoundException);
156
157         utf_java_lang_Void             = utf_new_char("java/lang/Void");
158         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
159         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
160         utf_java_lang_Character        = utf_new_char("java/lang/Character");
161         utf_java_lang_Short            = utf_new_char("java/lang/Short");
162         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
163         utf_java_lang_Long             = utf_new_char("java/lang/Long");
164         utf_java_lang_Float            = utf_new_char("java/lang/Float");
165         utf_java_lang_Double           = utf_new_char("java/lang/Double");
166
167         utf_java_util_Vector           = utf_new_char("java/util/Vector");
168         utf_java_lang_reflect_Constructor = utf_new_char("java/lang/reflect/Constructor");
169         utf_java_lang_reflect_Method      = utf_new_char("java/lang/reflect/Method");
170
171         utf_InnerClasses               = utf_new_char("InnerClasses");
172         utf_ConstantValue              = utf_new_char("ConstantValue");
173         utf_Code                       = utf_new_char("Code");
174         utf_Exceptions                 = utf_new_char("Exceptions");
175         utf_LineNumberTable            = utf_new_char("LineNumberTable");
176         utf_SourceFile                 = utf_new_char("SourceFile");
177
178         utf_init                           = utf_new_char("<init>");
179         utf_clinit                         = utf_new_char("<clinit>");
180         utf_finalize                   = utf_new_char("finalize");
181
182         utf_printStackTrace            = utf_new_char("printStackTrace");
183         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
184         utf_loadClass                  = utf_new_char("loadClass");
185
186         utf_void__void                 = utf_new_char("()V");
187         utf_boolean__void              = utf_new_char("(Z)V");
188         utf_byte__void                 = utf_new_char("(B)V");
189         utf_char__void                 = utf_new_char("(C)V");
190         utf_short__void                = utf_new_char("(S)V");
191         utf_int__void                  = utf_new_char("(I)V");
192         utf_long__void                 = utf_new_char("(J)V");
193         utf_float__void                = utf_new_char("(F)V");
194         utf_double__void               = utf_new_char("(D)V");
195         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
196         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
197         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
198
199         utf_java_lang_String__java_lang_Class =
200                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
201
202         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
203
204         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
205
206         array_packagename              = utf_new_char("\t<the array package>");
207 }
208
209
210 /* utf_hashkey *****************************************************************
211
212    The hashkey is computed from the utf-text by using up to 8
213    characters.  For utf-symbols longer than 15 characters 3 characters
214    are taken from the beginning and the end, 2 characters are taken
215    from the middle.
216
217 *******************************************************************************/
218
219 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
220 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
221
222 u4 utf_hashkey(const char *text, u4 length)
223 {
224         const char *start_pos = text;       /* pointer to utf text                */
225         u4 a;
226
227         switch (length) {
228         case 0: /* empty string */
229                 return 0;
230
231         case 1: return fbs(0);
232         case 2: return fbs(0) ^ nbs(3);
233         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
234         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
235         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
236         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
237         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
238         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
239
240         case 9:
241                 a = fbs(0);
242                 a ^= nbs(1);
243                 a ^= nbs(2);
244                 text++;
245                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
246
247         case 10:
248                 a = fbs(0);
249                 text++;
250                 a ^= nbs(2);
251                 a ^= nbs(3);
252                 a ^= nbs(4);
253                 text++;
254                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
255
256         case 11:
257                 a = fbs(0);
258                 text++;
259                 a ^= nbs(2);
260                 a ^= nbs(3);
261                 a ^= nbs(4);
262                 text++;
263                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
264
265         case 12:
266                 a = fbs(0);
267                 text += 2;
268                 a ^= nbs(2);
269                 a ^= nbs(3);
270                 text++;
271                 a ^= nbs(5);
272                 a ^= nbs(6);
273                 a ^= nbs(7);
274                 text++;
275                 return a ^ nbs(9) ^ nbs(10);
276
277         case 13:
278                 a = fbs(0);
279                 a ^= nbs(1);
280                 text++;
281                 a ^= nbs(3);
282                 a ^= nbs(4);
283                 text += 2;      
284                 a ^= nbs(7);
285                 a ^= nbs(8);
286                 text += 2;
287                 return a ^ nbs(9) ^ nbs(10);
288
289         case 14:
290                 a = fbs(0);
291                 text += 2;      
292                 a ^= nbs(3);
293                 a ^= nbs(4);
294                 text += 2;      
295                 a ^= nbs(7);
296                 a ^= nbs(8);
297                 text += 2;
298                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
299
300         case 15:
301                 a = fbs(0);
302                 text += 2;      
303                 a ^= nbs(3);
304                 a ^= nbs(4);
305                 text += 2;      
306                 a ^= nbs(7);
307                 a ^= nbs(8);
308                 text += 2;
309                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
310
311         default:  /* 3 characters from beginning */
312                 a = fbs(0);
313                 text += 2;
314                 a ^= nbs(3);
315                 a ^= nbs(4);
316
317                 /* 2 characters from middle */
318                 text = start_pos + (length / 2);
319                 a ^= fbs(5);
320                 text += 2;
321                 a ^= nbs(6);    
322
323                 /* 3 characters from end */
324                 text = start_pos + length - 4;
325
326                 a ^= fbs(7);
327                 text++;
328
329                 return a ^ nbs(10) ^ nbs(11);
330     }
331 }
332
333
334 /* utf_hashkey *****************************************************************
335
336    Compute the hashkey of a unicode string.
337
338 *******************************************************************************/
339
340 u4 unicode_hashkey(u2 *text, u2 len)
341 {
342         return utf_hashkey((char *) text, len);
343 }
344
345
346 /* utf_new *********************************************************************
347
348    Creates a new utf-symbol, the text of the symbol is passed as a
349    u1-array. The function searches the utf-hashtable for a utf-symbol
350    with this text. On success the element returned, otherwise a new
351    hashtable element is created.
352
353    If the number of entries in the hashtable exceeds twice the size of
354    the hashtable slots a reorganization of the hashtable is done and
355    the utf symbols are copied to a new hashtable with doubled size.
356
357 *******************************************************************************/
358
359 utf *utf_new_intern(const char *text, u2 length);
360
361 utf *utf_new(const char *text, u2 length)
362 {
363     utf *r;
364
365 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
366     tables_lock();
367 #endif
368
369     r = utf_new_intern(text, length);
370
371 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
372     tables_unlock();
373 #endif
374
375     return r;
376 }
377
378
379 utf *utf_new_intern(const char *text, u2 length)
380 {
381         u4 key;                             /* hashkey computed from utf-text     */
382         u4 slot;                            /* slot in hashtable                  */
383         utf *u;                             /* hashtable element                  */
384         u2 i;
385
386 #ifdef STATISTICS
387         if (opt_stat)
388                 count_utf_new++;
389 #endif
390
391         key  = utf_hashkey(text, length);
392         slot = key & (utf_hash.size - 1);
393         u    = utf_hash.ptr[slot];
394
395         /* search external hash chain for utf-symbol */
396         while (u) {
397                 if (u->blength == length) {
398
399                         /* compare text of hashtable elements */
400                         for (i = 0; i < length; i++)
401                                 if (text[i] != u->text[i]) goto nomatch;
402                         
403 #ifdef STATISTICS
404                         if (opt_stat)
405                                 count_utf_new_found++;
406 #endif
407
408                         /* symbol found in hashtable */
409                         return u;
410                 }
411         nomatch:
412                 u = u->hashlink; /* next element in external chain */
413         }
414
415 #ifdef STATISTICS
416         if (opt_stat)
417                 count_utf_len += sizeof(utf) + length;
418 #endif
419
420         /* location in hashtable found, create new utf element */
421         u = NEW(utf);
422         u->blength  = length;               /* length in bytes of utfstring       */
423         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
424         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
425         memcpy(u->text, text, length);      /* copy utf-text                      */
426         u->text[length] = '\0';
427         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
428
429         utf_hash.entries++;                 /* update number of entries           */
430
431         if (utf_hash.entries > (utf_hash.size * 2)) {
432
433         /* reorganization of hashtable, average length of 
434            the external chains is approx. 2                */  
435
436                 u4 i;
437                 utf *u;
438                 hashtable newhash; /* the new hashtable */
439
440                 /* create new hashtable, double the size */
441                 init_hashtable(&newhash, utf_hash.size * 2);
442                 newhash.entries = utf_hash.entries;
443
444 #ifdef STATISTICS
445                 if (opt_stat)
446                         count_utf_len += sizeof(utf*) * utf_hash.size;
447 #endif
448
449                 /* transfer elements to new hashtable */
450                 for (i = 0; i < utf_hash.size; i++) {
451                         u = (utf *) utf_hash.ptr[i];
452                         while (u) {
453                                 utf *nextu = u->hashlink;
454                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
455                                                 
456                                 u->hashlink = (utf *) newhash.ptr[slot];
457                                 newhash.ptr[slot] = u;
458
459                                 /* follow link in external hash chain */
460                                 u = nextu;
461                         }
462                 }
463         
464                 /* dispose old table */
465                 MFREE(utf_hash.ptr, void*, utf_hash.size);
466                 utf_hash = newhash;
467         }
468
469         return u;
470 }
471
472
473 /* utf_new_u2 ******************************************************************
474
475    Make utf symbol from u2 array, if isclassname is true '.' is
476    replaced by '/'.
477
478 *******************************************************************************/
479
480 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
481 {
482         char *buffer;                   /* memory buffer for  unicode characters  */
483         char *pos;                      /* pointer to current position in buffer  */
484         u4 left;                        /* unicode characters left                */
485         u4 buflength;                   /* utf length in bytes of the u2 array    */
486         utf *result;                    /* resulting utf-string                   */
487         int i;          
488
489         /* determine utf length in bytes and allocate memory */
490
491         buflength = u2_utflength(unicode_pos, unicode_length); 
492         buffer    = MNEW(char, buflength);
493  
494         left = buflength;
495         pos  = buffer;
496
497         for (i = 0; i++ < unicode_length; unicode_pos++) {
498                 /* next unicode character */
499                 u2 c = *unicode_pos;
500                 
501                 if ((c != 0) && (c < 0x80)) {
502                         /* 1 character */       
503                         left--;
504                 if ((int) left < 0) break;
505                         /* convert classname */
506                         if (isclassname && c == '.')
507                                 *pos++ = '/';
508                         else
509                                 *pos++ = (char) c;
510
511                 } else if (c < 0x800) {             
512                         /* 2 characters */                              
513                 unsigned char high = c >> 6;
514                 unsigned char low  = c & 0x3F;
515                         left = left - 2;
516                 if ((int) left < 0) break;
517                 *pos++ = high | 0xC0; 
518                 *pos++ = low  | 0x80;     
519
520                 } else {         
521                 /* 3 characters */                              
522                 char low  = c & 0x3f;
523                 char mid  = (c >> 6) & 0x3F;
524                 char high = c >> 12;
525                         left = left - 3;
526                 if ((int) left < 0) break;
527                 *pos++ = high | 0xE0; 
528                 *pos++ = mid  | 0x80;  
529                 *pos++ = low  | 0x80;   
530                 }
531         }
532         
533         /* insert utf-string into symbol-table */
534         result = utf_new(buffer,buflength);
535
536         MFREE(buffer, char, buflength);
537
538         return result;
539 }
540
541
542 /* utf_new_char ****************************************************************
543
544    Creates a new utf symbol, the text for this symbol is passed as a
545    c-string ( = char* ).
546
547 *******************************************************************************/
548
549 utf *utf_new_char(const char *text)
550 {
551         return utf_new(text, strlen(text));
552 }
553
554
555 /* utf_new_char_classname ******************************************************
556
557    Creates a new utf symbol, the text for this symbol is passed as a
558    c-string ( = char* ) "." characters are going to be replaced by
559    "/". Since the above function is used often, this is a separte
560    function, instead of an if.
561
562 *******************************************************************************/
563
564 utf *utf_new_char_classname(const char *text)
565 {
566         if (strchr(text, '.')) {
567                 char *txt = strdup(text);
568                 char *end = txt + strlen(txt);
569                 char *c;
570                 utf *tmpRes;
571
572                 for (c = txt; c < end; c++)
573                         if (*c == '.') *c = '/';
574
575                 tmpRes = utf_new(txt, strlen(txt));
576                 FREE(txt, 0);
577
578                 return tmpRes;
579
580         } else
581                 return utf_new(text, strlen(text));
582 }
583
584
585 /* utf_nextu2 ******************************************************************
586
587    Read the next unicode character from the utf string and increment
588    the utf-string pointer accordingly.
589
590 *******************************************************************************/
591
592 u2 utf_nextu2(char **utf_ptr)
593 {
594     /* uncompressed unicode character */
595     u2 unicode_char = 0;
596     /* current position in utf text */  
597     unsigned char *utf = (unsigned char *) (*utf_ptr);
598     /* bytes representing the unicode character */
599     unsigned char ch1, ch2, ch3;
600     /* number of bytes used to represent the unicode character */
601     int len = 0;
602         
603     switch ((ch1 = utf[0]) >> 4) {
604         default: /* 1 byte */
605                 (*utf_ptr)++;
606                 return (u2) ch1;
607         case 0xC: 
608         case 0xD: /* 2 bytes */
609                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
610                         unsigned char high = ch1 & 0x1F;
611                         unsigned char low  = ch2 & 0x3F;
612                         unicode_char = (high << 6) + low;
613                         len = 2;
614                 }
615                 break;
616
617         case 0xE: /* 2 or 3 bytes */
618                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
619                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
620                                 unsigned char low  = ch3 & 0x3f;
621                                 unsigned char mid  = ch2 & 0x3f;
622                                 unsigned char high = ch1 & 0x0f;
623                                 unicode_char = (((high << 6) + mid) << 6) + low;
624                                 len = 3;
625                         } else
626                                 len = 2;                                           
627                 }
628                 break;
629     }
630
631     /* update position in utf-text */
632     *utf_ptr = (char *) (utf + len);
633
634     return unicode_char;
635 }
636
637
638 /* utf_strlen ******************************************************************
639
640    Determine number of unicode characters in the utf string.
641
642 *******************************************************************************/
643
644 u4 utf_strlen(utf *u)
645 {
646         char *endpos;                       /* points behind utf string           */
647         char *utf_ptr;                      /* current position in utf text       */
648         u4 len = 0;                         /* number of unicode characters       */
649
650         if (!u) {
651                 *exceptionptr = new_nullpointerexception();
652                 return 0;
653         }
654
655         endpos = utf_end(u);
656         utf_ptr = u->text;
657
658         while (utf_ptr < endpos) {
659                 len++;
660                 /* next unicode character */
661                 utf_nextu2(&utf_ptr);
662         }
663
664         if (utf_ptr != endpos)
665                 /* string ended abruptly */
666                 throw_cacao_exception_exit(string_java_lang_InternalError,
667                                                                    "Illegal utf8 string");
668
669         return len;
670 }
671
672
673 /* u2_utflength ****************************************************************
674
675    Returns the utf length in bytes of a u2 array.
676
677 *******************************************************************************/
678
679 u4 u2_utflength(u2 *text, u4 u2_length)
680 {
681         u4 result_len = 0;                  /* utf length in bytes                */
682         u2 ch;                              /* current unicode character          */
683         u4 len;
684         
685         for (len = 0; len < u2_length; len++) {
686                 /* next unicode character */
687                 ch = *text++;
688           
689                 /* determine bytes required to store unicode character as utf */
690                 if (ch && (ch < 0x80)) 
691                         result_len++;
692                 else if (ch < 0x800)
693                         result_len += 2;        
694                 else 
695                         result_len += 3;        
696         }
697
698     return result_len;
699 }
700
701
702 /* utf_display *****************************************************************
703
704    Write utf symbol to stdout (for debugging purposes).
705
706 *******************************************************************************/
707
708 void utf_display(utf *u)
709 {
710         char *endpos;                       /* points behind utf string           */
711         char *utf_ptr;                      /* current position in utf text       */
712
713         if (!u) {
714                 printf("NULL");
715                 fflush(stdout);
716                 return;
717         }
718
719         endpos = utf_end(u);
720         utf_ptr = u->text;
721
722         while (utf_ptr < endpos) {
723                 /* read next unicode character */                
724                 u2 c = utf_nextu2(&utf_ptr);
725                 if (c >= 32 && c <= 127) printf("%c", c);
726                 else printf("?");
727         }
728
729         fflush(stdout);
730 }
731
732
733 /* utf_display_classname *******************************************************
734
735    Write utf symbol to stdout with `/' converted to `.' (for debugging
736    purposes).
737
738 *******************************************************************************/
739
740 void utf_display_classname(utf *u)
741 {
742         char *endpos;                       /* points behind utf string           */
743         char *utf_ptr;                      /* current position in utf text       */
744
745         if (!u) {
746                 printf("NULL");
747                 fflush(stdout);
748                 return;
749         }
750
751         endpos = utf_end(u);
752         utf_ptr = u->text;
753
754         while (utf_ptr < endpos) {
755                 /* read next unicode character */                
756                 u2 c = utf_nextu2(&utf_ptr);
757                 if (c == '/') c = '.';
758                 if (c >= 32 && c <= 127) printf("%c", c);
759                 else printf("?");
760         }
761
762         fflush(stdout);
763 }
764
765
766 /* utf_sprint ******************************************************************
767         
768    Write utf symbol into c-string (for debugging purposes).
769
770 *******************************************************************************/
771
772 void utf_sprint(char *buffer, utf *u)
773 {
774         char *endpos;                       /* points behind utf string           */
775         char *utf_ptr;                      /* current position in utf text       */
776         u2 pos = 0;                         /* position in c-string               */
777
778         if (!u) {
779                 strcpy(buffer, "NULL");
780                 return;
781         }
782
783         endpos = utf_end(u);
784         utf_ptr = u->text;
785
786         while (utf_ptr < endpos) 
787                 /* copy next unicode character */       
788                 buffer[pos++] = utf_nextu2(&utf_ptr);
789
790         /* terminate string */
791         buffer[pos] = '\0';
792 }
793
794
795 /* utf_sprint_classname ********************************************************
796         
797    Write utf symbol into c-string with `/' converted to `.' (for debugging
798    purposes).
799
800 *******************************************************************************/
801
802 void utf_sprint_classname(char *buffer, utf *u)
803 {
804         char *endpos;                       /* points behind utf string           */
805         char *utf_ptr;                      /* current position in utf text       */
806         u2 pos = 0;                         /* position in c-string               */
807
808         if (!u) {
809                 strcpy(buffer, "NULL");
810                 return;
811         }
812
813         endpos = utf_end(u);
814         utf_ptr = u->text;
815
816         while (utf_ptr < endpos) {
817                 /* copy next unicode character */       
818                 u2 c = utf_nextu2(&utf_ptr);
819                 if (c == '/') c = '.';
820                 buffer[pos++] = c;
821         }
822
823         /* terminate string */
824         buffer[pos] = '\0';
825 }
826
827
828 /* utf_strcat ******************************************************************
829         
830    Like libc strcat, but uses an utf8 string.
831
832 *******************************************************************************/
833
834 void utf_strcat(char *buffer, utf *u)
835 {
836         utf_sprint(buffer + strlen(buffer), u);
837 }
838
839
840 /* utf_strcat_classname ********************************************************
841         
842    Like libc strcat, but uses an utf8 string.
843
844 *******************************************************************************/
845
846 void utf_strcat_classname(char *buffer, utf *u)
847 {
848         utf_sprint_classname(buffer + strlen(buffer), u);
849 }
850
851
852 /* utf_fprint ******************************************************************
853         
854    Write utf symbol into file.
855
856 *******************************************************************************/
857
858 void utf_fprint(FILE *file, utf *u)
859 {
860         char *endpos;                       /* points behind utf string           */
861         char *utf_ptr;                      /* current position in utf text       */
862
863         if (!u)
864                 return;
865
866         endpos = utf_end(u);
867         utf_ptr = u->text;
868
869         while (utf_ptr < endpos) { 
870                 /* read next unicode character */                
871                 u2 c = utf_nextu2(&utf_ptr);                            
872
873                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
874                 else fprintf(file, "?");
875         }
876 }
877
878
879 /* utf_fprint_classname ********************************************************
880         
881    Write utf symbol into file with `/' converted to `.'.
882
883 *******************************************************************************/
884
885 void utf_fprint_classname(FILE *file, utf *u)
886 {
887         char *endpos;                       /* points behind utf string           */
888         char *utf_ptr;                      /* current position in utf text       */
889
890     if (!u)
891                 return;
892
893         endpos = utf_end(u);
894         utf_ptr = u->text;
895
896         while (utf_ptr < endpos) { 
897                 /* read next unicode character */                
898                 u2 c = utf_nextu2(&utf_ptr);                            
899                 if (c == '/') c = '.';
900
901                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
902                 else fprintf(file, "?");
903         }
904 }
905
906
907 /* is_valid_utf ****************************************************************
908
909    Return true if the given string is a valid UTF-8 string.
910
911    utf_ptr...points to first character
912    end_pos...points after last character
913
914 *******************************************************************************/
915
916 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
917
918 bool is_valid_utf(char *utf_ptr, char *end_pos)
919 {
920         int bytes;
921         int len,i;
922         char c;
923         unsigned long v;
924
925         if (end_pos < utf_ptr) return false;
926         bytes = end_pos - utf_ptr;
927         while (bytes--) {
928                 c = *utf_ptr++;
929
930                 if (!c) return false;                     /* 0x00 is not allowed */
931                 if ((c & 0x80) == 0) continue;            /* ASCII */
932
933                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
934                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
935                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
936                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
937                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
938                 else return false;                        /* invalid leading byte */
939
940                 if (len > 2) return false;                /* Java limitation */
941
942                 v = (unsigned long)c & (0x3f >> len);
943                 
944                 if ((bytes -= len) < 0) return false;     /* missing bytes */
945
946                 for (i = len; i--; ) {
947                         c = *utf_ptr++;
948                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
949                                 return false;
950                         v = (v << 6) | (c & 0x3f);
951                 }
952
953                 if (v == 0) {
954                         if (len != 1) return false;           /* Java special */
955
956                 } else {
957                         /* Sun Java seems to allow overlong UTF-8 encodings */
958                         
959                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
960                                 if (!opt_liberalutf)
961                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
962                                 /* XXX change this to panic? */
963                         }
964                 }
965
966                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
967                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
968
969                 /* even these seem to be allowed */
970                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
971         }
972
973         return true;
974 }
975
976
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the two-byte encoding of zero.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

   No byte at or behind end_pos is read.

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	/* disallow empty names */

	if (end_pos <= utf_ptr)
		return false;

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		/* disallow control characters */

		if (c < 0x20)
			return false;

		/* Disallow zero, which is encoded as 0xc0 0x80. The second byte
		   is only inspected if it still lies inside the string, so a
		   leading byte at the very end cannot cause a read past
		   end_pos. */

		if (c == 0xc0 && utf_ptr < end_pos &&
			(unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
1004
1005 bool is_valid_name_utf(utf *u)
1006 {
1007         return is_valid_name(u->text,utf_end(u));
1008 }
1009
1010
1011 /* utf_show ********************************************************************
1012
1013    Writes the utf symbols in the utfhash to stdout and displays the
1014    number of external hash chains grouped according to the chainlength
1015    (for debugging purposes).
1016
1017 *******************************************************************************/
1018
1019 void utf_show(void)
1020 {
1021
1022 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1023
1024         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1025         u4 max_chainlength = 0;      /* maximum length of the chains */
1026         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1027         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1028         u4 i;
1029
1030         printf ("UTF-HASH:\n");
1031
1032         /* show element of utf-hashtable */
1033         for (i=0; i<utf_hash.size; i++) {
1034                 utf *u = utf_hash.ptr[i];
1035                 if (u) {
1036                         printf ("SLOT %d: ", (int) i);
1037                         while (u) {
1038                                 printf ("'");
1039                                 utf_display (u);
1040                                 printf ("' ");
1041                                 u = u->hashlink;
1042                         }       
1043                         printf ("\n");
1044                 }
1045                 
1046         }
1047
1048         printf ("UTF-HASH: %d slots for %d entries\n", 
1049                         (int) utf_hash.size, (int) utf_hash.entries );
1050
1051
1052         if (utf_hash.entries == 0)
1053                 return;
1054
1055         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1056
1057         for (i=0;i<CHAIN_LIMIT;i++)
1058                 chain_count[i]=0;
1059
1060         /* count numbers of hashchains according to their length */
1061         for (i=0; i<utf_hash.size; i++) {
1062                   
1063                 utf *u = (utf*) utf_hash.ptr[i];
1064                 u4 chain_length = 0;
1065
1066                 /* determine chainlength */
1067                 while (u) {
1068                         u = u->hashlink;
1069                         chain_length++;
1070                 }
1071
1072                 /* update sum of all chainlengths */
1073                 sum_chainlength+=chain_length;
1074
1075                 /* determine the maximum length of the chains */
1076                 if (chain_length>max_chainlength)
1077                         max_chainlength = chain_length;
1078
1079                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1080                 if (chain_length>=CHAIN_LIMIT) {
1081                         beyond_limit+=chain_length;
1082                         chain_length=CHAIN_LIMIT-1;
1083                 }
1084
1085                 /* update number of hashchains of current length */
1086                 chain_count[chain_length]++;
1087         }
1088
1089         /* display results */  
1090         for (i=1;i<CHAIN_LIMIT-1;i++) 
1091                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1092           
1093         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1094
1095
1096         printf("max. chainlength:%5d\n",max_chainlength);
1097
1098         /* avg. chainlength = sum of chainlengths / number of chains */
1099         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1100 }
1101
1102
1103 /*
1104  * These are local overrides for various environment variables in Emacs.
1105  * Please do not remove this and leave it at the end of the file, where
1106  * Emacs will automagically detect them.
1107  * ---------------------------------------------------------------------
1108  * Local variables:
1109  * mode: c
1110  * indent-tabs-mode: t
1111  * c-basic-offset: 4
1112  * tab-width: 4
1113  * End:
1114  */