Lazy checkcast and instanceof.
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf8.c - utf functions
2
3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
6    Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23    02111-1307, USA.
24
25    Contact: cacao@complang.tuwien.ac.at
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 2158 2005-03-30 20:06:37Z twisti $
34
35 */
36
37
38 #include <string.h>
39
40 #include "mm/memory.h"
41 #include "vm/exceptions.h"
42 #include "vm/options.h"
43 #include "vm/statistics.h"
44 #include "vm/tables.h"
45 #include "vm/utf8.h"
46
47
48 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
49
50
51 /* utf-symbols for pointer comparison of frequently used strings **************/
52
53 utf *utf_java_lang_Object;              /* java/lang/Object                   */
54
55 utf *utf_java_lang_Class;
56 utf *utf_java_lang_ClassLoader;
57 utf *utf_java_lang_Cloneable;
58 utf *utf_java_lang_SecurityManager;
59 utf *utf_java_lang_String;
60 utf *utf_java_lang_System;
61 utf *utf_java_io_Serializable;
62
63 utf *utf_java_lang_Throwable;
64 utf *utf_java_lang_VMThrowable;
65 utf *utf_java_lang_Exception;
66 utf *utf_java_lang_Error;
67 utf *utf_java_lang_OutOfMemoryError;
68 utf *utf_java_lang_NoClassDefFoundError;
69
70 utf* utf_java_lang_Void;
71 utf* utf_java_lang_Boolean;
72 utf* utf_java_lang_Byte;
73 utf* utf_java_lang_Character;
74 utf* utf_java_lang_Short;
75 utf* utf_java_lang_Integer;
76 utf* utf_java_lang_Long;
77 utf* utf_java_lang_Float;
78 utf* utf_java_lang_Double;
79
80 utf *utf_java_util_Vector;
81
82 utf *utf_InnerClasses;                  /* InnerClasses                       */
83 utf *utf_ConstantValue;                 /* ConstantValue                      */
84 utf *utf_Code;                          /* Code                               */
85 utf *utf_Exceptions;                    /* Exceptions                         */
86 utf *utf_LineNumberTable;               /* LineNumberTable                    */
87 utf *utf_SourceFile;                    /* SourceFile                         */
88
89 utf *utf_init;                          /* <init>                             */
90 utf *utf_clinit;                        /* <clinit>                           */
91 utf *utf_finalize;                      /* finalize                           */
92
93 utf *utf_printStackTrace;
94 utf *utf_fillInStackTrace;
95 utf *utf_loadClass;
96
97 utf *utf_void__void;                    /* ()V                                */
98 utf *utf_boolean__void;                 /* (Z)V                               */
99 utf *utf_byte__void;                    /* (B)V                               */
100 utf *utf_char__void;                    /* (C)V                               */
101 utf *utf_short__void;                   /* (S)V                               */
102 utf *utf_int__void;                     /* (I)V                               */
103 utf *utf_long__void;                    /* (J)V                               */
104 utf *utf_float__void;                   /* (F)V                               */
105 utf *utf_double__void;                  /* (D)V                               */
106 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
107 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
108 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
109 utf *utf_java_lang_String__java_lang_Class;
110 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
111
112 utf *array_packagename;
113
114
115 /* utf_init ********************************************************************
116
117    Initializes the utf8 subsystem.
118
119 *******************************************************************************/
120
121 void utf8_init(void)
122 {
123         /* create utf-symbols for pointer comparison of frequently used strings */
124
125         utf_java_lang_Object           = utf_new_char("java/lang/Object");
126
127         utf_java_lang_Class            = utf_new_char("java/lang/Class");
128         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
129         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
130         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
131         utf_java_lang_String           = utf_new_char("java/lang/String");
132         utf_java_lang_System           = utf_new_char("java/lang/System");
133         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
134
135         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
136         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
137         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
138         utf_java_lang_Error            = utf_new_char("java/lang/Error");
139         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
140
141         utf_java_lang_NoClassDefFoundError =
142                 utf_new_char(string_java_lang_NoClassDefFoundError);
143
144         utf_java_lang_Void             = utf_new_char("java/lang/Void");
145         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
146         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
147         utf_java_lang_Character        = utf_new_char("java/lang/Character");
148         utf_java_lang_Short            = utf_new_char("java/lang/Short");
149         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
150         utf_java_lang_Long             = utf_new_char("java/lang/Long");
151         utf_java_lang_Float            = utf_new_char("java/lang/Float");
152         utf_java_lang_Double           = utf_new_char("java/lang/Double");
153
154         utf_java_util_Vector           = utf_new_char("java/util/Vector");
155
156         utf_InnerClasses               = utf_new_char("InnerClasses");
157         utf_ConstantValue              = utf_new_char("ConstantValue");
158         utf_Code                       = utf_new_char("Code");
159         utf_Exceptions                 = utf_new_char("Exceptions");
160         utf_LineNumberTable            = utf_new_char("LineNumberTable");
161         utf_SourceFile                 = utf_new_char("SourceFile");
162
163         utf_init                           = utf_new_char("<init>");
164         utf_clinit                         = utf_new_char("<clinit>");
165         utf_finalize                   = utf_new_char("finalize");
166
167         utf_printStackTrace            = utf_new_char("printStackTrace");
168         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
169         utf_loadClass                  = utf_new_char("loadClass");
170
171         utf_void__void                 = utf_new_char("()V");
172         utf_boolean__void              = utf_new_char("(Z)V");
173         utf_byte__void                 = utf_new_char("(B)V");
174         utf_char__void                 = utf_new_char("(C)V");
175         utf_short__void                = utf_new_char("(S)V");
176         utf_int__void                  = utf_new_char("(I)V");
177         utf_long__void                 = utf_new_char("(J)V");
178         utf_float__void                = utf_new_char("(F)V");
179         utf_double__void               = utf_new_char("(D)V");
180         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
181         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
182         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
183
184         utf_java_lang_String__java_lang_Class =
185                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
186
187         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
188
189         array_packagename              = utf_new_char("\t<the array package>");
190 }
191
192
193 /* utf_hashkey *****************************************************************
194
195    The hashkey is computed from the utf-text by using up to 8
196    characters.  For utf-symbols longer than 15 characters 3 characters
197    are taken from the beginning and the end, 2 characters are taken
198    from the middle.
199
200 *******************************************************************************/
201
202 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
203 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
204
205 u4 utf_hashkey(const char *text, u4 length)
206 {
207         const char *start_pos = text;       /* pointer to utf text                */
208         u4 a;
209
210         switch (length) {
211         case 0: /* empty string */
212                 return 0;
213
214         case 1: return fbs(0);
215         case 2: return fbs(0) ^ nbs(3);
216         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
217         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
218         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
219         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
220         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
221         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
222
223         case 9:
224                 a = fbs(0);
225                 a ^= nbs(1);
226                 a ^= nbs(2);
227                 text++;
228                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
229
230         case 10:
231                 a = fbs(0);
232                 text++;
233                 a ^= nbs(2);
234                 a ^= nbs(3);
235                 a ^= nbs(4);
236                 text++;
237                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
238
239         case 11:
240                 a = fbs(0);
241                 text++;
242                 a ^= nbs(2);
243                 a ^= nbs(3);
244                 a ^= nbs(4);
245                 text++;
246                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
247
248         case 12:
249                 a = fbs(0);
250                 text += 2;
251                 a ^= nbs(2);
252                 a ^= nbs(3);
253                 text++;
254                 a ^= nbs(5);
255                 a ^= nbs(6);
256                 a ^= nbs(7);
257                 text++;
258                 return a ^ nbs(9) ^ nbs(10);
259
260         case 13:
261                 a = fbs(0);
262                 a ^= nbs(1);
263                 text++;
264                 a ^= nbs(3);
265                 a ^= nbs(4);
266                 text += 2;      
267                 a ^= nbs(7);
268                 a ^= nbs(8);
269                 text += 2;
270                 return a ^ nbs(9) ^ nbs(10);
271
272         case 14:
273                 a = fbs(0);
274                 text += 2;      
275                 a ^= nbs(3);
276                 a ^= nbs(4);
277                 text += 2;      
278                 a ^= nbs(7);
279                 a ^= nbs(8);
280                 text += 2;
281                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
282
283         case 15:
284                 a = fbs(0);
285                 text += 2;      
286                 a ^= nbs(3);
287                 a ^= nbs(4);
288                 text += 2;      
289                 a ^= nbs(7);
290                 a ^= nbs(8);
291                 text += 2;
292                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
293
294         default:  /* 3 characters from beginning */
295                 a = fbs(0);
296                 text += 2;
297                 a ^= nbs(3);
298                 a ^= nbs(4);
299
300                 /* 2 characters from middle */
301                 text = start_pos + (length / 2);
302                 a ^= fbs(5);
303                 text += 2;
304                 a ^= nbs(6);    
305
306                 /* 3 characters from end */
307                 text = start_pos + length - 4;
308
309                 a ^= fbs(7);
310                 text++;
311
312                 return a ^ nbs(10) ^ nbs(11);
313     }
314 }
315
316
317 /* utf_hashkey *****************************************************************
318
319    Compute the hashkey of a unicode string.
320
321 *******************************************************************************/
322
323 u4 unicode_hashkey(u2 *text, u2 len)
324 {
325         return utf_hashkey((char *) text, len);
326 }
327
328
329 /* utf_new *********************************************************************
330
331    Creates a new utf-symbol, the text of the symbol is passed as a
332    u1-array. The function searches the utf-hashtable for a utf-symbol
333    with this text. On success the element returned, otherwise a new
334    hashtable element is created.
335
336    If the number of entries in the hashtable exceeds twice the size of
337    the hashtable slots a reorganization of the hashtable is done and
338    the utf symbols are copied to a new hashtable with doubled size.
339
340 *******************************************************************************/
341
342 utf *utf_new_intern(const char *text, u2 length);
343
344 utf *utf_new(const char *text, u2 length)
345 {
346     utf *r;
347
348 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
349     tables_lock();
350 #endif
351
352     r = utf_new_intern(text, length);
353
354 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
355     tables_unlock();
356 #endif
357
358     return r;
359 }
360
361
362 utf *utf_new_intern(const char *text, u2 length)
363 {
364         u4 key;                             /* hashkey computed from utf-text     */
365         u4 slot;                            /* slot in hashtable                  */
366         utf *u;                             /* hashtable element                  */
367         u2 i;
368
369 #ifdef STATISTICS
370         if (opt_stat)
371                 count_utf_new++;
372 #endif
373
374         key  = utf_hashkey(text, length);
375         slot = key & (utf_hash.size - 1);
376         u    = utf_hash.ptr[slot];
377
378         /* search external hash chain for utf-symbol */
379         while (u) {
380                 if (u->blength == length) {
381
382                         /* compare text of hashtable elements */
383                         for (i = 0; i < length; i++)
384                                 if (text[i] != u->text[i]) goto nomatch;
385                         
386 #ifdef STATISTICS
387                         if (opt_stat)
388                                 count_utf_new_found++;
389 #endif
390
391                         /* symbol found in hashtable */
392                         return u;
393                 }
394         nomatch:
395                 u = u->hashlink; /* next element in external chain */
396         }
397
398 #ifdef STATISTICS
399         if (opt_stat)
400                 count_utf_len += sizeof(utf) + length;
401 #endif
402
403         /* location in hashtable found, create new utf element */
404         u = NEW(utf);
405         u->blength  = length;               /* length in bytes of utfstring       */
406         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
407         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
408         memcpy(u->text, text, length);      /* copy utf-text                      */
409         u->text[length] = '\0';
410         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
411
412         utf_hash.entries++;                 /* update number of entries           */
413
414         if (utf_hash.entries > (utf_hash.size * 2)) {
415
416         /* reorganization of hashtable, average length of 
417            the external chains is approx. 2                */  
418
419                 u4 i;
420                 utf *u;
421                 hashtable newhash; /* the new hashtable */
422
423                 /* create new hashtable, double the size */
424                 init_hashtable(&newhash, utf_hash.size * 2);
425                 newhash.entries = utf_hash.entries;
426
427 #ifdef STATISTICS
428                 if (opt_stat)
429                         count_utf_len += sizeof(utf*) * utf_hash.size;
430 #endif
431
432                 /* transfer elements to new hashtable */
433                 for (i = 0; i < utf_hash.size; i++) {
434                         u = (utf *) utf_hash.ptr[i];
435                         while (u) {
436                                 utf *nextu = u->hashlink;
437                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
438                                                 
439                                 u->hashlink = (utf *) newhash.ptr[slot];
440                                 newhash.ptr[slot] = u;
441
442                                 /* follow link in external hash chain */
443                                 u = nextu;
444                         }
445                 }
446         
447                 /* dispose old table */
448                 MFREE(utf_hash.ptr, void*, utf_hash.size);
449                 utf_hash = newhash;
450         }
451
452         return u;
453 }
454
455
456 /* utf_new_u2 ******************************************************************
457
458    Make utf symbol from u2 array, if isclassname is true '.' is
459    replaced by '/'.
460
461 *******************************************************************************/
462
463 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
464 {
465         char *buffer;                   /* memory buffer for  unicode characters  */
466         char *pos;                      /* pointer to current position in buffer  */
467         u4 left;                        /* unicode characters left                */
468         u4 buflength;                   /* utf length in bytes of the u2 array    */
469         utf *result;                    /* resulting utf-string                   */
470         int i;          
471
472         /* determine utf length in bytes and allocate memory */
473
474         buflength = u2_utflength(unicode_pos, unicode_length); 
475         buffer    = MNEW(char, buflength);
476  
477         left = buflength;
478         pos  = buffer;
479
480         for (i = 0; i++ < unicode_length; unicode_pos++) {
481                 /* next unicode character */
482                 u2 c = *unicode_pos;
483                 
484                 if ((c != 0) && (c < 0x80)) {
485                         /* 1 character */       
486                         left--;
487                 if ((int) left < 0) break;
488                         /* convert classname */
489                         if (isclassname && c == '.')
490                                 *pos++ = '/';
491                         else
492                                 *pos++ = (char) c;
493
494                 } else if (c < 0x800) {             
495                         /* 2 characters */                              
496                 unsigned char high = c >> 6;
497                 unsigned char low  = c & 0x3F;
498                         left = left - 2;
499                 if ((int) left < 0) break;
500                 *pos++ = high | 0xC0; 
501                 *pos++ = low  | 0x80;     
502
503                 } else {         
504                 /* 3 characters */                              
505                 char low  = c & 0x3f;
506                 char mid  = (c >> 6) & 0x3F;
507                 char high = c >> 12;
508                         left = left - 3;
509                 if ((int) left < 0) break;
510                 *pos++ = high | 0xE0; 
511                 *pos++ = mid  | 0x80;  
512                 *pos++ = low  | 0x80;   
513                 }
514         }
515         
516         /* insert utf-string into symbol-table */
517         result = utf_new(buffer,buflength);
518
519         MFREE(buffer, char, buflength);
520
521         return result;
522 }
523
524
525 /* utf_new_char ****************************************************************
526
527    Creates a new utf symbol, the text for this symbol is passed as a
528    c-string ( = char* ).
529
530 *******************************************************************************/
531
532 utf *utf_new_char(const char *text)
533 {
534         return utf_new(text, strlen(text));
535 }
536
537
538 /* utf_new_char_classname ******************************************************
539
540    Creates a new utf symbol, the text for this symbol is passed as a
541    c-string ( = char* ) "." characters are going to be replaced by
542    "/". Since the above function is used often, this is a separte
543    function, instead of an if.
544
545 *******************************************************************************/
546
547 utf *utf_new_char_classname(const char *text)
548 {
549         if (strchr(text, '.')) {
550                 char *txt = strdup(text);
551                 char *end = txt + strlen(txt);
552                 char *c;
553                 utf *tmpRes;
554
555                 for (c = txt; c < end; c++)
556                         if (*c == '.') *c = '/';
557
558                 tmpRes = utf_new(txt, strlen(txt));
559                 FREE(txt, 0);
560
561                 return tmpRes;
562
563         } else
564                 return utf_new(text, strlen(text));
565 }
566
567
568 /* utf_nextu2 ******************************************************************
569
570    Read the next unicode character from the utf string and increment
571    the utf-string pointer accordingly.
572
573 *******************************************************************************/
574
575 u2 utf_nextu2(char **utf_ptr)
576 {
577     /* uncompressed unicode character */
578     u2 unicode_char = 0;
579     /* current position in utf text */  
580     unsigned char *utf = (unsigned char *) (*utf_ptr);
581     /* bytes representing the unicode character */
582     unsigned char ch1, ch2, ch3;
583     /* number of bytes used to represent the unicode character */
584     int len = 0;
585         
586     switch ((ch1 = utf[0]) >> 4) {
587         default: /* 1 byte */
588                 (*utf_ptr)++;
589                 return (u2) ch1;
590         case 0xC: 
591         case 0xD: /* 2 bytes */
592                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
593                         unsigned char high = ch1 & 0x1F;
594                         unsigned char low  = ch2 & 0x3F;
595                         unicode_char = (high << 6) + low;
596                         len = 2;
597                 }
598                 break;
599
600         case 0xE: /* 2 or 3 bytes */
601                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
602                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
603                                 unsigned char low  = ch3 & 0x3f;
604                                 unsigned char mid  = ch2 & 0x3f;
605                                 unsigned char high = ch1 & 0x0f;
606                                 unicode_char = (((high << 6) + mid) << 6) + low;
607                                 len = 3;
608                         } else
609                                 len = 2;                                           
610                 }
611                 break;
612     }
613
614     /* update position in utf-text */
615     *utf_ptr = (char *) (utf + len);
616
617     return unicode_char;
618 }
619
620
621 /* utf_strlen ******************************************************************
622
623    Determine number of unicode characters in the utf string.
624
625 *******************************************************************************/
626
627 u4 utf_strlen(utf *u)
628 {
629         char *endpos;                       /* points behind utf string           */
630         char *utf_ptr;                      /* current position in utf text       */
631         u4 len = 0;                         /* number of unicode characters       */
632
633         if (!u) {
634                 *exceptionptr = new_nullpointerexception();
635                 return 0;
636         }
637
638         endpos = utf_end(u);
639         utf_ptr = u->text;
640
641         while (utf_ptr < endpos) {
642                 len++;
643                 /* next unicode character */
644                 utf_nextu2(&utf_ptr);
645         }
646
647         if (utf_ptr != endpos)
648                 /* string ended abruptly */
649                 throw_cacao_exception_exit(string_java_lang_InternalError,
650                                                                    "Illegal utf8 string");
651
652         return len;
653 }
654
655
656 /* u2_utflength ****************************************************************
657
658    Returns the utf length in bytes of a u2 array.
659
660 *******************************************************************************/
661
662 u4 u2_utflength(u2 *text, u4 u2_length)
663 {
664         u4 result_len = 0;                  /* utf length in bytes                */
665         u2 ch;                              /* current unicode character          */
666         u4 len;
667         
668         for (len = 0; len < u2_length; len++) {
669                 /* next unicode character */
670                 ch = *text++;
671           
672                 /* determine bytes required to store unicode character as utf */
673                 if (ch && (ch < 0x80)) 
674                         result_len++;
675                 else if (ch < 0x800)
676                         result_len += 2;        
677                 else 
678                         result_len += 3;        
679         }
680
681     return result_len;
682 }
683
684
685 /* utf_display *****************************************************************
686
687    Write utf symbol to stdout (for debugging purposes).
688
689 *******************************************************************************/
690
691 void utf_display(utf *u)
692 {
693         char *endpos;                       /* points behind utf string           */
694         char *utf_ptr;                      /* current position in utf text       */
695
696         if (!u) {
697                 printf("NULL");
698                 fflush(stdout);
699                 return;
700         }
701
702         endpos = utf_end(u);
703         utf_ptr = u->text;
704
705         while (utf_ptr < endpos) {
706                 /* read next unicode character */                
707                 u2 c = utf_nextu2(&utf_ptr);
708                 if (c >= 32 && c <= 127) printf("%c", c);
709                 else printf("?");
710         }
711
712         fflush(stdout);
713 }
714
715
716 /* utf_display_classname *******************************************************
717
718    Write utf symbol to stdout with `/' converted to `.' (for debugging
719    purposes).
720
721 *******************************************************************************/
722
723 void utf_display_classname(utf *u)
724 {
725         char *endpos;                       /* points behind utf string           */
726         char *utf_ptr;                      /* current position in utf text       */
727
728         if (!u) {
729                 printf("NULL");
730                 fflush(stdout);
731                 return;
732         }
733
734         endpos = utf_end(u);
735         utf_ptr = u->text;
736
737         while (utf_ptr < endpos) {
738                 /* read next unicode character */                
739                 u2 c = utf_nextu2(&utf_ptr);
740                 if (c == '/') c = '.';
741                 if (c >= 32 && c <= 127) printf("%c", c);
742                 else printf("?");
743         }
744
745         fflush(stdout);
746 }
747
748
749 /* utf_sprint ******************************************************************
750         
751    Write utf symbol into c-string (for debugging purposes).
752
753 *******************************************************************************/
754
755 void utf_sprint(char *buffer, utf *u)
756 {
757         char *endpos;                       /* points behind utf string           */
758         char *utf_ptr;                      /* current position in utf text       */
759         u2 pos = 0;                         /* position in c-string               */
760
761         if (!u) {
762                 strcpy(buffer, "NULL");
763                 return;
764         }
765
766         endpos = utf_end(u);
767         utf_ptr = u->text;
768
769         while (utf_ptr < endpos) 
770                 /* copy next unicode character */       
771                 buffer[pos++] = utf_nextu2(&utf_ptr);
772
773         /* terminate string */
774         buffer[pos] = '\0';
775 }
776
777
778 /* utf_sprint_classname ********************************************************
779         
780    Write utf symbol into c-string with `/' converted to `.' (for debugging
781    purposes).
782
783 *******************************************************************************/
784
785 void utf_sprint_classname(char *buffer, utf *u)
786 {
787         char *endpos;                       /* points behind utf string           */
788         char *utf_ptr;                      /* current position in utf text       */
789         u2 pos = 0;                         /* position in c-string               */
790
791         if (!u) {
792                 strcpy(buffer, "NULL");
793                 return;
794         }
795
796         endpos = utf_end(u);
797         utf_ptr = u->text;
798
799         while (utf_ptr < endpos) {
800                 /* copy next unicode character */       
801                 u2 c = utf_nextu2(&utf_ptr);
802                 if (c == '/') c = '.';
803                 buffer[pos++] = c;
804         }
805
806         /* terminate string */
807         buffer[pos] = '\0';
808 }
809
810
811 /* utf_strcat ******************************************************************
812         
813    Like libc strcat, but uses an utf8 string.
814
815 *******************************************************************************/
816
817 void utf_strcat(char *buffer, utf *u)
818 {
819         utf_sprint(buffer + strlen(buffer), u);
820 }
821
822
823 /* utf_strcat_classname ********************************************************
824         
825    Like libc strcat, but uses an utf8 string.
826
827 *******************************************************************************/
828
829 void utf_strcat_classname(char *buffer, utf *u)
830 {
831         utf_sprint_classname(buffer + strlen(buffer), u);
832 }
833
834
835 /* utf_fprint ******************************************************************
836         
837    Write utf symbol into file.
838
839 *******************************************************************************/
840
841 void utf_fprint(FILE *file, utf *u)
842 {
843         char *endpos;                       /* points behind utf string           */
844         char *utf_ptr;                      /* current position in utf text       */
845
846         if (!u)
847                 return;
848
849         endpos = utf_end(u);
850         utf_ptr = u->text;
851
852         while (utf_ptr < endpos) { 
853                 /* read next unicode character */                
854                 u2 c = utf_nextu2(&utf_ptr);                            
855
856                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
857                 else fprintf(file, "?");
858         }
859 }
860
861
862 /* utf_fprint_classname ********************************************************
863         
864    Write utf symbol into file with `/' converted to `.'.
865
866 *******************************************************************************/
867
868 void utf_fprint_classname(FILE *file, utf *u)
869 {
870         char *endpos;                       /* points behind utf string           */
871         char *utf_ptr;                      /* current position in utf text       */
872
873     if (!u)
874                 return;
875
876         endpos = utf_end(u);
877         utf_ptr = u->text;
878
879         while (utf_ptr < endpos) { 
880                 /* read next unicode character */                
881                 u2 c = utf_nextu2(&utf_ptr);                            
882                 if (c == '/') c = '.';
883
884                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
885                 else fprintf(file, "?");
886         }
887 }
888
889
890 /* is_valid_utf ****************************************************************
891
892    Return true if the given string is a valid UTF-8 string.
893
894    utf_ptr...points to first character
895    end_pos...points after last character
896
897 *******************************************************************************/
898
899 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
900
901 bool is_valid_utf(char *utf_ptr, char *end_pos)
902 {
903         int bytes;
904         int len,i;
905         char c;
906         unsigned long v;
907
908         if (end_pos < utf_ptr) return false;
909         bytes = end_pos - utf_ptr;
910         while (bytes--) {
911                 c = *utf_ptr++;
912
913                 if (!c) return false;                     /* 0x00 is not allowed */
914                 if ((c & 0x80) == 0) continue;            /* ASCII */
915
916                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
917                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
918                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
919                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
920                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
921                 else return false;                        /* invalid leading byte */
922
923                 if (len > 2) return false;                /* Java limitation */
924
925                 v = (unsigned long)c & (0x3f >> len);
926                 
927                 if ((bytes -= len) < 0) return false;     /* missing bytes */
928
929                 for (i = len; i--; ) {
930                         c = *utf_ptr++;
931                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
932                                 return false;
933                         v = (v << 6) | (c & 0x3f);
934                 }
935
936                 if (v == 0) {
937                         if (len != 1) return false;           /* Java special */
938
939                 } else {
940                         /* Sun Java seems to allow overlong UTF-8 encodings */
941                         
942                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
943                                 if (!opt_liberalutf)
944                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
945                                 /* XXX change this to panic? */
946                         }
947                 }
948
949                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
950                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
951
952                 /* even these seem to be allowed */
953                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
954         }
955
956         return true;
957 }
958
959
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the two-byte encoding of zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Only bytes inside [utf_ptr, end_pos) are inspected. If 0xc0 is the
   last byte of the range (which cannot happen for a string that passed
   is_valid_utf), the byte at end_pos is not read.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters */

		/* disallow zero; the bounds check keeps the lookahead inside the */
		/* range when 0xc0 is the very last byte                          */
		if (c == 0xc0 && utf_ptr < end_pos && (unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
987
988 bool is_valid_name_utf(utf *u)
989 {
990         return is_valid_name(u->text,utf_end(u));
991 }
992
993
994 /* utf_show ********************************************************************
995
996    Writes the utf symbols in the utfhash to stdout and displays the
997    number of external hash chains grouped according to the chainlength
998    (for debugging purposes).
999
1000 *******************************************************************************/
1001
1002 void utf_show(void)
1003 {
1004
1005 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1006
1007         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1008         u4 max_chainlength = 0;      /* maximum length of the chains */
1009         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1010         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1011         u4 i;
1012
1013         printf ("UTF-HASH:\n");
1014
1015         /* show element of utf-hashtable */
1016         for (i=0; i<utf_hash.size; i++) {
1017                 utf *u = utf_hash.ptr[i];
1018                 if (u) {
1019                         printf ("SLOT %d: ", (int) i);
1020                         while (u) {
1021                                 printf ("'");
1022                                 utf_display (u);
1023                                 printf ("' ");
1024                                 u = u->hashlink;
1025                         }       
1026                         printf ("\n");
1027                 }
1028                 
1029         }
1030
1031         printf ("UTF-HASH: %d slots for %d entries\n", 
1032                         (int) utf_hash.size, (int) utf_hash.entries );
1033
1034
1035         if (utf_hash.entries == 0)
1036                 return;
1037
1038         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1039
1040         for (i=0;i<CHAIN_LIMIT;i++)
1041                 chain_count[i]=0;
1042
1043         /* count numbers of hashchains according to their length */
1044         for (i=0; i<utf_hash.size; i++) {
1045                   
1046                 utf *u = (utf*) utf_hash.ptr[i];
1047                 u4 chain_length = 0;
1048
1049                 /* determine chainlength */
1050                 while (u) {
1051                         u = u->hashlink;
1052                         chain_length++;
1053                 }
1054
1055                 /* update sum of all chainlengths */
1056                 sum_chainlength+=chain_length;
1057
1058                 /* determine the maximum length of the chains */
1059                 if (chain_length>max_chainlength)
1060                         max_chainlength = chain_length;
1061
1062                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1063                 if (chain_length>=CHAIN_LIMIT) {
1064                         beyond_limit+=chain_length;
1065                         chain_length=CHAIN_LIMIT-1;
1066                 }
1067
1068                 /* update number of hashchains of current length */
1069                 chain_count[chain_length]++;
1070         }
1071
1072         /* display results */  
1073         for (i=1;i<CHAIN_LIMIT-1;i++) 
1074                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1075           
1076         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1077
1078
1079         printf("max. chainlength:%5d\n",max_chainlength);
1080
1081         /* avg. chainlength = sum of chainlengths / number of chains */
1082         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1083 }
1084
1085
1086 /*
1087  * These are local overrides for various environment variables in Emacs.
1088  * Please do not remove this and leave it at the end of the file, where
1089  * Emacs will automagically detect them.
1090  * ---------------------------------------------------------------------
1091  * Local variables:
1092  * mode: c
1093  * indent-tabs-mode: t
1094  * c-basic-offset: 4
1095  * tab-width: 4
1096  * End:
1097  */