Added: utf_java_lang_ClassNotFoundException
[cacao.git] / src / vm / utf8.c
/* src/vm/utf8.c - utf functions
2
3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
6    Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23    02111-1307, USA.
24
25    Contact: cacao@complang.tuwien.ac.at
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 2277 2005-04-12 19:48:03Z twisti $
34
35 */
36
37
38 #include <string.h>
39
40 #include "mm/memory.h"
41 #include "vm/exceptions.h"
42 #include "vm/options.h"
43 #include "vm/statistics.h"
44 #include "vm/tables.h"
45 #include "vm/utf8.h"
46
47
hashtable utf_hash;                     /* hashtable for utf8-symbols         */


/* utf-symbols for pointer comparison of frequently used strings **************/
/* Every symbol below is assigned in utf8_init(); the comment next to it      */
/* gives the text (or the string constant) it is interned from.               */

utf *utf_java_lang_Object;              /* java/lang/Object                   */

utf *utf_java_lang_Class;               /* java/lang/Class                    */
utf *utf_java_lang_ClassLoader;         /* java/lang/ClassLoader              */
utf *utf_java_lang_Cloneable;           /* java/lang/Cloneable                */
utf *utf_java_lang_SecurityManager;     /* java/lang/SecurityManager          */
utf *utf_java_lang_String;              /* java/lang/String                   */
utf *utf_java_lang_System;              /* java/lang/System                   */
utf *utf_java_io_Serializable;          /* java/io/Serializable               */

utf *utf_java_lang_Throwable;           /* string_java_lang_Throwable         */
utf *utf_java_lang_VMThrowable;         /* string_java_lang_VMThrowable       */
utf *utf_java_lang_Error;               /* string_java_lang_Error             */
utf *utf_java_lang_Exception;           /* string_java_lang_Exception         */
utf *utf_java_lang_NoClassDefFoundError;   /* string_java_lang_NoClassDefFoundError   */
utf *utf_java_lang_OutOfMemoryError;       /* string_java_lang_OutOfMemoryError       */
utf *utf_java_lang_ClassNotFoundException; /* string_java_lang_ClassNotFoundException */

utf* utf_java_lang_Void;                /* java/lang/Void                     */
utf* utf_java_lang_Boolean;             /* java/lang/Boolean                  */
utf* utf_java_lang_Byte;                /* java/lang/Byte                     */
utf* utf_java_lang_Character;           /* java/lang/Character                */
utf* utf_java_lang_Short;               /* java/lang/Short                    */
utf* utf_java_lang_Integer;             /* java/lang/Integer                  */
utf* utf_java_lang_Long;                /* java/lang/Long                     */
utf* utf_java_lang_Float;               /* java/lang/Float                    */
utf* utf_java_lang_Double;              /* java/lang/Double                   */

utf *utf_java_util_Vector;              /* java/util/Vector                   */

utf *utf_InnerClasses;                  /* InnerClasses                       */
utf *utf_ConstantValue;                 /* ConstantValue                      */
utf *utf_Code;                          /* Code                               */
utf *utf_Exceptions;                    /* Exceptions                         */
utf *utf_LineNumberTable;               /* LineNumberTable                    */
utf *utf_SourceFile;                    /* SourceFile                         */

utf *utf_init;                          /* <init>                             */
utf *utf_clinit;                        /* <clinit>                           */
utf *utf_finalize;                      /* finalize                           */

utf *utf_printStackTrace;               /* printStackTrace                    */
utf *utf_fillInStackTrace;              /* fillInStackTrace                   */
utf *utf_loadClass;                     /* loadClass                          */

utf *utf_void__void;                    /* ()V                                */
utf *utf_boolean__void;                 /* (Z)V                               */
utf *utf_byte__void;                    /* (B)V                               */
utf *utf_char__void;                    /* (C)V                               */
utf *utf_short__void;                   /* (S)V                               */
utf *utf_int__void;                     /* (I)V                               */
utf *utf_long__void;                    /* (J)V                               */
utf *utf_float__void;                   /* (F)V                               */
utf *utf_double__void;                  /* (D)V                               */
utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
/* (Ljava/lang/String;)Ljava/lang/Class;                                      */
utf *utf_java_lang_String__java_lang_Class;
utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */

/* a TAB character followed by "<the array package>"                          */
utf *array_packagename;
114
115
/* utf8_init *******************************************************************

   Initializes the utf8 subsystem: every frequently used string is
   interned once through utf_new_char() and the resulting symbol is
   stored in the matching global, so that these names can later be
   compared by pointer.

   NOTE(review): utf_hash itself is not set up here; utf_new_char()
   ends up indexing utf_hash.ptr, so the hashtable presumably has to
   be initialized before this function is called -- confirm with the
   caller.

*******************************************************************************/

void utf8_init(void)
{
        /* create utf-symbols for pointer comparison of frequently used strings */

        utf_java_lang_Object           = utf_new_char("java/lang/Object");

        utf_java_lang_Class            = utf_new_char("java/lang/Class");
        utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
        utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
        utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
        utf_java_lang_String           = utf_new_char("java/lang/String");
        utf_java_lang_System           = utf_new_char("java/lang/System");
        utf_java_io_Serializable       = utf_new_char("java/io/Serializable");

        /* throwable names: the texts come from the string_* constants */

        utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
        utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
        utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
        utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);

        utf_java_lang_NoClassDefFoundError =
                utf_new_char(string_java_lang_NoClassDefFoundError);

        utf_java_lang_OutOfMemoryError =
                utf_new_char(string_java_lang_OutOfMemoryError);

        utf_java_lang_ClassNotFoundException =
                utf_new_char(string_java_lang_ClassNotFoundException);

        /* java/lang wrapper class names */

        utf_java_lang_Void             = utf_new_char("java/lang/Void");
        utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
        utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
        utf_java_lang_Character        = utf_new_char("java/lang/Character");
        utf_java_lang_Short            = utf_new_char("java/lang/Short");
        utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
        utf_java_lang_Long             = utf_new_char("java/lang/Long");
        utf_java_lang_Float            = utf_new_char("java/lang/Float");
        utf_java_lang_Double           = utf_new_char("java/lang/Double");

        utf_java_util_Vector           = utf_new_char("java/util/Vector");

        /* attribute-style names */

        utf_InnerClasses               = utf_new_char("InnerClasses");
        utf_ConstantValue              = utf_new_char("ConstantValue");
        utf_Code                       = utf_new_char("Code");
        utf_Exceptions                 = utf_new_char("Exceptions");
        utf_LineNumberTable            = utf_new_char("LineNumberTable");
        utf_SourceFile                 = utf_new_char("SourceFile");

        /* method names */

        utf_init                           = utf_new_char("<init>");
        utf_clinit                         = utf_new_char("<clinit>");
        utf_finalize                   = utf_new_char("finalize");

        utf_printStackTrace            = utf_new_char("printStackTrace");
        utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
        utf_loadClass                  = utf_new_char("loadClass");

        /* descriptors, named <parameters>__<return type> */

        utf_void__void                 = utf_new_char("()V");
        utf_boolean__void              = utf_new_char("(Z)V");
        utf_byte__void                 = utf_new_char("(B)V");
        utf_char__void                 = utf_new_char("(C)V");
        utf_short__void                = utf_new_char("(S)V");
        utf_int__void                  = utf_new_char("(I)V");
        utf_long__void                 = utf_new_char("(J)V");
        utf_float__void                = utf_new_char("(F)V");
        utf_double__void               = utf_new_char("(D)V");
        utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
        utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
        utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");

        utf_java_lang_String__java_lang_Class =
                utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");

        utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");

        array_packagename              = utf_new_char("\t<the array package>");
}
197
198
/* utf_hashkey *****************************************************************

   Computes the hashkey of an utf text by XOR-ing a small, length
   dependent selection of its bytes, each one shifted left by a fixed
   amount.

   text.....the bytes to hash (only `length' bytes are ever read)
   length...number of bytes in text

   Texts of up to 8 bytes use every byte, texts of 9 to 15 bytes use a
   hand-picked subset (at most 9 bytes), and longer texts use 3 bytes
   from the beginning, 2 from the middle and 3 from the end.  The
   empty text hashes to 0.

   Bytes are read as plain `char' and then converted to u4, so bytes
   above 0x7f contribute according to the platform's char signedness.

   Every byte is addressed by its index.  The former implementation
   advanced `text' several times within one expression (for example
   fbs(0) ^ nbs(3) ^ nbs(5)); such modifications are unsequenced, so
   the result was undefined.  The indices and shifts used below are
   exactly those a left-to-right evaluation of the old expressions
   yields, so the keys do not change.

*******************************************************************************/

/* Cursor based accessors, kept for any code that still relies on them.       */
/* nbs() modifies `text': never use it more than once, or together with       */
/* fbs(), inside a single expression.                                         */

#define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
#define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */

/* byte at index `idx' of text, shifted left by `shift' (no side effects) */
#define UTF_HASH_BYTE(idx, shift) ((u4) text[(idx)] << (shift))

u4 utf_hashkey(const char *text, u4 length)
{
        u4 mid;                             /* index of the middle sample     */

        switch (length) {
        case 0: /* empty string */
                return 0;

        case 1:
                return UTF_HASH_BYTE(0, 0);

        case 2:
                return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 3);

        case 3:
                return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 3)
                        ^ UTF_HASH_BYTE(2, 5);

        case 4:
                return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 2)
                        ^ UTF_HASH_BYTE(2, 4) ^ UTF_HASH_BYTE(3, 6);

        case 5:
                return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 2)
                        ^ UTF_HASH_BYTE(2, 3) ^ UTF_HASH_BYTE(3, 4)
                        ^ UTF_HASH_BYTE(4, 6);

        case 6:
                return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 1)
                        ^ UTF_HASH_BYTE(2, 2) ^ UTF_HASH_BYTE(3, 3)
                        ^ UTF_HASH_BYTE(4, 5) ^ UTF_HASH_BYTE(5, 6);

        case 7:
                return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 1)
                        ^ UTF_HASH_BYTE(2, 2) ^ UTF_HASH_BYTE(3, 3)
                        ^ UTF_HASH_BYTE(4, 4) ^ UTF_HASH_BYTE(5, 5)
                        ^ UTF_HASH_BYTE(6, 6);

        case 8:
                return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 1)
                        ^ UTF_HASH_BYTE(2, 2) ^ UTF_HASH_BYTE(3, 3)
                        ^ UTF_HASH_BYTE(4, 4) ^ UTF_HASH_BYTE(5, 5)
                        ^ UTF_HASH_BYTE(6, 6) ^ UTF_HASH_BYTE(7, 7);

        case 9: /* byte 3 is skipped */
                return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 1)
                        ^ UTF_HASH_BYTE(2, 2) ^ UTF_HASH_BYTE(4, 4)
                        ^ UTF_HASH_BYTE(5, 5) ^ UTF_HASH_BYTE(6, 6)
                        ^ UTF_HASH_BYTE(7, 7) ^ UTF_HASH_BYTE(8, 8);

        case 10: /* bytes 1 and 5 are skipped */
                return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(2, 2)
                        ^ UTF_HASH_BYTE(3, 3) ^ UTF_HASH_BYTE(4, 4)
                        ^ UTF_HASH_BYTE(6, 6) ^ UTF_HASH_BYTE(7, 7)
                        ^ UTF_HASH_BYTE(8, 8) ^ UTF_HASH_BYTE(9, 9);

        case 11: /* bytes 1 and 5 are skipped */
                return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(2, 2)
                        ^ UTF_HASH_BYTE(3, 3) ^ UTF_HASH_BYTE(4, 4)
                        ^ UTF_HASH_BYTE(6, 6) ^ UTF_HASH_BYTE(7, 7)
                        ^ UTF_HASH_BYTE(8, 8) ^ UTF_HASH_BYTE(9, 9)
                        ^ UTF_HASH_BYTE(10, 10);

        case 12: /* note: here the shift is the index minus one */
                return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(3, 2)
                        ^ UTF_HASH_BYTE(4, 3) ^ UTF_HASH_BYTE(6, 5)
                        ^ UTF_HASH_BYTE(7, 6) ^ UTF_HASH_BYTE(8, 7)
                        ^ UTF_HASH_BYTE(10, 9) ^ UTF_HASH_BYTE(11, 10);

        case 13:
                return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 1)
                        ^ UTF_HASH_BYTE(3, 3) ^ UTF_HASH_BYTE(4, 4)
                        ^ UTF_HASH_BYTE(7, 7) ^ UTF_HASH_BYTE(8, 8)
                        ^ UTF_HASH_BYTE(11, 9) ^ UTF_HASH_BYTE(12, 10);

        case 14: /* same selection as for 15 bytes */
        case 15:
                return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(3, 3)
                        ^ UTF_HASH_BYTE(4, 4) ^ UTF_HASH_BYTE(7, 7)
                        ^ UTF_HASH_BYTE(8, 8) ^ UTF_HASH_BYTE(11, 9)
                        ^ UTF_HASH_BYTE(12, 10) ^ UTF_HASH_BYTE(13, 11);

        default:
                mid = length / 2;

                       /* 3 bytes from the beginning */
                return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(3, 3)
                        ^ UTF_HASH_BYTE(4, 4)
                       /* 2 bytes from the middle */
                        ^ UTF_HASH_BYTE(mid, 5) ^ UTF_HASH_BYTE(mid + 3, 6)
                       /* 3 bytes from the end */
                        ^ UTF_HASH_BYTE(length - 4, 7)
                        ^ UTF_HASH_BYTE(length - 2, 10)
                        ^ UTF_HASH_BYTE(length - 1, 11);
        }
}

#undef UTF_HASH_BYTE
321
322
323 /* utf_hashkey *****************************************************************
324
325    Compute the hashkey of a unicode string.
326
327 *******************************************************************************/
328
329 u4 unicode_hashkey(u2 *text, u2 len)
330 {
331         return utf_hashkey((char *) text, len);
332 }
333
334
335 /* utf_new *********************************************************************
336
337    Creates a new utf-symbol, the text of the symbol is passed as a
338    u1-array. The function searches the utf-hashtable for a utf-symbol
339    with this text. On success the element returned, otherwise a new
340    hashtable element is created.
341
342    If the number of entries in the hashtable exceeds twice the size of
343    the hashtable slots a reorganization of the hashtable is done and
344    the utf symbols are copied to a new hashtable with doubled size.
345
346 *******************************************************************************/
347
348 utf *utf_new_intern(const char *text, u2 length);
349
350 utf *utf_new(const char *text, u2 length)
351 {
352     utf *r;
353
354 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
355     tables_lock();
356 #endif
357
358     r = utf_new_intern(text, length);
359
360 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
361     tables_unlock();
362 #endif
363
364     return r;
365 }
366
367
368 utf *utf_new_intern(const char *text, u2 length)
369 {
370         u4 key;                             /* hashkey computed from utf-text     */
371         u4 slot;                            /* slot in hashtable                  */
372         utf *u;                             /* hashtable element                  */
373         u2 i;
374
375 #ifdef STATISTICS
376         if (opt_stat)
377                 count_utf_new++;
378 #endif
379
380         key  = utf_hashkey(text, length);
381         slot = key & (utf_hash.size - 1);
382         u    = utf_hash.ptr[slot];
383
384         /* search external hash chain for utf-symbol */
385         while (u) {
386                 if (u->blength == length) {
387
388                         /* compare text of hashtable elements */
389                         for (i = 0; i < length; i++)
390                                 if (text[i] != u->text[i]) goto nomatch;
391                         
392 #ifdef STATISTICS
393                         if (opt_stat)
394                                 count_utf_new_found++;
395 #endif
396
397                         /* symbol found in hashtable */
398                         return u;
399                 }
400         nomatch:
401                 u = u->hashlink; /* next element in external chain */
402         }
403
404 #ifdef STATISTICS
405         if (opt_stat)
406                 count_utf_len += sizeof(utf) + length;
407 #endif
408
409         /* location in hashtable found, create new utf element */
410         u = NEW(utf);
411         u->blength  = length;               /* length in bytes of utfstring       */
412         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
413         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
414         memcpy(u->text, text, length);      /* copy utf-text                      */
415         u->text[length] = '\0';
416         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
417
418         utf_hash.entries++;                 /* update number of entries           */
419
420         if (utf_hash.entries > (utf_hash.size * 2)) {
421
422         /* reorganization of hashtable, average length of 
423            the external chains is approx. 2                */  
424
425                 u4 i;
426                 utf *u;
427                 hashtable newhash; /* the new hashtable */
428
429                 /* create new hashtable, double the size */
430                 init_hashtable(&newhash, utf_hash.size * 2);
431                 newhash.entries = utf_hash.entries;
432
433 #ifdef STATISTICS
434                 if (opt_stat)
435                         count_utf_len += sizeof(utf*) * utf_hash.size;
436 #endif
437
438                 /* transfer elements to new hashtable */
439                 for (i = 0; i < utf_hash.size; i++) {
440                         u = (utf *) utf_hash.ptr[i];
441                         while (u) {
442                                 utf *nextu = u->hashlink;
443                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
444                                                 
445                                 u->hashlink = (utf *) newhash.ptr[slot];
446                                 newhash.ptr[slot] = u;
447
448                                 /* follow link in external hash chain */
449                                 u = nextu;
450                         }
451                 }
452         
453                 /* dispose old table */
454                 MFREE(utf_hash.ptr, void*, utf_hash.size);
455                 utf_hash = newhash;
456         }
457
458         return u;
459 }
460
461
462 /* utf_new_u2 ******************************************************************
463
464    Make utf symbol from u2 array, if isclassname is true '.' is
465    replaced by '/'.
466
467 *******************************************************************************/
468
469 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
470 {
471         char *buffer;                   /* memory buffer for  unicode characters  */
472         char *pos;                      /* pointer to current position in buffer  */
473         u4 left;                        /* unicode characters left                */
474         u4 buflength;                   /* utf length in bytes of the u2 array    */
475         utf *result;                    /* resulting utf-string                   */
476         int i;          
477
478         /* determine utf length in bytes and allocate memory */
479
480         buflength = u2_utflength(unicode_pos, unicode_length); 
481         buffer    = MNEW(char, buflength);
482  
483         left = buflength;
484         pos  = buffer;
485
486         for (i = 0; i++ < unicode_length; unicode_pos++) {
487                 /* next unicode character */
488                 u2 c = *unicode_pos;
489                 
490                 if ((c != 0) && (c < 0x80)) {
491                         /* 1 character */       
492                         left--;
493                 if ((int) left < 0) break;
494                         /* convert classname */
495                         if (isclassname && c == '.')
496                                 *pos++ = '/';
497                         else
498                                 *pos++ = (char) c;
499
500                 } else if (c < 0x800) {             
501                         /* 2 characters */                              
502                 unsigned char high = c >> 6;
503                 unsigned char low  = c & 0x3F;
504                         left = left - 2;
505                 if ((int) left < 0) break;
506                 *pos++ = high | 0xC0; 
507                 *pos++ = low  | 0x80;     
508
509                 } else {         
510                 /* 3 characters */                              
511                 char low  = c & 0x3f;
512                 char mid  = (c >> 6) & 0x3F;
513                 char high = c >> 12;
514                         left = left - 3;
515                 if ((int) left < 0) break;
516                 *pos++ = high | 0xE0; 
517                 *pos++ = mid  | 0x80;  
518                 *pos++ = low  | 0x80;   
519                 }
520         }
521         
522         /* insert utf-string into symbol-table */
523         result = utf_new(buffer,buflength);
524
525         MFREE(buffer, char, buflength);
526
527         return result;
528 }
529
530
531 /* utf_new_char ****************************************************************
532
533    Creates a new utf symbol, the text for this symbol is passed as a
534    c-string ( = char* ).
535
536 *******************************************************************************/
537
538 utf *utf_new_char(const char *text)
539 {
540         return utf_new(text, strlen(text));
541 }
542
543
544 /* utf_new_char_classname ******************************************************
545
546    Creates a new utf symbol, the text for this symbol is passed as a
547    c-string ( = char* ) "." characters are going to be replaced by
548    "/". Since the above function is used often, this is a separte
549    function, instead of an if.
550
551 *******************************************************************************/
552
553 utf *utf_new_char_classname(const char *text)
554 {
555         if (strchr(text, '.')) {
556                 char *txt = strdup(text);
557                 char *end = txt + strlen(txt);
558                 char *c;
559                 utf *tmpRes;
560
561                 for (c = txt; c < end; c++)
562                         if (*c == '.') *c = '/';
563
564                 tmpRes = utf_new(txt, strlen(txt));
565                 FREE(txt, 0);
566
567                 return tmpRes;
568
569         } else
570                 return utf_new(text, strlen(text));
571 }
572
573
/* utf_nextu2 ******************************************************************

   Read the next unicode character from the utf string and increment
   the utf-string pointer accordingly.

   utf_ptr...address of the current position; it is moved behind the
             bytes that were consumed

   Returns the decoded character.  A lead byte 0xC0..0xDF starts a
   2-byte sequence, 0xE0..0xEF a 3-byte sequence; every other byte is
   returned as it is.  For a malformed sequence (a continuation byte
   of the form 10xxxxxx is missing) 0 is returned.

   The position always advances by at least one byte.  Previously a
   lead byte without continuation byte left the position unchanged, so
   loops of the form `while (utf_ptr < endpos)' never terminated on
   such input.

   Up to two bytes behind the lead byte may be inspected; the second
   one only if the first is a continuation byte.

*******************************************************************************/

u2 utf_nextu2(char **utf_ptr)
{
    /* current position in utf text */
    unsigned char *utf = (unsigned char *) (*utf_ptr);
    /* bytes representing the unicode character */
    unsigned char ch1, ch2, ch3;
    /* uncompressed unicode character, 0 for malformed sequences */
    u2 unicode_char = 0;
    /* number of bytes consumed; a malformed sequence consumes its lead byte */
    int len = 1;

    ch1 = utf[0];

    switch (ch1 >> 4) {
    default: /* 1 byte */
        unicode_char = (u2) ch1;
        break;

    case 0xC:
    case 0xD: /* 2 bytes */
        ch2 = utf[1];

        if ((ch2 & 0xC0) == 0x80) {
            unicode_char = ((ch1 & 0x1F) << 6) + (ch2 & 0x3F);
            len = 2;
        }
        break;

    case 0xE: /* 2 or 3 bytes */
        ch2 = utf[1];

        if ((ch2 & 0xC0) == 0x80) {
            ch3 = utf[2];

            if ((ch3 & 0xC0) == 0x80) {
                unicode_char =
                    ((((ch1 & 0x0F) << 6) + (ch2 & 0x3F)) << 6) + (ch3 & 0x3F);
                len = 3;

            } else {
                /* third byte is missing: skip the two bytes seen so far */
                len = 2;
            }
        }
        break;
    }

    /* update position in utf-text */
    *utf_ptr = (char *) (utf + len);

    return unicode_char;
}
625
626
627 /* utf_strlen ******************************************************************
628
629    Determine number of unicode characters in the utf string.
630
631 *******************************************************************************/
632
633 u4 utf_strlen(utf *u)
634 {
635         char *endpos;                       /* points behind utf string           */
636         char *utf_ptr;                      /* current position in utf text       */
637         u4 len = 0;                         /* number of unicode characters       */
638
639         if (!u) {
640                 *exceptionptr = new_nullpointerexception();
641                 return 0;
642         }
643
644         endpos = utf_end(u);
645         utf_ptr = u->text;
646
647         while (utf_ptr < endpos) {
648                 len++;
649                 /* next unicode character */
650                 utf_nextu2(&utf_ptr);
651         }
652
653         if (utf_ptr != endpos)
654                 /* string ended abruptly */
655                 throw_cacao_exception_exit(string_java_lang_InternalError,
656                                                                    "Illegal utf8 string");
657
658         return len;
659 }
660
661
/* u2_utflength ****************************************************************

   Returns the utf length in bytes of a u2 array.

   text........first character of the array
   u2_length...number of characters in the array

   A character needs 1 byte if it lies in 0x0001..0x007f, 2 bytes if
   it is 0x0000 or lies in 0x0080..0x07ff, and 3 bytes otherwise.

*******************************************************************************/

u4 u2_utflength(u2 *text, u4 u2_length)
{
        u4 nbytes = 0;                      /* utf length in bytes            */
        u4 remaining;                       /* characters still to look at    */

        for (remaining = u2_length; remaining > 0; remaining--) {
                u2 ch = *text++;

                if (ch >= 0x800)
                        nbytes += 3;
                else if ((ch == 0) || (ch >= 0x80))
                        nbytes += 2;        /* zero gets the 2-byte form      */
                else
                        nbytes += 1;
        }

        return nbytes;
}
689
690
691 /* utf_display *****************************************************************
692
693    Write utf symbol to stdout (for debugging purposes).
694
695 *******************************************************************************/
696
697 void utf_display(utf *u)
698 {
699         char *endpos;                       /* points behind utf string           */
700         char *utf_ptr;                      /* current position in utf text       */
701
702         if (!u) {
703                 printf("NULL");
704                 fflush(stdout);
705                 return;
706         }
707
708         endpos = utf_end(u);
709         utf_ptr = u->text;
710
711         while (utf_ptr < endpos) {
712                 /* read next unicode character */                
713                 u2 c = utf_nextu2(&utf_ptr);
714                 if (c >= 32 && c <= 127) printf("%c", c);
715                 else printf("?");
716         }
717
718         fflush(stdout);
719 }
720
721
722 /* utf_display_classname *******************************************************
723
724    Write utf symbol to stdout with `/' converted to `.' (for debugging
725    purposes).
726
727 *******************************************************************************/
728
729 void utf_display_classname(utf *u)
730 {
731         char *endpos;                       /* points behind utf string           */
732         char *utf_ptr;                      /* current position in utf text       */
733
734         if (!u) {
735                 printf("NULL");
736                 fflush(stdout);
737                 return;
738         }
739
740         endpos = utf_end(u);
741         utf_ptr = u->text;
742
743         while (utf_ptr < endpos) {
744                 /* read next unicode character */                
745                 u2 c = utf_nextu2(&utf_ptr);
746                 if (c == '/') c = '.';
747                 if (c >= 32 && c <= 127) printf("%c", c);
748                 else printf("?");
749         }
750
751         fflush(stdout);
752 }
753
754
755 /* utf_sprint ******************************************************************
756         
757    Write utf symbol into c-string (for debugging purposes).
758
759 *******************************************************************************/
760
761 void utf_sprint(char *buffer, utf *u)
762 {
763         char *endpos;                       /* points behind utf string           */
764         char *utf_ptr;                      /* current position in utf text       */
765         u2 pos = 0;                         /* position in c-string               */
766
767         if (!u) {
768                 strcpy(buffer, "NULL");
769                 return;
770         }
771
772         endpos = utf_end(u);
773         utf_ptr = u->text;
774
775         while (utf_ptr < endpos) 
776                 /* copy next unicode character */       
777                 buffer[pos++] = utf_nextu2(&utf_ptr);
778
779         /* terminate string */
780         buffer[pos] = '\0';
781 }
782
783
784 /* utf_sprint_classname ********************************************************
785         
786    Write utf symbol into c-string with `/' converted to `.' (for debugging
787    purposes).
788
789 *******************************************************************************/
790
791 void utf_sprint_classname(char *buffer, utf *u)
792 {
793         char *endpos;                       /* points behind utf string           */
794         char *utf_ptr;                      /* current position in utf text       */
795         u2 pos = 0;                         /* position in c-string               */
796
797         if (!u) {
798                 strcpy(buffer, "NULL");
799                 return;
800         }
801
802         endpos = utf_end(u);
803         utf_ptr = u->text;
804
805         while (utf_ptr < endpos) {
806                 /* copy next unicode character */       
807                 u2 c = utf_nextu2(&utf_ptr);
808                 if (c == '/') c = '.';
809                 buffer[pos++] = c;
810         }
811
812         /* terminate string */
813         buffer[pos] = '\0';
814 }
815
816
817 /* utf_strcat ******************************************************************
818         
819    Like libc strcat, but uses an utf8 string.
820
821 *******************************************************************************/
822
823 void utf_strcat(char *buffer, utf *u)
824 {
825         utf_sprint(buffer + strlen(buffer), u);
826 }
827
828
829 /* utf_strcat_classname ********************************************************
830         
831    Like libc strcat, but uses an utf8 string.
832
833 *******************************************************************************/
834
835 void utf_strcat_classname(char *buffer, utf *u)
836 {
837         utf_sprint_classname(buffer + strlen(buffer), u);
838 }
839
840
841 /* utf_fprint ******************************************************************
842         
843    Write utf symbol into file.
844
845 *******************************************************************************/
846
847 void utf_fprint(FILE *file, utf *u)
848 {
849         char *endpos;                       /* points behind utf string           */
850         char *utf_ptr;                      /* current position in utf text       */
851
852         if (!u)
853                 return;
854
855         endpos = utf_end(u);
856         utf_ptr = u->text;
857
858         while (utf_ptr < endpos) { 
859                 /* read next unicode character */                
860                 u2 c = utf_nextu2(&utf_ptr);                            
861
862                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
863                 else fprintf(file, "?");
864         }
865 }
866
867
868 /* utf_fprint_classname ********************************************************
869         
870    Write utf symbol into file with `/' converted to `.'.
871
872 *******************************************************************************/
873
874 void utf_fprint_classname(FILE *file, utf *u)
875 {
876         char *endpos;                       /* points behind utf string           */
877         char *utf_ptr;                      /* current position in utf text       */
878
879     if (!u)
880                 return;
881
882         endpos = utf_end(u);
883         utf_ptr = u->text;
884
885         while (utf_ptr < endpos) { 
886                 /* read next unicode character */                
887                 u2 c = utf_nextu2(&utf_ptr);                            
888                 if (c == '/') c = '.';
889
890                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
891                 else fprintf(file, "?");
892         }
893 }
894
895
896 /* is_valid_utf ****************************************************************
897
898    Return true if the given string is a valid UTF-8 string.
899
900    utf_ptr...points to first character
901    end_pos...points after last character
902
903 *******************************************************************************/
904
905 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
906
907 bool is_valid_utf(char *utf_ptr, char *end_pos)
908 {
909         int bytes;
910         int len,i;
911         char c;
912         unsigned long v;
913
914         if (end_pos < utf_ptr) return false;
915         bytes = end_pos - utf_ptr;
916         while (bytes--) {
917                 c = *utf_ptr++;
918
919                 if (!c) return false;                     /* 0x00 is not allowed */
920                 if ((c & 0x80) == 0) continue;            /* ASCII */
921
922                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
923                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
924                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
925                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
926                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
927                 else return false;                        /* invalid leading byte */
928
929                 if (len > 2) return false;                /* Java limitation */
930
931                 v = (unsigned long)c & (0x3f >> len);
932                 
933                 if ((bytes -= len) < 0) return false;     /* missing bytes */
934
935                 for (i = len; i--; ) {
936                         c = *utf_ptr++;
937                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
938                                 return false;
939                         v = (v << 6) | (c & 0x3f);
940                 }
941
942                 if (v == 0) {
943                         if (len != 1) return false;           /* Java special */
944
945                 } else {
946                         /* Sun Java seems to allow overlong UTF-8 encodings */
947                         
948                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
949                                 if (!opt_liberalutf)
950                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
951                                 /* XXX change this to panic? */
952                         }
953                 }
954
955                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
956                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
957
958                 /* even these seem to be allowed */
959                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
960         }
961
962         return true;
963 }
964
965
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters, i.e. bytes below 0x20, and the two-byte encoding of
   zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   No byte at or behind end_pos is read, even if the string ends with a
   lone 0xc0.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
        if (end_pos <= utf_ptr)
                return false;                   /* disallow empty names */

        while (utf_ptr < end_pos) {
                unsigned char c = (unsigned char) *utf_ptr++;

                if (c < 0x20)
                        return false;               /* disallow control characters */

                /* disallow zero: look at the second byte only if it still
                   belongs to the string, so that a 0xc0 in the last position
                   does not cause a read at end_pos */
                if (c == 0xc0 && utf_ptr < end_pos &&
                        (unsigned char) *utf_ptr == 0x80)
                        return false;
        }

        return true;
}
993
994 bool is_valid_name_utf(utf *u)
995 {
996         return is_valid_name(u->text,utf_end(u));
997 }
998
999
1000 /* utf_show ********************************************************************
1001
1002    Writes the utf symbols in the utfhash to stdout and displays the
1003    number of external hash chains grouped according to the chainlength
1004    (for debugging purposes).
1005
1006 *******************************************************************************/
1007
1008 void utf_show(void)
1009 {
1010
1011 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1012
1013         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1014         u4 max_chainlength = 0;      /* maximum length of the chains */
1015         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1016         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1017         u4 i;
1018
1019         printf ("UTF-HASH:\n");
1020
1021         /* show element of utf-hashtable */
1022         for (i=0; i<utf_hash.size; i++) {
1023                 utf *u = utf_hash.ptr[i];
1024                 if (u) {
1025                         printf ("SLOT %d: ", (int) i);
1026                         while (u) {
1027                                 printf ("'");
1028                                 utf_display (u);
1029                                 printf ("' ");
1030                                 u = u->hashlink;
1031                         }       
1032                         printf ("\n");
1033                 }
1034                 
1035         }
1036
1037         printf ("UTF-HASH: %d slots for %d entries\n", 
1038                         (int) utf_hash.size, (int) utf_hash.entries );
1039
1040
1041         if (utf_hash.entries == 0)
1042                 return;
1043
1044         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1045
1046         for (i=0;i<CHAIN_LIMIT;i++)
1047                 chain_count[i]=0;
1048
1049         /* count numbers of hashchains according to their length */
1050         for (i=0; i<utf_hash.size; i++) {
1051                   
1052                 utf *u = (utf*) utf_hash.ptr[i];
1053                 u4 chain_length = 0;
1054
1055                 /* determine chainlength */
1056                 while (u) {
1057                         u = u->hashlink;
1058                         chain_length++;
1059                 }
1060
1061                 /* update sum of all chainlengths */
1062                 sum_chainlength+=chain_length;
1063
1064                 /* determine the maximum length of the chains */
1065                 if (chain_length>max_chainlength)
1066                         max_chainlength = chain_length;
1067
1068                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1069                 if (chain_length>=CHAIN_LIMIT) {
1070                         beyond_limit+=chain_length;
1071                         chain_length=CHAIN_LIMIT-1;
1072                 }
1073
1074                 /* update number of hashchains of current length */
1075                 chain_count[chain_length]++;
1076         }
1077
1078         /* display results */  
1079         for (i=1;i<CHAIN_LIMIT-1;i++) 
1080                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1081           
1082         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1083
1084
1085         printf("max. chainlength:%5d\n",max_chainlength);
1086
1087         /* avg. chainlength = sum of chainlengths / number of chains */
1088         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1089 }
1090
1091
1092 /*
1093  * These are local overrides for various environment variables in Emacs.
1094  * Please do not remove this and leave it at the end of the file, where
1095  * Emacs will automagically detect them.
1096  * ---------------------------------------------------------------------
1097  * Local variables:
1098  * mode: c
1099  * indent-tabs-mode: t
1100  * c-basic-offset: 4
1101  * tab-width: 4
1102  * End:
1103  */