d81446448ae70cbc083534a78c73736c0d787875
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf.c - utf functions
2
3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
6    Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23    02111-1307, USA.
24
25    Contact: cacao@complang.tuwien.ac.at
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 2506 2005-05-23 08:32:38Z twisti $
34
35 */
36
37
38 #include <string.h>
39
40 #include "mm/memory.h"
41 #include "vm/exceptions.h"
42 #include "vm/options.h"
43 #include "vm/statistics.h"
44 #include "vm/stringlocal.h"
45 #include "vm/tables.h"
46 #include "vm/utf8.h"
47
48
49 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
50
51
52 /* utf-symbols for pointer comparison of frequently used strings **************/
53
54 utf *utf_java_lang_Object;              /* java/lang/Object                   */
55
56 utf *utf_java_lang_Class;
57 utf *utf_java_lang_ClassLoader;
58 utf *utf_java_lang_Cloneable;
59 utf *utf_java_lang_SecurityManager;
60 utf *utf_java_lang_String;
61 utf *utf_java_lang_System;
62 utf *utf_java_lang_ThreadGroup;
63 utf *utf_java_io_Serializable;
64
65 utf *utf_java_lang_Throwable;
66 utf *utf_java_lang_VMThrowable;
67 utf *utf_java_lang_Error;
68 utf *utf_java_lang_Exception;
69 utf *utf_java_lang_NoClassDefFoundError;
70 utf *utf_java_lang_OutOfMemoryError;
71 utf *utf_java_lang_ClassNotFoundException;
72
73 utf* utf_java_lang_Void;
74 utf* utf_java_lang_Boolean;
75 utf* utf_java_lang_Byte;
76 utf* utf_java_lang_Character;
77 utf* utf_java_lang_Short;
78 utf* utf_java_lang_Integer;
79 utf* utf_java_lang_Long;
80 utf* utf_java_lang_Float;
81 utf* utf_java_lang_Double;
82
83 utf *utf_java_util_Vector;
84 utf *utf_java_lang_reflect_Constructor;
85 utf *utf_java_lang_reflect_Method;
86
87
88 utf *utf_InnerClasses;                  /* InnerClasses                       */
89 utf *utf_ConstantValue;                 /* ConstantValue                      */
90 utf *utf_Code;                          /* Code                               */
91 utf *utf_Exceptions;                    /* Exceptions                         */
92 utf *utf_LineNumberTable;               /* LineNumberTable                    */
93 utf *utf_SourceFile;                    /* SourceFile                         */
94
95 utf *utf_init;                          /* <init>                             */
96 utf *utf_clinit;                        /* <clinit>                           */
97 utf *utf_finalize;                      /* finalize                           */
98
99 utf *utf_printStackTrace;
100 utf *utf_fillInStackTrace;
101 utf *utf_loadClass;
102
103 utf *utf_void__void;                    /* ()V                                */
104 utf *utf_boolean__void;                 /* (Z)V                               */
105 utf *utf_byte__void;                    /* (B)V                               */
106 utf *utf_char__void;                    /* (C)V                               */
107 utf *utf_short__void;                   /* (S)V                               */
108 utf *utf_int__void;                     /* (I)V                               */
109 utf *utf_long__void;                    /* (J)V                               */
110 utf *utf_float__void;                   /* (F)V                               */
111 utf *utf_double__void;                  /* (D)V                               */
112 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
113 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
114 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
115 utf *utf_java_lang_String__java_lang_Class;
116 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
117
118 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
119
120 utf *array_packagename;
121
122
123 /* utf_init ********************************************************************
124
125    Initializes the utf8 subsystem.
126
127 *******************************************************************************/
128
129 void utf8_init(void)
130 {
131         /* create utf-symbols for pointer comparison of frequently used strings */
132
133         utf_java_lang_Object           = utf_new_char("java/lang/Object");
134
135         utf_java_lang_Class            = utf_new_char("java/lang/Class");
136         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
137         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
138         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
139         utf_java_lang_String           = utf_new_char("java/lang/String");
140         utf_java_lang_System           = utf_new_char("java/lang/System");
141         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
142         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
143
144         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
145         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
146         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
147         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
148
149         utf_java_lang_NoClassDefFoundError =
150                 utf_new_char(string_java_lang_NoClassDefFoundError);
151
152         utf_java_lang_OutOfMemoryError =
153                 utf_new_char(string_java_lang_OutOfMemoryError);
154
155         utf_java_lang_ClassNotFoundException =
156                 utf_new_char(string_java_lang_ClassNotFoundException);
157
158         utf_java_lang_Void             = utf_new_char("java/lang/Void");
159         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
160         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
161         utf_java_lang_Character        = utf_new_char("java/lang/Character");
162         utf_java_lang_Short            = utf_new_char("java/lang/Short");
163         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
164         utf_java_lang_Long             = utf_new_char("java/lang/Long");
165         utf_java_lang_Float            = utf_new_char("java/lang/Float");
166         utf_java_lang_Double           = utf_new_char("java/lang/Double");
167
168         utf_java_util_Vector           = utf_new_char("java/util/Vector");
169
170         utf_java_lang_reflect_Constructor =
171                 utf_new_char("java/lang/reflect/Constructor");
172
173         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
174
175         utf_InnerClasses               = utf_new_char("InnerClasses");
176         utf_ConstantValue              = utf_new_char("ConstantValue");
177         utf_Code                       = utf_new_char("Code");
178         utf_Exceptions                 = utf_new_char("Exceptions");
179         utf_LineNumberTable            = utf_new_char("LineNumberTable");
180         utf_SourceFile                 = utf_new_char("SourceFile");
181
182         utf_init                           = utf_new_char("<init>");
183         utf_clinit                         = utf_new_char("<clinit>");
184         utf_finalize                   = utf_new_char("finalize");
185
186         utf_printStackTrace            = utf_new_char("printStackTrace");
187         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
188         utf_loadClass                  = utf_new_char("loadClass");
189
190         utf_void__void                 = utf_new_char("()V");
191         utf_boolean__void              = utf_new_char("(Z)V");
192         utf_byte__void                 = utf_new_char("(B)V");
193         utf_char__void                 = utf_new_char("(C)V");
194         utf_short__void                = utf_new_char("(S)V");
195         utf_int__void                  = utf_new_char("(I)V");
196         utf_long__void                 = utf_new_char("(J)V");
197         utf_float__void                = utf_new_char("(F)V");
198         utf_double__void               = utf_new_char("(D)V");
199         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
200         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
201         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
202
203         utf_java_lang_String__java_lang_Class =
204                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
205
206         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
207
208         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
209
210         array_packagename              = utf_new_char("\t<the array package>");
211 }
212
213
214 /* utf_hashkey *****************************************************************
215
216    The hashkey is computed from the utf-text by using up to 8
217    characters.  For utf-symbols longer than 15 characters 3 characters
218    are taken from the beginning and the end, 2 characters are taken
219    from the middle.
220
221 *******************************************************************************/
222
223 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
224 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
225
226 u4 utf_hashkey(const char *text, u4 length)
227 {
228         const char *start_pos = text;       /* pointer to utf text                */
229         u4 a;
230
231         switch (length) {
232         case 0: /* empty string */
233                 return 0;
234
235         case 1: return fbs(0);
236         case 2: return fbs(0) ^ nbs(3);
237         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
238         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
239         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
240         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
241         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
242         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
243
244         case 9:
245                 a = fbs(0);
246                 a ^= nbs(1);
247                 a ^= nbs(2);
248                 text++;
249                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
250
251         case 10:
252                 a = fbs(0);
253                 text++;
254                 a ^= nbs(2);
255                 a ^= nbs(3);
256                 a ^= nbs(4);
257                 text++;
258                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
259
260         case 11:
261                 a = fbs(0);
262                 text++;
263                 a ^= nbs(2);
264                 a ^= nbs(3);
265                 a ^= nbs(4);
266                 text++;
267                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
268
269         case 12:
270                 a = fbs(0);
271                 text += 2;
272                 a ^= nbs(2);
273                 a ^= nbs(3);
274                 text++;
275                 a ^= nbs(5);
276                 a ^= nbs(6);
277                 a ^= nbs(7);
278                 text++;
279                 return a ^ nbs(9) ^ nbs(10);
280
281         case 13:
282                 a = fbs(0);
283                 a ^= nbs(1);
284                 text++;
285                 a ^= nbs(3);
286                 a ^= nbs(4);
287                 text += 2;      
288                 a ^= nbs(7);
289                 a ^= nbs(8);
290                 text += 2;
291                 return a ^ nbs(9) ^ nbs(10);
292
293         case 14:
294                 a = fbs(0);
295                 text += 2;      
296                 a ^= nbs(3);
297                 a ^= nbs(4);
298                 text += 2;      
299                 a ^= nbs(7);
300                 a ^= nbs(8);
301                 text += 2;
302                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
303
304         case 15:
305                 a = fbs(0);
306                 text += 2;      
307                 a ^= nbs(3);
308                 a ^= nbs(4);
309                 text += 2;      
310                 a ^= nbs(7);
311                 a ^= nbs(8);
312                 text += 2;
313                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
314
315         default:  /* 3 characters from beginning */
316                 a = fbs(0);
317                 text += 2;
318                 a ^= nbs(3);
319                 a ^= nbs(4);
320
321                 /* 2 characters from middle */
322                 text = start_pos + (length / 2);
323                 a ^= fbs(5);
324                 text += 2;
325                 a ^= nbs(6);    
326
327                 /* 3 characters from end */
328                 text = start_pos + length - 4;
329
330                 a ^= fbs(7);
331                 text++;
332
333                 return a ^ nbs(10) ^ nbs(11);
334     }
335 }
336
337
338 /* utf_hashkey *****************************************************************
339
340    Compute the hashkey of a unicode string.
341
342 *******************************************************************************/
343
344 u4 unicode_hashkey(u2 *text, u2 len)
345 {
346         return utf_hashkey((char *) text, len);
347 }
348
349
350 /* utf_new *********************************************************************
351
352    Creates a new utf-symbol, the text of the symbol is passed as a
353    u1-array. The function searches the utf-hashtable for a utf-symbol
354    with this text. On success the element returned, otherwise a new
355    hashtable element is created.
356
357    If the number of entries in the hashtable exceeds twice the size of
358    the hashtable slots a reorganization of the hashtable is done and
359    the utf symbols are copied to a new hashtable with doubled size.
360
361 *******************************************************************************/
362
363 utf *utf_new_intern(const char *text, u2 length);
364
365 utf *utf_new(const char *text, u2 length)
366 {
367     utf *r;
368
369 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
370     tables_lock();
371 #endif
372
373     r = utf_new_intern(text, length);
374
375 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
376     tables_unlock();
377 #endif
378
379     return r;
380 }
381
382
383 utf *utf_new_intern(const char *text, u2 length)
384 {
385         u4 key;                             /* hashkey computed from utf-text     */
386         u4 slot;                            /* slot in hashtable                  */
387         utf *u;                             /* hashtable element                  */
388         u2 i;
389
390 #ifdef STATISTICS
391         if (opt_stat)
392                 count_utf_new++;
393 #endif
394
395         key  = utf_hashkey(text, length);
396         slot = key & (utf_hash.size - 1);
397         u    = utf_hash.ptr[slot];
398
399         /* search external hash chain for utf-symbol */
400         while (u) {
401                 if (u->blength == length) {
402
403                         /* compare text of hashtable elements */
404                         for (i = 0; i < length; i++)
405                                 if (text[i] != u->text[i]) goto nomatch;
406                         
407 #ifdef STATISTICS
408                         if (opt_stat)
409                                 count_utf_new_found++;
410 #endif
411
412                         /* symbol found in hashtable */
413                         return u;
414                 }
415         nomatch:
416                 u = u->hashlink; /* next element in external chain */
417         }
418
419 #ifdef STATISTICS
420         if (opt_stat)
421                 count_utf_len += sizeof(utf) + length;
422 #endif
423
424         /* location in hashtable found, create new utf element */
425         u = NEW(utf);
426         u->blength  = length;               /* length in bytes of utfstring       */
427         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
428         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
429         memcpy(u->text, text, length);      /* copy utf-text                      */
430         u->text[length] = '\0';
431         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
432
433         utf_hash.entries++;                 /* update number of entries           */
434
435         if (utf_hash.entries > (utf_hash.size * 2)) {
436
437         /* reorganization of hashtable, average length of 
438            the external chains is approx. 2                */  
439
440                 u4 i;
441                 utf *u;
442                 hashtable newhash; /* the new hashtable */
443
444                 /* create new hashtable, double the size */
445                 init_hashtable(&newhash, utf_hash.size * 2);
446                 newhash.entries = utf_hash.entries;
447
448 #ifdef STATISTICS
449                 if (opt_stat)
450                         count_utf_len += sizeof(utf*) * utf_hash.size;
451 #endif
452
453                 /* transfer elements to new hashtable */
454                 for (i = 0; i < utf_hash.size; i++) {
455                         u = (utf *) utf_hash.ptr[i];
456                         while (u) {
457                                 utf *nextu = u->hashlink;
458                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
459                                                 
460                                 u->hashlink = (utf *) newhash.ptr[slot];
461                                 newhash.ptr[slot] = u;
462
463                                 /* follow link in external hash chain */
464                                 u = nextu;
465                         }
466                 }
467         
468                 /* dispose old table */
469                 MFREE(utf_hash.ptr, void*, utf_hash.size);
470                 utf_hash = newhash;
471         }
472
473         return u;
474 }
475
476
477 /* utf_new_u2 ******************************************************************
478
479    Make utf symbol from u2 array, if isclassname is true '.' is
480    replaced by '/'.
481
482 *******************************************************************************/
483
484 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
485 {
486         char *buffer;                   /* memory buffer for  unicode characters  */
487         char *pos;                      /* pointer to current position in buffer  */
488         u4 left;                        /* unicode characters left                */
489         u4 buflength;                   /* utf length in bytes of the u2 array    */
490         utf *result;                    /* resulting utf-string                   */
491         int i;          
492
493         /* determine utf length in bytes and allocate memory */
494
495         buflength = u2_utflength(unicode_pos, unicode_length); 
496         buffer    = MNEW(char, buflength);
497  
498         left = buflength;
499         pos  = buffer;
500
501         for (i = 0; i++ < unicode_length; unicode_pos++) {
502                 /* next unicode character */
503                 u2 c = *unicode_pos;
504                 
505                 if ((c != 0) && (c < 0x80)) {
506                         /* 1 character */       
507                         left--;
508                 if ((int) left < 0) break;
509                         /* convert classname */
510                         if (isclassname && c == '.')
511                                 *pos++ = '/';
512                         else
513                                 *pos++ = (char) c;
514
515                 } else if (c < 0x800) {             
516                         /* 2 characters */                              
517                 unsigned char high = c >> 6;
518                 unsigned char low  = c & 0x3F;
519                         left = left - 2;
520                 if ((int) left < 0) break;
521                 *pos++ = high | 0xC0; 
522                 *pos++ = low  | 0x80;     
523
524                 } else {         
525                 /* 3 characters */                              
526                 char low  = c & 0x3f;
527                 char mid  = (c >> 6) & 0x3F;
528                 char high = c >> 12;
529                         left = left - 3;
530                 if ((int) left < 0) break;
531                 *pos++ = high | 0xE0; 
532                 *pos++ = mid  | 0x80;  
533                 *pos++ = low  | 0x80;   
534                 }
535         }
536         
537         /* insert utf-string into symbol-table */
538         result = utf_new(buffer,buflength);
539
540         MFREE(buffer, char, buflength);
541
542         return result;
543 }
544
545
546 /* utf_new_char ****************************************************************
547
548    Creates a new utf symbol, the text for this symbol is passed as a
549    c-string ( = char* ).
550
551 *******************************************************************************/
552
553 utf *utf_new_char(const char *text)
554 {
555         return utf_new(text, strlen(text));
556 }
557
558
559 /* utf_new_char_classname ******************************************************
560
561    Creates a new utf symbol, the text for this symbol is passed as a
562    c-string ( = char* ) "." characters are going to be replaced by
563    "/". Since the above function is used often, this is a separte
564    function, instead of an if.
565
566 *******************************************************************************/
567
568 utf *utf_new_char_classname(const char *text)
569 {
570         if (strchr(text, '.')) {
571                 char *txt = strdup(text);
572                 char *end = txt + strlen(txt);
573                 char *c;
574                 utf *tmpRes;
575
576                 for (c = txt; c < end; c++)
577                         if (*c == '.') *c = '/';
578
579                 tmpRes = utf_new(txt, strlen(txt));
580                 FREE(txt, 0);
581
582                 return tmpRes;
583
584         } else
585                 return utf_new(text, strlen(text));
586 }
587
588
589 /* utf_nextu2 ******************************************************************
590
591    Read the next unicode character from the utf string and increment
592    the utf-string pointer accordingly.
593
594 *******************************************************************************/
595
596 u2 utf_nextu2(char **utf_ptr)
597 {
598     /* uncompressed unicode character */
599     u2 unicode_char = 0;
600     /* current position in utf text */  
601     unsigned char *utf = (unsigned char *) (*utf_ptr);
602     /* bytes representing the unicode character */
603     unsigned char ch1, ch2, ch3;
604     /* number of bytes used to represent the unicode character */
605     int len = 0;
606         
607     switch ((ch1 = utf[0]) >> 4) {
608         default: /* 1 byte */
609                 (*utf_ptr)++;
610                 return (u2) ch1;
611         case 0xC: 
612         case 0xD: /* 2 bytes */
613                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
614                         unsigned char high = ch1 & 0x1F;
615                         unsigned char low  = ch2 & 0x3F;
616                         unicode_char = (high << 6) + low;
617                         len = 2;
618                 }
619                 break;
620
621         case 0xE: /* 2 or 3 bytes */
622                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
623                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
624                                 unsigned char low  = ch3 & 0x3f;
625                                 unsigned char mid  = ch2 & 0x3f;
626                                 unsigned char high = ch1 & 0x0f;
627                                 unicode_char = (((high << 6) + mid) << 6) + low;
628                                 len = 3;
629                         } else
630                                 len = 2;                                           
631                 }
632                 break;
633     }
634
635     /* update position in utf-text */
636     *utf_ptr = (char *) (utf + len);
637
638     return unicode_char;
639 }
640
641
642 /* utf_strlen ******************************************************************
643
644    Determine number of unicode characters in the utf string.
645
646 *******************************************************************************/
647
648 u4 utf_strlen(utf *u)
649 {
650         char *endpos;                       /* points behind utf string           */
651         char *utf_ptr;                      /* current position in utf text       */
652         u4 len = 0;                         /* number of unicode characters       */
653
654         if (!u) {
655                 *exceptionptr = new_nullpointerexception();
656                 return 0;
657         }
658
659         endpos = utf_end(u);
660         utf_ptr = u->text;
661
662         while (utf_ptr < endpos) {
663                 len++;
664                 /* next unicode character */
665                 utf_nextu2(&utf_ptr);
666         }
667
668         if (utf_ptr != endpos)
669                 /* string ended abruptly */
670                 throw_cacao_exception_exit(string_java_lang_InternalError,
671                                                                    "Illegal utf8 string");
672
673         return len;
674 }
675
676
677 /* u2_utflength ****************************************************************
678
679    Returns the utf length in bytes of a u2 array.
680
681 *******************************************************************************/
682
683 u4 u2_utflength(u2 *text, u4 u2_length)
684 {
685         u4 result_len = 0;                  /* utf length in bytes                */
686         u2 ch;                              /* current unicode character          */
687         u4 len;
688         
689         for (len = 0; len < u2_length; len++) {
690                 /* next unicode character */
691                 ch = *text++;
692           
693                 /* determine bytes required to store unicode character as utf */
694                 if (ch && (ch < 0x80)) 
695                         result_len++;
696                 else if (ch < 0x800)
697                         result_len += 2;        
698                 else 
699                         result_len += 3;        
700         }
701
702     return result_len;
703 }
704
705
706 /* utf_display *****************************************************************
707
708    Write utf symbol to stdout (for debugging purposes).
709
710 *******************************************************************************/
711
712 void utf_display(utf *u)
713 {
714         char *endpos;                       /* points behind utf string           */
715         char *utf_ptr;                      /* current position in utf text       */
716
717         if (!u) {
718                 printf("NULL");
719                 fflush(stdout);
720                 return;
721         }
722
723         endpos = utf_end(u);
724         utf_ptr = u->text;
725
726         while (utf_ptr < endpos) {
727                 /* read next unicode character */                
728                 u2 c = utf_nextu2(&utf_ptr);
729                 if (c >= 32 && c <= 127) printf("%c", c);
730                 else printf("?");
731         }
732
733         fflush(stdout);
734 }
735
736
737 /* utf_display_classname *******************************************************
738
739    Write utf symbol to stdout with `/' converted to `.' (for debugging
740    purposes).
741
742 *******************************************************************************/
743
744 void utf_display_classname(utf *u)
745 {
746         char *endpos;                       /* points behind utf string           */
747         char *utf_ptr;                      /* current position in utf text       */
748
749         if (!u) {
750                 printf("NULL");
751                 fflush(stdout);
752                 return;
753         }
754
755         endpos = utf_end(u);
756         utf_ptr = u->text;
757
758         while (utf_ptr < endpos) {
759                 /* read next unicode character */                
760                 u2 c = utf_nextu2(&utf_ptr);
761                 if (c == '/') c = '.';
762                 if (c >= 32 && c <= 127) printf("%c", c);
763                 else printf("?");
764         }
765
766         fflush(stdout);
767 }
768
769
770 /* utf_sprint ******************************************************************
771         
772    Write utf symbol into c-string (for debugging purposes).
773
774 *******************************************************************************/
775
776 void utf_sprint(char *buffer, utf *u)
777 {
778         char *endpos;                       /* points behind utf string           */
779         char *utf_ptr;                      /* current position in utf text       */
780         u2 pos = 0;                         /* position in c-string               */
781
782         if (!u) {
783                 strcpy(buffer, "NULL");
784                 return;
785         }
786
787         endpos = utf_end(u);
788         utf_ptr = u->text;
789
790         while (utf_ptr < endpos) 
791                 /* copy next unicode character */       
792                 buffer[pos++] = utf_nextu2(&utf_ptr);
793
794         /* terminate string */
795         buffer[pos] = '\0';
796 }
797
798
799 /* utf_sprint_classname ********************************************************
800         
801    Write utf symbol into c-string with `/' converted to `.' (for debugging
802    purposes).
803
804 *******************************************************************************/
805
806 void utf_sprint_classname(char *buffer, utf *u)
807 {
808         char *endpos;                       /* points behind utf string           */
809         char *utf_ptr;                      /* current position in utf text       */
810         u2 pos = 0;                         /* position in c-string               */
811
812         if (!u) {
813                 strcpy(buffer, "NULL");
814                 return;
815         }
816
817         endpos = utf_end(u);
818         utf_ptr = u->text;
819
820         while (utf_ptr < endpos) {
821                 /* copy next unicode character */       
822                 u2 c = utf_nextu2(&utf_ptr);
823                 if (c == '/') c = '.';
824                 buffer[pos++] = c;
825         }
826
827         /* terminate string */
828         buffer[pos] = '\0';
829 }
830
831
832 /* utf_strcat ******************************************************************
833         
834    Like libc strcat, but uses an utf8 string.
835
836 *******************************************************************************/
837
838 void utf_strcat(char *buffer, utf *u)
839 {
840         utf_sprint(buffer + strlen(buffer), u);
841 }
842
843
844 /* utf_strcat_classname ********************************************************
845         
846    Like libc strcat, but uses an utf8 string.
847
848 *******************************************************************************/
849
850 void utf_strcat_classname(char *buffer, utf *u)
851 {
852         utf_sprint_classname(buffer + strlen(buffer), u);
853 }
854
855
856 /* utf_fprint ******************************************************************
857         
858    Write utf symbol into file.
859
860 *******************************************************************************/
861
862 void utf_fprint(FILE *file, utf *u)
863 {
864         char *endpos;                       /* points behind utf string           */
865         char *utf_ptr;                      /* current position in utf text       */
866
867         if (!u)
868                 return;
869
870         endpos = utf_end(u);
871         utf_ptr = u->text;
872
873         while (utf_ptr < endpos) { 
874                 /* read next unicode character */                
875                 u2 c = utf_nextu2(&utf_ptr);                            
876
877                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
878                 else fprintf(file, "?");
879         }
880 }
881
882
883 /* utf_fprint_classname ********************************************************
884         
885    Write utf symbol into file with `/' converted to `.'.
886
887 *******************************************************************************/
888
889 void utf_fprint_classname(FILE *file, utf *u)
890 {
891         char *endpos;                       /* points behind utf string           */
892         char *utf_ptr;                      /* current position in utf text       */
893
894     if (!u)
895                 return;
896
897         endpos = utf_end(u);
898         utf_ptr = u->text;
899
900         while (utf_ptr < endpos) { 
901                 /* read next unicode character */                
902                 u2 c = utf_nextu2(&utf_ptr);                            
903                 if (c == '/') c = '.';
904
905                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
906                 else fprintf(file, "?");
907         }
908 }
909
910
911 /* is_valid_utf ****************************************************************
912
913    Return true if the given string is a valid UTF-8 string.
914
915    utf_ptr...points to first character
916    end_pos...points after last character
917
918 *******************************************************************************/
919
920 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
921
922 bool is_valid_utf(char *utf_ptr, char *end_pos)
923 {
924         int bytes;
925         int len,i;
926         char c;
927         unsigned long v;
928
929         if (end_pos < utf_ptr) return false;
930         bytes = end_pos - utf_ptr;
931         while (bytes--) {
932                 c = *utf_ptr++;
933
934                 if (!c) return false;                     /* 0x00 is not allowed */
935                 if ((c & 0x80) == 0) continue;            /* ASCII */
936
937                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
938                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
939                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
940                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
941                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
942                 else return false;                        /* invalid leading byte */
943
944                 if (len > 2) return false;                /* Java limitation */
945
946                 v = (unsigned long)c & (0x3f >> len);
947                 
948                 if ((bytes -= len) < 0) return false;     /* missing bytes */
949
950                 for (i = len; i--; ) {
951                         c = *utf_ptr++;
952                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
953                                 return false;
954                         v = (v << 6) | (c & 0x3f);
955                 }
956
957                 if (v == 0) {
958                         if (len != 1) return false;           /* Java special */
959
960                 } else {
961                         /* Sun Java seems to allow overlong UTF-8 encodings */
962                         
963                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
964                                 if (!opt_liberalutf)
965                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
966                                 /* XXX change this to exception? */
967                         }
968                 }
969
970                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
971                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
972
973                 /* even these seem to be allowed */
974                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
975         }
976
977         return true;
978 }
979
980
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters, i.e. bytes below 0x20, and the two-byte encoding
   0xc0 0x80 of the zero character.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                       /* disallow empty names           */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;                   /* disallow control characters    */

		/* Disallow zero (0xc0 0x80). The following byte is only looked */
		/* at if it still belongs to the string, so that nothing behind */
		/* end_pos is read when 0xc0 happens to be the last byte.       */

		if (c == 0xc0 && utf_ptr < end_pos && (unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
1008
1009 bool is_valid_name_utf(utf *u)
1010 {
1011         return is_valid_name(u->text,utf_end(u));
1012 }
1013
1014
1015 /* utf_show ********************************************************************
1016
1017    Writes the utf symbols in the utfhash to stdout and displays the
1018    number of external hash chains grouped according to the chainlength
1019    (for debugging purposes).
1020
1021 *******************************************************************************/
1022
1023 void utf_show(void)
1024 {
1025
1026 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1027
1028         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1029         u4 max_chainlength = 0;      /* maximum length of the chains */
1030         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1031         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1032         u4 i;
1033
1034         printf ("UTF-HASH:\n");
1035
1036         /* show element of utf-hashtable */
1037         for (i=0; i<utf_hash.size; i++) {
1038                 utf *u = utf_hash.ptr[i];
1039                 if (u) {
1040                         printf ("SLOT %d: ", (int) i);
1041                         while (u) {
1042                                 printf ("'");
1043                                 utf_display (u);
1044                                 printf ("' ");
1045                                 u = u->hashlink;
1046                         }       
1047                         printf ("\n");
1048                 }
1049                 
1050         }
1051
1052         printf ("UTF-HASH: %d slots for %d entries\n", 
1053                         (int) utf_hash.size, (int) utf_hash.entries );
1054
1055
1056         if (utf_hash.entries == 0)
1057                 return;
1058
1059         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1060
1061         for (i=0;i<CHAIN_LIMIT;i++)
1062                 chain_count[i]=0;
1063
1064         /* count numbers of hashchains according to their length */
1065         for (i=0; i<utf_hash.size; i++) {
1066                   
1067                 utf *u = (utf*) utf_hash.ptr[i];
1068                 u4 chain_length = 0;
1069
1070                 /* determine chainlength */
1071                 while (u) {
1072                         u = u->hashlink;
1073                         chain_length++;
1074                 }
1075
1076                 /* update sum of all chainlengths */
1077                 sum_chainlength+=chain_length;
1078
1079                 /* determine the maximum length of the chains */
1080                 if (chain_length>max_chainlength)
1081                         max_chainlength = chain_length;
1082
1083                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1084                 if (chain_length>=CHAIN_LIMIT) {
1085                         beyond_limit+=chain_length;
1086                         chain_length=CHAIN_LIMIT-1;
1087                 }
1088
1089                 /* update number of hashchains of current length */
1090                 chain_count[chain_length]++;
1091         }
1092
1093         /* display results */  
1094         for (i=1;i<CHAIN_LIMIT-1;i++) 
1095                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1096           
1097         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1098
1099
1100         printf("max. chainlength:%5d\n",max_chainlength);
1101
1102         /* avg. chainlength = sum of chainlengths / number of chains */
1103         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1104 }
1105
1106
1107 /*
1108  * These are local overrides for various environment variables in Emacs.
1109  * Please do not remove this and leave it at the end of the file, where
1110  * Emacs will automagically detect them.
1111  * ---------------------------------------------------------------------
1112  * Local variables:
1113  * mode: c
1114  * indent-tabs-mode: t
1115  * c-basic-offset: 4
1116  * tab-width: 4
1117  * End:
1118  */