Moved global string definitions to string.c for 2 reasons: it seems to be
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf8.c - utf8 functions
2
3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
6    Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23    02111-1307, USA.
24
25    Contact: cacao@complang.tuwien.ac.at
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 2458 2005-05-12 23:02:07Z twisti $
34
35 */
36
37
38 #include <string.h>
39
40 #include "mm/memory.h"
41 #include "vm/exceptions.h"
42 #include "vm/options.h"
43 #include "vm/statistics.h"
44 #include "vm/stringlocal.h"
45 #include "vm/tables.h"
46 #include "vm/utf8.h"
47
48
49 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
50
51
52 /* utf-symbols for pointer comparison of frequently used strings **************/
53
54 utf *utf_java_lang_Object;              /* java/lang/Object                   */
55
56 utf *utf_java_lang_Class;
57 utf *utf_java_lang_ClassLoader;
58 utf *utf_java_lang_Cloneable;
59 utf *utf_java_lang_SecurityManager;
60 utf *utf_java_lang_String;
61 utf *utf_java_lang_System;
62 utf *utf_java_lang_ThreadGroup;
63 utf *utf_java_io_Serializable;
64
65 utf *utf_java_lang_Throwable;
66 utf *utf_java_lang_VMThrowable;
67 utf *utf_java_lang_Error;
68 utf *utf_java_lang_Exception;
69 utf *utf_java_lang_NoClassDefFoundError;
70 utf *utf_java_lang_OutOfMemoryError;
71 utf *utf_java_lang_ClassNotFoundException;
72
73 utf* utf_java_lang_Void;
74 utf* utf_java_lang_Boolean;
75 utf* utf_java_lang_Byte;
76 utf* utf_java_lang_Character;
77 utf* utf_java_lang_Short;
78 utf* utf_java_lang_Integer;
79 utf* utf_java_lang_Long;
80 utf* utf_java_lang_Float;
81 utf* utf_java_lang_Double;
82
83 utf *utf_java_util_Vector;
84 utf *utf_java_lang_reflect_Constructor;
85 utf *utf_java_lang_reflect_Method;
86
87
88 utf *utf_InnerClasses;                  /* InnerClasses                       */
89 utf *utf_ConstantValue;                 /* ConstantValue                      */
90 utf *utf_Code;                          /* Code                               */
91 utf *utf_Exceptions;                    /* Exceptions                         */
92 utf *utf_LineNumberTable;               /* LineNumberTable                    */
93 utf *utf_SourceFile;                    /* SourceFile                         */
94
95 utf *utf_init;                          /* <init>                             */
96 utf *utf_clinit;                        /* <clinit>                           */
97 utf *utf_finalize;                      /* finalize                           */
98
99 utf *utf_printStackTrace;
100 utf *utf_fillInStackTrace;
101 utf *utf_loadClass;
102
103 utf *utf_void__void;                    /* ()V                                */
104 utf *utf_boolean__void;                 /* (Z)V                               */
105 utf *utf_byte__void;                    /* (B)V                               */
106 utf *utf_char__void;                    /* (C)V                               */
107 utf *utf_short__void;                   /* (S)V                               */
108 utf *utf_int__void;                     /* (I)V                               */
109 utf *utf_long__void;                    /* (J)V                               */
110 utf *utf_float__void;                   /* (F)V                               */
111 utf *utf_double__void;                  /* (D)V                               */
112 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
113 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
114 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
115 utf *utf_java_lang_String__java_lang_Class;
116 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
117
118 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
119
120 utf *array_packagename;
121
122
123 /* utf_init ********************************************************************
124
125    Initializes the utf8 subsystem.
126
127 *******************************************************************************/
128
129 void utf8_init(void)
130 {
131         /* create utf-symbols for pointer comparison of frequently used strings */
132
133         utf_java_lang_Object           = utf_new_char("java/lang/Object");
134
135         utf_java_lang_Class            = utf_new_char("java/lang/Class");
136         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
137         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
138         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
139         utf_java_lang_String           = utf_new_char("java/lang/String");
140         utf_java_lang_System           = utf_new_char("java/lang/System");
141         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
142         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
143
144         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
145         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
146         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
147         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
148
149         utf_java_lang_NoClassDefFoundError =
150                 utf_new_char(string_java_lang_NoClassDefFoundError);
151
152         utf_java_lang_OutOfMemoryError =
153                 utf_new_char(string_java_lang_OutOfMemoryError);
154
155         utf_java_lang_ClassNotFoundException =
156                 utf_new_char(string_java_lang_ClassNotFoundException);
157
158         utf_java_lang_Void             = utf_new_char("java/lang/Void");
159         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
160         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
161         utf_java_lang_Character        = utf_new_char("java/lang/Character");
162         utf_java_lang_Short            = utf_new_char("java/lang/Short");
163         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
164         utf_java_lang_Long             = utf_new_char("java/lang/Long");
165         utf_java_lang_Float            = utf_new_char("java/lang/Float");
166         utf_java_lang_Double           = utf_new_char("java/lang/Double");
167
168         utf_java_util_Vector           = utf_new_char("java/util/Vector");
169         utf_java_lang_reflect_Constructor = utf_new_char("java/lang/reflect/Constructor");
170         utf_java_lang_reflect_Method      = utf_new_char("java/lang/reflect/Method");
171
172         utf_InnerClasses               = utf_new_char("InnerClasses");
173         utf_ConstantValue              = utf_new_char("ConstantValue");
174         utf_Code                       = utf_new_char("Code");
175         utf_Exceptions                 = utf_new_char("Exceptions");
176         utf_LineNumberTable            = utf_new_char("LineNumberTable");
177         utf_SourceFile                 = utf_new_char("SourceFile");
178
179         utf_init                           = utf_new_char("<init>");
180         utf_clinit                         = utf_new_char("<clinit>");
181         utf_finalize                   = utf_new_char("finalize");
182
183         utf_printStackTrace            = utf_new_char("printStackTrace");
184         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
185         utf_loadClass                  = utf_new_char("loadClass");
186
187         utf_void__void                 = utf_new_char("()V");
188         utf_boolean__void              = utf_new_char("(Z)V");
189         utf_byte__void                 = utf_new_char("(B)V");
190         utf_char__void                 = utf_new_char("(C)V");
191         utf_short__void                = utf_new_char("(S)V");
192         utf_int__void                  = utf_new_char("(I)V");
193         utf_long__void                 = utf_new_char("(J)V");
194         utf_float__void                = utf_new_char("(F)V");
195         utf_double__void               = utf_new_char("(D)V");
196         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
197         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
198         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
199
200         utf_java_lang_String__java_lang_Class =
201                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
202
203         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
204
205         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
206
207         array_packagename              = utf_new_char("\t<the array package>");
208 }
209
210
211 /* utf_hashkey *****************************************************************
212
213    The hashkey is computed from the utf-text by using up to 8
214    characters.  For utf-symbols longer than 15 characters 3 characters
215    are taken from the beginning and the end, 2 characters are taken
216    from the middle.
217
218 *******************************************************************************/
219
220 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
221 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
222
223 u4 utf_hashkey(const char *text, u4 length)
224 {
225         const char *start_pos = text;       /* pointer to utf text                */
226         u4 a;
227
228         switch (length) {
229         case 0: /* empty string */
230                 return 0;
231
232         case 1: return fbs(0);
233         case 2: return fbs(0) ^ nbs(3);
234         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
235         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
236         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
237         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
238         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
239         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
240
241         case 9:
242                 a = fbs(0);
243                 a ^= nbs(1);
244                 a ^= nbs(2);
245                 text++;
246                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
247
248         case 10:
249                 a = fbs(0);
250                 text++;
251                 a ^= nbs(2);
252                 a ^= nbs(3);
253                 a ^= nbs(4);
254                 text++;
255                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
256
257         case 11:
258                 a = fbs(0);
259                 text++;
260                 a ^= nbs(2);
261                 a ^= nbs(3);
262                 a ^= nbs(4);
263                 text++;
264                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
265
266         case 12:
267                 a = fbs(0);
268                 text += 2;
269                 a ^= nbs(2);
270                 a ^= nbs(3);
271                 text++;
272                 a ^= nbs(5);
273                 a ^= nbs(6);
274                 a ^= nbs(7);
275                 text++;
276                 return a ^ nbs(9) ^ nbs(10);
277
278         case 13:
279                 a = fbs(0);
280                 a ^= nbs(1);
281                 text++;
282                 a ^= nbs(3);
283                 a ^= nbs(4);
284                 text += 2;      
285                 a ^= nbs(7);
286                 a ^= nbs(8);
287                 text += 2;
288                 return a ^ nbs(9) ^ nbs(10);
289
290         case 14:
291                 a = fbs(0);
292                 text += 2;      
293                 a ^= nbs(3);
294                 a ^= nbs(4);
295                 text += 2;      
296                 a ^= nbs(7);
297                 a ^= nbs(8);
298                 text += 2;
299                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
300
301         case 15:
302                 a = fbs(0);
303                 text += 2;      
304                 a ^= nbs(3);
305                 a ^= nbs(4);
306                 text += 2;      
307                 a ^= nbs(7);
308                 a ^= nbs(8);
309                 text += 2;
310                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
311
312         default:  /* 3 characters from beginning */
313                 a = fbs(0);
314                 text += 2;
315                 a ^= nbs(3);
316                 a ^= nbs(4);
317
318                 /* 2 characters from middle */
319                 text = start_pos + (length / 2);
320                 a ^= fbs(5);
321                 text += 2;
322                 a ^= nbs(6);    
323
324                 /* 3 characters from end */
325                 text = start_pos + length - 4;
326
327                 a ^= fbs(7);
328                 text++;
329
330                 return a ^ nbs(10) ^ nbs(11);
331     }
332 }
333
334
335 /* utf_hashkey *****************************************************************
336
337    Compute the hashkey of a unicode string.
338
339 *******************************************************************************/
340
341 u4 unicode_hashkey(u2 *text, u2 len)
342 {
343         return utf_hashkey((char *) text, len);
344 }
345
346
347 /* utf_new *********************************************************************
348
349    Creates a new utf-symbol, the text of the symbol is passed as a
350    u1-array. The function searches the utf-hashtable for a utf-symbol
351    with this text. On success the element returned, otherwise a new
352    hashtable element is created.
353
354    If the number of entries in the hashtable exceeds twice the size of
355    the hashtable slots a reorganization of the hashtable is done and
356    the utf symbols are copied to a new hashtable with doubled size.
357
358 *******************************************************************************/
359
360 utf *utf_new_intern(const char *text, u2 length);
361
362 utf *utf_new(const char *text, u2 length)
363 {
364     utf *r;
365
366 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
367     tables_lock();
368 #endif
369
370     r = utf_new_intern(text, length);
371
372 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
373     tables_unlock();
374 #endif
375
376     return r;
377 }
378
379
380 utf *utf_new_intern(const char *text, u2 length)
381 {
382         u4 key;                             /* hashkey computed from utf-text     */
383         u4 slot;                            /* slot in hashtable                  */
384         utf *u;                             /* hashtable element                  */
385         u2 i;
386
387 #ifdef STATISTICS
388         if (opt_stat)
389                 count_utf_new++;
390 #endif
391
392         key  = utf_hashkey(text, length);
393         slot = key & (utf_hash.size - 1);
394         u    = utf_hash.ptr[slot];
395
396         /* search external hash chain for utf-symbol */
397         while (u) {
398                 if (u->blength == length) {
399
400                         /* compare text of hashtable elements */
401                         for (i = 0; i < length; i++)
402                                 if (text[i] != u->text[i]) goto nomatch;
403                         
404 #ifdef STATISTICS
405                         if (opt_stat)
406                                 count_utf_new_found++;
407 #endif
408
409                         /* symbol found in hashtable */
410                         return u;
411                 }
412         nomatch:
413                 u = u->hashlink; /* next element in external chain */
414         }
415
416 #ifdef STATISTICS
417         if (opt_stat)
418                 count_utf_len += sizeof(utf) + length;
419 #endif
420
421         /* location in hashtable found, create new utf element */
422         u = NEW(utf);
423         u->blength  = length;               /* length in bytes of utfstring       */
424         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
425         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
426         memcpy(u->text, text, length);      /* copy utf-text                      */
427         u->text[length] = '\0';
428         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
429
430         utf_hash.entries++;                 /* update number of entries           */
431
432         if (utf_hash.entries > (utf_hash.size * 2)) {
433
434         /* reorganization of hashtable, average length of 
435            the external chains is approx. 2                */  
436
437                 u4 i;
438                 utf *u;
439                 hashtable newhash; /* the new hashtable */
440
441                 /* create new hashtable, double the size */
442                 init_hashtable(&newhash, utf_hash.size * 2);
443                 newhash.entries = utf_hash.entries;
444
445 #ifdef STATISTICS
446                 if (opt_stat)
447                         count_utf_len += sizeof(utf*) * utf_hash.size;
448 #endif
449
450                 /* transfer elements to new hashtable */
451                 for (i = 0; i < utf_hash.size; i++) {
452                         u = (utf *) utf_hash.ptr[i];
453                         while (u) {
454                                 utf *nextu = u->hashlink;
455                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
456                                                 
457                                 u->hashlink = (utf *) newhash.ptr[slot];
458                                 newhash.ptr[slot] = u;
459
460                                 /* follow link in external hash chain */
461                                 u = nextu;
462                         }
463                 }
464         
465                 /* dispose old table */
466                 MFREE(utf_hash.ptr, void*, utf_hash.size);
467                 utf_hash = newhash;
468         }
469
470         return u;
471 }
472
473
474 /* utf_new_u2 ******************************************************************
475
476    Make utf symbol from u2 array, if isclassname is true '.' is
477    replaced by '/'.
478
479 *******************************************************************************/
480
481 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
482 {
483         char *buffer;                   /* memory buffer for  unicode characters  */
484         char *pos;                      /* pointer to current position in buffer  */
485         u4 left;                        /* unicode characters left                */
486         u4 buflength;                   /* utf length in bytes of the u2 array    */
487         utf *result;                    /* resulting utf-string                   */
488         int i;          
489
490         /* determine utf length in bytes and allocate memory */
491
492         buflength = u2_utflength(unicode_pos, unicode_length); 
493         buffer    = MNEW(char, buflength);
494  
495         left = buflength;
496         pos  = buffer;
497
498         for (i = 0; i++ < unicode_length; unicode_pos++) {
499                 /* next unicode character */
500                 u2 c = *unicode_pos;
501                 
502                 if ((c != 0) && (c < 0x80)) {
503                         /* 1 character */       
504                         left--;
505                 if ((int) left < 0) break;
506                         /* convert classname */
507                         if (isclassname && c == '.')
508                                 *pos++ = '/';
509                         else
510                                 *pos++ = (char) c;
511
512                 } else if (c < 0x800) {             
513                         /* 2 characters */                              
514                 unsigned char high = c >> 6;
515                 unsigned char low  = c & 0x3F;
516                         left = left - 2;
517                 if ((int) left < 0) break;
518                 *pos++ = high | 0xC0; 
519                 *pos++ = low  | 0x80;     
520
521                 } else {         
522                 /* 3 characters */                              
523                 char low  = c & 0x3f;
524                 char mid  = (c >> 6) & 0x3F;
525                 char high = c >> 12;
526                         left = left - 3;
527                 if ((int) left < 0) break;
528                 *pos++ = high | 0xE0; 
529                 *pos++ = mid  | 0x80;  
530                 *pos++ = low  | 0x80;   
531                 }
532         }
533         
534         /* insert utf-string into symbol-table */
535         result = utf_new(buffer,buflength);
536
537         MFREE(buffer, char, buflength);
538
539         return result;
540 }
541
542
543 /* utf_new_char ****************************************************************
544
545    Creates a new utf symbol, the text for this symbol is passed as a
546    c-string ( = char* ).
547
548 *******************************************************************************/
549
550 utf *utf_new_char(const char *text)
551 {
552         return utf_new(text, strlen(text));
553 }
554
555
556 /* utf_new_char_classname ******************************************************
557
558    Creates a new utf symbol, the text for this symbol is passed as a
559    c-string ( = char* ) "." characters are going to be replaced by
560    "/". Since the above function is used often, this is a separte
561    function, instead of an if.
562
563 *******************************************************************************/
564
565 utf *utf_new_char_classname(const char *text)
566 {
567         if (strchr(text, '.')) {
568                 char *txt = strdup(text);
569                 char *end = txt + strlen(txt);
570                 char *c;
571                 utf *tmpRes;
572
573                 for (c = txt; c < end; c++)
574                         if (*c == '.') *c = '/';
575
576                 tmpRes = utf_new(txt, strlen(txt));
577                 FREE(txt, 0);
578
579                 return tmpRes;
580
581         } else
582                 return utf_new(text, strlen(text));
583 }
584
585
586 /* utf_nextu2 ******************************************************************
587
588    Read the next unicode character from the utf string and increment
589    the utf-string pointer accordingly.
590
591 *******************************************************************************/
592
593 u2 utf_nextu2(char **utf_ptr)
594 {
595     /* uncompressed unicode character */
596     u2 unicode_char = 0;
597     /* current position in utf text */  
598     unsigned char *utf = (unsigned char *) (*utf_ptr);
599     /* bytes representing the unicode character */
600     unsigned char ch1, ch2, ch3;
601     /* number of bytes used to represent the unicode character */
602     int len = 0;
603         
604     switch ((ch1 = utf[0]) >> 4) {
605         default: /* 1 byte */
606                 (*utf_ptr)++;
607                 return (u2) ch1;
608         case 0xC: 
609         case 0xD: /* 2 bytes */
610                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
611                         unsigned char high = ch1 & 0x1F;
612                         unsigned char low  = ch2 & 0x3F;
613                         unicode_char = (high << 6) + low;
614                         len = 2;
615                 }
616                 break;
617
618         case 0xE: /* 2 or 3 bytes */
619                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
620                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
621                                 unsigned char low  = ch3 & 0x3f;
622                                 unsigned char mid  = ch2 & 0x3f;
623                                 unsigned char high = ch1 & 0x0f;
624                                 unicode_char = (((high << 6) + mid) << 6) + low;
625                                 len = 3;
626                         } else
627                                 len = 2;                                           
628                 }
629                 break;
630     }
631
632     /* update position in utf-text */
633     *utf_ptr = (char *) (utf + len);
634
635     return unicode_char;
636 }
637
638
639 /* utf_strlen ******************************************************************
640
641    Determine number of unicode characters in the utf string.
642
643 *******************************************************************************/
644
645 u4 utf_strlen(utf *u)
646 {
647         char *endpos;                       /* points behind utf string           */
648         char *utf_ptr;                      /* current position in utf text       */
649         u4 len = 0;                         /* number of unicode characters       */
650
651         if (!u) {
652                 *exceptionptr = new_nullpointerexception();
653                 return 0;
654         }
655
656         endpos = utf_end(u);
657         utf_ptr = u->text;
658
659         while (utf_ptr < endpos) {
660                 len++;
661                 /* next unicode character */
662                 utf_nextu2(&utf_ptr);
663         }
664
665         if (utf_ptr != endpos)
666                 /* string ended abruptly */
667                 throw_cacao_exception_exit(string_java_lang_InternalError,
668                                                                    "Illegal utf8 string");
669
670         return len;
671 }
672
673
674 /* u2_utflength ****************************************************************
675
676    Returns the utf length in bytes of a u2 array.
677
678 *******************************************************************************/
679
680 u4 u2_utflength(u2 *text, u4 u2_length)
681 {
682         u4 result_len = 0;                  /* utf length in bytes                */
683         u2 ch;                              /* current unicode character          */
684         u4 len;
685         
686         for (len = 0; len < u2_length; len++) {
687                 /* next unicode character */
688                 ch = *text++;
689           
690                 /* determine bytes required to store unicode character as utf */
691                 if (ch && (ch < 0x80)) 
692                         result_len++;
693                 else if (ch < 0x800)
694                         result_len += 2;        
695                 else 
696                         result_len += 3;        
697         }
698
699     return result_len;
700 }
701
702
703 /* utf_display *****************************************************************
704
705    Write utf symbol to stdout (for debugging purposes).
706
707 *******************************************************************************/
708
709 void utf_display(utf *u)
710 {
711         char *endpos;                       /* points behind utf string           */
712         char *utf_ptr;                      /* current position in utf text       */
713
714         if (!u) {
715                 printf("NULL");
716                 fflush(stdout);
717                 return;
718         }
719
720         endpos = utf_end(u);
721         utf_ptr = u->text;
722
723         while (utf_ptr < endpos) {
724                 /* read next unicode character */                
725                 u2 c = utf_nextu2(&utf_ptr);
726                 if (c >= 32 && c <= 127) printf("%c", c);
727                 else printf("?");
728         }
729
730         fflush(stdout);
731 }
732
733
734 /* utf_display_classname *******************************************************
735
736    Write utf symbol to stdout with `/' converted to `.' (for debugging
737    purposes).
738
739 *******************************************************************************/
740
741 void utf_display_classname(utf *u)
742 {
743         char *endpos;                       /* points behind utf string           */
744         char *utf_ptr;                      /* current position in utf text       */
745
746         if (!u) {
747                 printf("NULL");
748                 fflush(stdout);
749                 return;
750         }
751
752         endpos = utf_end(u);
753         utf_ptr = u->text;
754
755         while (utf_ptr < endpos) {
756                 /* read next unicode character */                
757                 u2 c = utf_nextu2(&utf_ptr);
758                 if (c == '/') c = '.';
759                 if (c >= 32 && c <= 127) printf("%c", c);
760                 else printf("?");
761         }
762
763         fflush(stdout);
764 }
765
766
767 /* utf_sprint ******************************************************************
768         
769    Write utf symbol into c-string (for debugging purposes).
770
771 *******************************************************************************/
772
773 void utf_sprint(char *buffer, utf *u)
774 {
775         char *endpos;                       /* points behind utf string           */
776         char *utf_ptr;                      /* current position in utf text       */
777         u2 pos = 0;                         /* position in c-string               */
778
779         if (!u) {
780                 strcpy(buffer, "NULL");
781                 return;
782         }
783
784         endpos = utf_end(u);
785         utf_ptr = u->text;
786
787         while (utf_ptr < endpos) 
788                 /* copy next unicode character */       
789                 buffer[pos++] = utf_nextu2(&utf_ptr);
790
791         /* terminate string */
792         buffer[pos] = '\0';
793 }
794
795
796 /* utf_sprint_classname ********************************************************
797         
798    Write utf symbol into c-string with `/' converted to `.' (for debugging
799    purposes).
800
801 *******************************************************************************/
802
803 void utf_sprint_classname(char *buffer, utf *u)
804 {
805         char *endpos;                       /* points behind utf string           */
806         char *utf_ptr;                      /* current position in utf text       */
807         u2 pos = 0;                         /* position in c-string               */
808
809         if (!u) {
810                 strcpy(buffer, "NULL");
811                 return;
812         }
813
814         endpos = utf_end(u);
815         utf_ptr = u->text;
816
817         while (utf_ptr < endpos) {
818                 /* copy next unicode character */       
819                 u2 c = utf_nextu2(&utf_ptr);
820                 if (c == '/') c = '.';
821                 buffer[pos++] = c;
822         }
823
824         /* terminate string */
825         buffer[pos] = '\0';
826 }
827
828
829 /* utf_strcat ******************************************************************
830         
831    Like libc strcat, but uses an utf8 string.
832
833 *******************************************************************************/
834
835 void utf_strcat(char *buffer, utf *u)
836 {
837         utf_sprint(buffer + strlen(buffer), u);
838 }
839
840
841 /* utf_strcat_classname ********************************************************
842         
843    Like libc strcat, but uses an utf8 string.
844
845 *******************************************************************************/
846
847 void utf_strcat_classname(char *buffer, utf *u)
848 {
849         utf_sprint_classname(buffer + strlen(buffer), u);
850 }
851
852
853 /* utf_fprint ******************************************************************
854         
855    Write utf symbol into file.
856
857 *******************************************************************************/
858
859 void utf_fprint(FILE *file, utf *u)
860 {
861         char *endpos;                       /* points behind utf string           */
862         char *utf_ptr;                      /* current position in utf text       */
863
864         if (!u)
865                 return;
866
867         endpos = utf_end(u);
868         utf_ptr = u->text;
869
870         while (utf_ptr < endpos) { 
871                 /* read next unicode character */                
872                 u2 c = utf_nextu2(&utf_ptr);                            
873
874                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
875                 else fprintf(file, "?");
876         }
877 }
878
879
880 /* utf_fprint_classname ********************************************************
881         
882    Write utf symbol into file with `/' converted to `.'.
883
884 *******************************************************************************/
885
886 void utf_fprint_classname(FILE *file, utf *u)
887 {
888         char *endpos;                       /* points behind utf string           */
889         char *utf_ptr;                      /* current position in utf text       */
890
891     if (!u)
892                 return;
893
894         endpos = utf_end(u);
895         utf_ptr = u->text;
896
897         while (utf_ptr < endpos) { 
898                 /* read next unicode character */                
899                 u2 c = utf_nextu2(&utf_ptr);                            
900                 if (c == '/') c = '.';
901
902                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
903                 else fprintf(file, "?");
904         }
905 }
906
907
908 /* is_valid_utf ****************************************************************
909
910    Return true if the given string is a valid UTF-8 string.
911
912    utf_ptr...points to first character
913    end_pos...points after last character
914
915 *******************************************************************************/
916
917 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
918
919 bool is_valid_utf(char *utf_ptr, char *end_pos)
920 {
921         int bytes;
922         int len,i;
923         char c;
924         unsigned long v;
925
926         if (end_pos < utf_ptr) return false;
927         bytes = end_pos - utf_ptr;
928         while (bytes--) {
929                 c = *utf_ptr++;
930
931                 if (!c) return false;                     /* 0x00 is not allowed */
932                 if ((c & 0x80) == 0) continue;            /* ASCII */
933
934                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
935                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
936                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
937                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
938                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
939                 else return false;                        /* invalid leading byte */
940
941                 if (len > 2) return false;                /* Java limitation */
942
943                 v = (unsigned long)c & (0x3f >> len);
944                 
945                 if ((bytes -= len) < 0) return false;     /* missing bytes */
946
947                 for (i = len; i--; ) {
948                         c = *utf_ptr++;
949                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
950                                 return false;
951                         v = (v << 6) | (c & 0x3f);
952                 }
953
954                 if (v == 0) {
955                         if (len != 1) return false;           /* Java special */
956
957                 } else {
958                         /* Sun Java seems to allow overlong UTF-8 encodings */
959                         
960                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
961                                 if (!opt_liberalutf)
962                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
963                                 /* XXX change this to panic? */
964                         }
965                 }
966
967                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
968                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
969
970                 /* even these seem to be allowed */
971                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
972         }
973
974         return true;
975 }
976
977
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the two-byte encoding of zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   The byte following a 0xc0 is only inspected if it still lies inside
   the given range, so nothing at or behind end_pos is ever read, even
   if the string ends in a lone 0xc0.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
        if (end_pos <= utf_ptr)
                return false;                   /* disallow empty names */

        while (utf_ptr < end_pos) {
                unsigned char c = (unsigned char) *utf_ptr++;

                if (c < 0x20)
                        return false;               /* disallow control characters */

                /* disallow zero; utf_ptr already points at the byte after c, */
                /* so make sure that byte belongs to the string before        */
                /* looking at it                                              */

                if (c == 0xc0 && utf_ptr < end_pos
                        && (unsigned char) *utf_ptr == 0x80)
                        return false;
        }

        return true;
}
1005
1006 bool is_valid_name_utf(utf *u)
1007 {
1008         return is_valid_name(u->text,utf_end(u));
1009 }
1010
1011
1012 /* utf_show ********************************************************************
1013
1014    Writes the utf symbols in the utfhash to stdout and displays the
1015    number of external hash chains grouped according to the chainlength
1016    (for debugging purposes).
1017
1018 *******************************************************************************/
1019
1020 void utf_show(void)
1021 {
1022
1023 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1024
1025         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1026         u4 max_chainlength = 0;      /* maximum length of the chains */
1027         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1028         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1029         u4 i;
1030
1031         printf ("UTF-HASH:\n");
1032
1033         /* show element of utf-hashtable */
1034         for (i=0; i<utf_hash.size; i++) {
1035                 utf *u = utf_hash.ptr[i];
1036                 if (u) {
1037                         printf ("SLOT %d: ", (int) i);
1038                         while (u) {
1039                                 printf ("'");
1040                                 utf_display (u);
1041                                 printf ("' ");
1042                                 u = u->hashlink;
1043                         }       
1044                         printf ("\n");
1045                 }
1046                 
1047         }
1048
1049         printf ("UTF-HASH: %d slots for %d entries\n", 
1050                         (int) utf_hash.size, (int) utf_hash.entries );
1051
1052
1053         if (utf_hash.entries == 0)
1054                 return;
1055
1056         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1057
1058         for (i=0;i<CHAIN_LIMIT;i++)
1059                 chain_count[i]=0;
1060
1061         /* count numbers of hashchains according to their length */
1062         for (i=0; i<utf_hash.size; i++) {
1063                   
1064                 utf *u = (utf*) utf_hash.ptr[i];
1065                 u4 chain_length = 0;
1066
1067                 /* determine chainlength */
1068                 while (u) {
1069                         u = u->hashlink;
1070                         chain_length++;
1071                 }
1072
1073                 /* update sum of all chainlengths */
1074                 sum_chainlength+=chain_length;
1075
1076                 /* determine the maximum length of the chains */
1077                 if (chain_length>max_chainlength)
1078                         max_chainlength = chain_length;
1079
1080                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1081                 if (chain_length>=CHAIN_LIMIT) {
1082                         beyond_limit+=chain_length;
1083                         chain_length=CHAIN_LIMIT-1;
1084                 }
1085
1086                 /* update number of hashchains of current length */
1087                 chain_count[chain_length]++;
1088         }
1089
1090         /* display results */  
1091         for (i=1;i<CHAIN_LIMIT-1;i++) 
1092                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1093           
1094         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1095
1096
1097         printf("max. chainlength:%5d\n",max_chainlength);
1098
1099         /* avg. chainlength = sum of chainlengths / number of chains */
1100         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1101 }
1102
1103
1104 /*
1105  * These are local overrides for various environment variables in Emacs.
1106  * Please do not remove this and leave it at the end of the file, where
1107  * Emacs will automagically detect them.
1108  * ---------------------------------------------------------------------
1109  * Local variables:
1110  * mode: c
1111  * indent-tabs-mode: t
1112  * c-basic-offset: 4
1113  * tab-width: 4
1114  * End:
1115  */