Added: utf_strcat_classname
[cacao.git] / src / vm / utf8.c
/* src/vm/utf8.c - utf functions
2
3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
6    Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23    02111-1307, USA.
24
25    Contact: cacao@complang.tuwien.ac.at
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 2136 2005-03-30 10:03:03Z twisti $
34
35 */
36
37
38 #include <string.h>
39
40 #include "mm/memory.h"
41 #include "vm/exceptions.h"
42 #include "vm/options.h"
43 #include "vm/statistics.h"
44 #include "vm/tables.h"
45 #include "vm/utf8.h"
46
47
48 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
49
50
51 /* utf-symbols for pointer comparison of frequently used strings **************/
52
53 utf *utf_java_lang_Object;              /* java/lang/Object                   */
54
55 utf *utf_java_lang_Class;
56 utf *utf_java_lang_ClassLoader;
57 utf *utf_java_lang_Cloneable;
58 utf *utf_java_lang_SecurityManager;
59 utf *utf_java_lang_String;
60 utf *utf_java_lang_System;
61 utf *utf_java_io_Serializable;
62
63 utf *utf_java_lang_Throwable;
64 utf *utf_java_lang_VMThrowable;
65 utf *utf_java_lang_Exception;
66 utf *utf_java_lang_Error;
67 utf *utf_java_lang_OutOfMemoryError;
68
69 utf* utf_java_lang_Void;
70 utf* utf_java_lang_Boolean;
71 utf* utf_java_lang_Byte;
72 utf* utf_java_lang_Character;
73 utf* utf_java_lang_Short;
74 utf* utf_java_lang_Integer;
75 utf* utf_java_lang_Long;
76 utf* utf_java_lang_Float;
77 utf* utf_java_lang_Double;
78
79 utf *utf_java_util_Vector;
80
81 utf *utf_InnerClasses;                  /* InnerClasses                       */
82 utf *utf_ConstantValue;                 /* ConstantValue                      */
83 utf *utf_Code;                          /* Code                               */
84 utf *utf_Exceptions;                    /* Exceptions                         */
85 utf *utf_LineNumberTable;               /* LineNumberTable                    */
86 utf *utf_SourceFile;                    /* SourceFile                         */
87
88 utf *utf_init;                          /* <init>                             */
89 utf *utf_clinit;                        /* <clinit>                           */
90 utf *utf_finalize;                      /* finalize                           */
91
92 utf *utf_printStackTrace;
93 utf *utf_fillInStackTrace;
94 utf *utf_loadClass;
95
96 utf *utf_void__void;                    /* ()V                                */
97 utf *utf_boolean__void;                 /* (Z)V                               */
98 utf *utf_byte__void;                    /* (B)V                               */
99 utf *utf_char__void;                    /* (C)V                               */
100 utf *utf_short__void;                   /* (S)V                               */
101 utf *utf_int__void;                     /* (I)V                               */
102 utf *utf_long__void;                    /* (J)V                               */
103 utf *utf_float__void;                   /* (F)V                               */
104 utf *utf_double__void;                  /* (D)V                               */
105 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
106 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
107 utf *utf_java_lang_String__java_lang_Class;
108 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
109
110 utf *array_packagename;
111
112
113 /* utf_init ********************************************************************
114
115    Initializes the utf8 subsystem.
116
117 *******************************************************************************/
118
119 void utf8_init(void)
120 {
121         /* create utf-symbols for pointer comparison of frequently used strings */
122
123         utf_java_lang_Object           = utf_new_char("java/lang/Object");
124
125         utf_java_lang_Class            = utf_new_char("java/lang/Class");
126         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
127         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
128         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
129         utf_java_lang_String           = utf_new_char("java/lang/String");
130         utf_java_lang_System           = utf_new_char("java/lang/System");
131         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
132
133         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
134         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
135         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
136         utf_java_lang_Error            = utf_new_char("java/lang/Error");
137         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
138
139         utf_java_lang_Void             = utf_new_char("java/lang/Void");
140         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
141         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
142         utf_java_lang_Character        = utf_new_char("java/lang/Character");
143         utf_java_lang_Short            = utf_new_char("java/lang/Short");
144         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
145         utf_java_lang_Long             = utf_new_char("java/lang/Long");
146         utf_java_lang_Float            = utf_new_char("java/lang/Float");
147         utf_java_lang_Double           = utf_new_char("java/lang/Double");
148
149         utf_java_util_Vector           = utf_new_char("java/util/Vector");
150
151         utf_InnerClasses               = utf_new_char("InnerClasses");
152         utf_ConstantValue              = utf_new_char("ConstantValue");
153         utf_Code                       = utf_new_char("Code");
154         utf_Exceptions                 = utf_new_char("Exceptions");
155         utf_LineNumberTable            = utf_new_char("LineNumberTable");
156         utf_SourceFile                 = utf_new_char("SourceFile");
157
158         utf_init                           = utf_new_char("<init>");
159         utf_clinit                         = utf_new_char("<clinit>");
160         utf_finalize                   = utf_new_char("finalize");
161
162         utf_printStackTrace            = utf_new_char("printStackTrace");
163         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
164         utf_loadClass                  = utf_new_char("loadClass");
165
166         utf_void__void                 = utf_new_char("()V");
167         utf_boolean__void              = utf_new_char("(Z)V");
168         utf_byte__void                 = utf_new_char("(B)V");
169         utf_char__void                 = utf_new_char("(C)V");
170         utf_short__void                = utf_new_char("(S)V");
171         utf_int__void                  = utf_new_char("(I)V");
172         utf_long__void                 = utf_new_char("(J)V");
173         utf_float__void                = utf_new_char("(F)V");
174         utf_double__void               = utf_new_char("(D)V");
175         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
176         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
177
178         utf_java_lang_String__java_lang_Class =
179                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
180
181         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
182
183         array_packagename              = utf_new_char("\t<the array package>");
184 }
185
186
187 /* utf_hashkey *****************************************************************
188
189    The hashkey is computed from the utf-text by using up to 8
190    characters.  For utf-symbols longer than 15 characters 3 characters
191    are taken from the beginning and the end, 2 characters are taken
192    from the middle.
193
194 *******************************************************************************/
195
196 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
197 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
198
199 u4 utf_hashkey(const char *text, u4 length)
200 {
201         const char *start_pos = text;       /* pointer to utf text                */
202         u4 a;
203
204         switch (length) {
205         case 0: /* empty string */
206                 return 0;
207
208         case 1: return fbs(0);
209         case 2: return fbs(0) ^ nbs(3);
210         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
211         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
212         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
213         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
214         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
215         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
216
217         case 9:
218                 a = fbs(0);
219                 a ^= nbs(1);
220                 a ^= nbs(2);
221                 text++;
222                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
223
224         case 10:
225                 a = fbs(0);
226                 text++;
227                 a ^= nbs(2);
228                 a ^= nbs(3);
229                 a ^= nbs(4);
230                 text++;
231                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
232
233         case 11:
234                 a = fbs(0);
235                 text++;
236                 a ^= nbs(2);
237                 a ^= nbs(3);
238                 a ^= nbs(4);
239                 text++;
240                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
241
242         case 12:
243                 a = fbs(0);
244                 text += 2;
245                 a ^= nbs(2);
246                 a ^= nbs(3);
247                 text++;
248                 a ^= nbs(5);
249                 a ^= nbs(6);
250                 a ^= nbs(7);
251                 text++;
252                 return a ^ nbs(9) ^ nbs(10);
253
254         case 13:
255                 a = fbs(0);
256                 a ^= nbs(1);
257                 text++;
258                 a ^= nbs(3);
259                 a ^= nbs(4);
260                 text += 2;      
261                 a ^= nbs(7);
262                 a ^= nbs(8);
263                 text += 2;
264                 return a ^ nbs(9) ^ nbs(10);
265
266         case 14:
267                 a = fbs(0);
268                 text += 2;      
269                 a ^= nbs(3);
270                 a ^= nbs(4);
271                 text += 2;      
272                 a ^= nbs(7);
273                 a ^= nbs(8);
274                 text += 2;
275                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
276
277         case 15:
278                 a = fbs(0);
279                 text += 2;      
280                 a ^= nbs(3);
281                 a ^= nbs(4);
282                 text += 2;      
283                 a ^= nbs(7);
284                 a ^= nbs(8);
285                 text += 2;
286                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
287
288         default:  /* 3 characters from beginning */
289                 a = fbs(0);
290                 text += 2;
291                 a ^= nbs(3);
292                 a ^= nbs(4);
293
294                 /* 2 characters from middle */
295                 text = start_pos + (length / 2);
296                 a ^= fbs(5);
297                 text += 2;
298                 a ^= nbs(6);    
299
300                 /* 3 characters from end */
301                 text = start_pos + length - 4;
302
303                 a ^= fbs(7);
304                 text++;
305
306                 return a ^ nbs(10) ^ nbs(11);
307     }
308 }
309
310
311 /* utf_hashkey *****************************************************************
312
313    Compute the hashkey of a unicode string.
314
315 *******************************************************************************/
316
317 u4 unicode_hashkey(u2 *text, u2 len)
318 {
319         return utf_hashkey((char *) text, len);
320 }
321
322
323 /* utf_new *********************************************************************
324
325    Creates a new utf-symbol, the text of the symbol is passed as a
326    u1-array. The function searches the utf-hashtable for a utf-symbol
327    with this text. On success the element returned, otherwise a new
328    hashtable element is created.
329
330    If the number of entries in the hashtable exceeds twice the size of
331    the hashtable slots a reorganization of the hashtable is done and
332    the utf symbols are copied to a new hashtable with doubled size.
333
334 *******************************************************************************/
335
336 utf *utf_new_intern(const char *text, u2 length);
337
338 utf *utf_new(const char *text, u2 length)
339 {
340     utf *r;
341
342 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
343     tables_lock();
344 #endif
345
346     r = utf_new_intern(text, length);
347
348 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
349     tables_unlock();
350 #endif
351
352     return r;
353 }
354
355
356 utf *utf_new_intern(const char *text, u2 length)
357 {
358         u4 key;                             /* hashkey computed from utf-text     */
359         u4 slot;                            /* slot in hashtable                  */
360         utf *u;                             /* hashtable element                  */
361         u2 i;
362
363 #ifdef STATISTICS
364         if (opt_stat)
365                 count_utf_new++;
366 #endif
367
368         key  = utf_hashkey(text, length);
369         slot = key & (utf_hash.size - 1);
370         u    = utf_hash.ptr[slot];
371
372         /* search external hash chain for utf-symbol */
373         while (u) {
374                 if (u->blength == length) {
375
376                         /* compare text of hashtable elements */
377                         for (i = 0; i < length; i++)
378                                 if (text[i] != u->text[i]) goto nomatch;
379                         
380 #ifdef STATISTICS
381                         if (opt_stat)
382                                 count_utf_new_found++;
383 #endif
384
385                         /* symbol found in hashtable */
386                         return u;
387                 }
388         nomatch:
389                 u = u->hashlink; /* next element in external chain */
390         }
391
392 #ifdef STATISTICS
393         if (opt_stat)
394                 count_utf_len += sizeof(utf) + length;
395 #endif
396
397         /* location in hashtable found, create new utf element */
398         u = NEW(utf);
399         u->blength  = length;               /* length in bytes of utfstring       */
400         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
401         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
402         memcpy(u->text, text, length);      /* copy utf-text                      */
403         u->text[length] = '\0';
404         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
405
406         utf_hash.entries++;                 /* update number of entries           */
407
408         if (utf_hash.entries > (utf_hash.size * 2)) {
409
410         /* reorganization of hashtable, average length of 
411            the external chains is approx. 2                */  
412
413                 u4 i;
414                 utf *u;
415                 hashtable newhash; /* the new hashtable */
416
417                 /* create new hashtable, double the size */
418                 init_hashtable(&newhash, utf_hash.size * 2);
419                 newhash.entries = utf_hash.entries;
420
421 #ifdef STATISTICS
422                 if (opt_stat)
423                         count_utf_len += sizeof(utf*) * utf_hash.size;
424 #endif
425
426                 /* transfer elements to new hashtable */
427                 for (i = 0; i < utf_hash.size; i++) {
428                         u = (utf *) utf_hash.ptr[i];
429                         while (u) {
430                                 utf *nextu = u->hashlink;
431                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
432                                                 
433                                 u->hashlink = (utf *) newhash.ptr[slot];
434                                 newhash.ptr[slot] = u;
435
436                                 /* follow link in external hash chain */
437                                 u = nextu;
438                         }
439                 }
440         
441                 /* dispose old table */
442                 MFREE(utf_hash.ptr, void*, utf_hash.size);
443                 utf_hash = newhash;
444         }
445
446         return u;
447 }
448
449
450 /* utf_new_u2 ******************************************************************
451
452    Make utf symbol from u2 array, if isclassname is true '.' is
453    replaced by '/'.
454
455 *******************************************************************************/
456
457 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
458 {
459         char *buffer;                   /* memory buffer for  unicode characters  */
460         char *pos;                      /* pointer to current position in buffer  */
461         u4 left;                        /* unicode characters left                */
462         u4 buflength;                   /* utf length in bytes of the u2 array    */
463         utf *result;                    /* resulting utf-string                   */
464         int i;          
465
466         /* determine utf length in bytes and allocate memory */
467
468         buflength = u2_utflength(unicode_pos, unicode_length); 
469         buffer    = MNEW(char, buflength);
470  
471         left = buflength;
472         pos  = buffer;
473
474         for (i = 0; i++ < unicode_length; unicode_pos++) {
475                 /* next unicode character */
476                 u2 c = *unicode_pos;
477                 
478                 if ((c != 0) && (c < 0x80)) {
479                         /* 1 character */       
480                         left--;
481                 if ((int) left < 0) break;
482                         /* convert classname */
483                         if (isclassname && c == '.')
484                                 *pos++ = '/';
485                         else
486                                 *pos++ = (char) c;
487
488                 } else if (c < 0x800) {             
489                         /* 2 characters */                              
490                 unsigned char high = c >> 6;
491                 unsigned char low  = c & 0x3F;
492                         left = left - 2;
493                 if ((int) left < 0) break;
494                 *pos++ = high | 0xC0; 
495                 *pos++ = low  | 0x80;     
496
497                 } else {         
498                 /* 3 characters */                              
499                 char low  = c & 0x3f;
500                 char mid  = (c >> 6) & 0x3F;
501                 char high = c >> 12;
502                         left = left - 3;
503                 if ((int) left < 0) break;
504                 *pos++ = high | 0xE0; 
505                 *pos++ = mid  | 0x80;  
506                 *pos++ = low  | 0x80;   
507                 }
508         }
509         
510         /* insert utf-string into symbol-table */
511         result = utf_new(buffer,buflength);
512
513         MFREE(buffer, char, buflength);
514
515         return result;
516 }
517
518
519 /* utf_new_char ****************************************************************
520
521    Creates a new utf symbol, the text for this symbol is passed as a
522    c-string ( = char* ).
523
524 *******************************************************************************/
525
526 utf *utf_new_char(const char *text)
527 {
528         return utf_new(text, strlen(text));
529 }
530
531
532 /* utf_new_char_classname ******************************************************
533
534    Creates a new utf symbol, the text for this symbol is passed as a
535    c-string ( = char* ) "." characters are going to be replaced by
536    "/". Since the above function is used often, this is a separte
537    function, instead of an if.
538
539 *******************************************************************************/
540
541 utf *utf_new_char_classname(const char *text)
542 {
543         if (strchr(text, '.')) {
544                 char *txt = strdup(text);
545                 char *end = txt + strlen(txt);
546                 char *c;
547                 utf *tmpRes;
548
549                 for (c = txt; c < end; c++)
550                         if (*c == '.') *c = '/';
551
552                 tmpRes = utf_new(txt, strlen(txt));
553                 FREE(txt, 0);
554
555                 return tmpRes;
556
557         } else
558                 return utf_new(text, strlen(text));
559 }
560
561
562 /* utf_nextu2 ******************************************************************
563
564    Read the next unicode character from the utf string and increment
565    the utf-string pointer accordingly.
566
567 *******************************************************************************/
568
569 u2 utf_nextu2(char **utf_ptr)
570 {
571     /* uncompressed unicode character */
572     u2 unicode_char = 0;
573     /* current position in utf text */  
574     unsigned char *utf = (unsigned char *) (*utf_ptr);
575     /* bytes representing the unicode character */
576     unsigned char ch1, ch2, ch3;
577     /* number of bytes used to represent the unicode character */
578     int len = 0;
579         
580     switch ((ch1 = utf[0]) >> 4) {
581         default: /* 1 byte */
582                 (*utf_ptr)++;
583                 return (u2) ch1;
584         case 0xC: 
585         case 0xD: /* 2 bytes */
586                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
587                         unsigned char high = ch1 & 0x1F;
588                         unsigned char low  = ch2 & 0x3F;
589                         unicode_char = (high << 6) + low;
590                         len = 2;
591                 }
592                 break;
593
594         case 0xE: /* 2 or 3 bytes */
595                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
596                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
597                                 unsigned char low  = ch3 & 0x3f;
598                                 unsigned char mid  = ch2 & 0x3f;
599                                 unsigned char high = ch1 & 0x0f;
600                                 unicode_char = (((high << 6) + mid) << 6) + low;
601                                 len = 3;
602                         } else
603                                 len = 2;                                           
604                 }
605                 break;
606     }
607
608     /* update position in utf-text */
609     *utf_ptr = (char *) (utf + len);
610
611     return unicode_char;
612 }
613
614
615 /* utf_strlen ******************************************************************
616
617    Determine number of unicode characters in the utf string.
618
619 *******************************************************************************/
620
621 u4 utf_strlen(utf *u)
622 {
623         char *endpos;                       /* points behind utf string           */
624         char *utf_ptr;                      /* current position in utf text       */
625         u4 len = 0;                         /* number of unicode characters       */
626
627         if (!u) {
628                 *exceptionptr = new_nullpointerexception();
629                 return 0;
630         }
631
632         endpos = utf_end(u);
633         utf_ptr = u->text;
634
635         while (utf_ptr < endpos) {
636                 len++;
637                 /* next unicode character */
638                 utf_nextu2(&utf_ptr);
639         }
640
641         if (utf_ptr != endpos)
642                 /* string ended abruptly */
643                 throw_cacao_exception_exit(string_java_lang_InternalError,
644                                                                    "Illegal utf8 string");
645
646         return len;
647 }
648
649
650 /* u2_utflength ****************************************************************
651
652    Returns the utf length in bytes of a u2 array.
653
654 *******************************************************************************/
655
656 u4 u2_utflength(u2 *text, u4 u2_length)
657 {
658         u4 result_len = 0;                  /* utf length in bytes                */
659         u2 ch;                              /* current unicode character          */
660         u4 len;
661         
662         for (len = 0; len < u2_length; len++) {
663                 /* next unicode character */
664                 ch = *text++;
665           
666                 /* determine bytes required to store unicode character as utf */
667                 if (ch && (ch < 0x80)) 
668                         result_len++;
669                 else if (ch < 0x800)
670                         result_len += 2;        
671                 else 
672                         result_len += 3;        
673         }
674
675     return result_len;
676 }
677
678
679 /* utf_display *****************************************************************
680
681    Write utf symbol to stdout (for debugging purposes).
682
683 *******************************************************************************/
684
685 void utf_display(utf *u)
686 {
687         char *endpos;                       /* points behind utf string           */
688         char *utf_ptr;                      /* current position in utf text       */
689
690         if (!u) {
691                 printf("NULL");
692                 fflush(stdout);
693                 return;
694         }
695
696         endpos = utf_end(u);
697         utf_ptr = u->text;
698
699         while (utf_ptr < endpos) {
700                 /* read next unicode character */                
701                 u2 c = utf_nextu2(&utf_ptr);
702                 if (c >= 32 && c <= 127) printf("%c", c);
703                 else printf("?");
704         }
705
706         fflush(stdout);
707 }
708
709
710 /* utf_display_classname *******************************************************
711
712    Write utf symbol to stdout with `/' converted to `.' (for debugging
713    purposes).
714
715 *******************************************************************************/
716
717 void utf_display_classname(utf *u)
718 {
719         char *endpos;                       /* points behind utf string           */
720         char *utf_ptr;                      /* current position in utf text       */
721
722         if (!u) {
723                 printf("NULL");
724                 fflush(stdout);
725                 return;
726         }
727
728         endpos = utf_end(u);
729         utf_ptr = u->text;
730
731         while (utf_ptr < endpos) {
732                 /* read next unicode character */                
733                 u2 c = utf_nextu2(&utf_ptr);
734                 if (c == '/') c = '.';
735                 if (c >= 32 && c <= 127) printf("%c", c);
736                 else printf("?");
737         }
738
739         fflush(stdout);
740 }
741
742
743 /* utf_sprint ******************************************************************
744         
745    Write utf symbol into c-string (for debugging purposes).
746
747 *******************************************************************************/
748
749 void utf_sprint(char *buffer, utf *u)
750 {
751         char *endpos;                       /* points behind utf string           */
752         char *utf_ptr;                      /* current position in utf text       */
753         u2 pos = 0;                         /* position in c-string               */
754
755         if (!u) {
756                 strcpy(buffer, "NULL");
757                 return;
758         }
759
760         endpos = utf_end(u);
761         utf_ptr = u->text;
762
763         while (utf_ptr < endpos) 
764                 /* copy next unicode character */       
765                 buffer[pos++] = utf_nextu2(&utf_ptr);
766
767         /* terminate string */
768         buffer[pos] = '\0';
769 }
770
771
772 /* utf_sprint_classname ********************************************************
773         
774    Write utf symbol into c-string with `/' converted to `.' (for debugging
775    purposes).
776
777 *******************************************************************************/
778
779 void utf_sprint_classname(char *buffer, utf *u)
780 {
781         char *endpos;                       /* points behind utf string           */
782         char *utf_ptr;                      /* current position in utf text       */
783         u2 pos = 0;                         /* position in c-string               */
784
785         if (!u) {
786                 strcpy(buffer, "NULL");
787                 return;
788         }
789
790         endpos = utf_end(u);
791         utf_ptr = u->text;
792
793         while (utf_ptr < endpos) {
794                 /* copy next unicode character */       
795                 u2 c = utf_nextu2(&utf_ptr);
796                 if (c == '/') c = '.';
797                 buffer[pos++] = c;
798         }
799
800         /* terminate string */
801         buffer[pos] = '\0';
802 }
803
804
805 /* utf_strcat ******************************************************************
806         
807    Like libc strcat, but uses an utf8 string.
808
809 *******************************************************************************/
810
811 void utf_strcat(char *buffer, utf *u)
812 {
813         utf_sprint(buffer + strlen(buffer), u);
814 }
815
816
817 /* utf_strcat_classname ********************************************************
818         
819    Like libc strcat, but uses an utf8 string.
820
821 *******************************************************************************/
822
823 void utf_strcat_classname(char *buffer, utf *u)
824 {
825         utf_sprint_classname(buffer + strlen(buffer), u);
826 }
827
828
829 /* utf_fprint ******************************************************************
830         
831    Write utf symbol into file.
832
833 *******************************************************************************/
834
835 void utf_fprint(FILE *file, utf *u)
836 {
837         char *endpos;                       /* points behind utf string           */
838         char *utf_ptr;                      /* current position in utf text       */
839
840         if (!u)
841                 return;
842
843         endpos = utf_end(u);
844         utf_ptr = u->text;
845
846         while (utf_ptr < endpos) { 
847                 /* read next unicode character */                
848                 u2 c = utf_nextu2(&utf_ptr);                            
849
850                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
851                 else fprintf(file, "?");
852         }
853 }
854
855
856 /* utf_fprint_classname ********************************************************
857         
858    Write utf symbol into file with `/' converted to `.'.
859
860 *******************************************************************************/
861
862 void utf_fprint_classname(FILE *file, utf *u)
863 {
864         char *endpos;                       /* points behind utf string           */
865         char *utf_ptr;                      /* current position in utf text       */
866
867     if (!u)
868                 return;
869
870         endpos = utf_end(u);
871         utf_ptr = u->text;
872
873         while (utf_ptr < endpos) { 
874                 /* read next unicode character */                
875                 u2 c = utf_nextu2(&utf_ptr);                            
876                 if (c == '/') c = '.';
877
878                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
879                 else fprintf(file, "?");
880         }
881 }
882
883
884 /* is_valid_utf ****************************************************************
885
886    Return true if the given string is a valid UTF-8 string.
887
888    utf_ptr...points to first character
889    end_pos...points after last character
890
891 *******************************************************************************/
892
893 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
894
895 bool is_valid_utf(char *utf_ptr, char *end_pos)
896 {
897         int bytes;
898         int len,i;
899         char c;
900         unsigned long v;
901
902         if (end_pos < utf_ptr) return false;
903         bytes = end_pos - utf_ptr;
904         while (bytes--) {
905                 c = *utf_ptr++;
906
907                 if (!c) return false;                     /* 0x00 is not allowed */
908                 if ((c & 0x80) == 0) continue;            /* ASCII */
909
910                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
911                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
912                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
913                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
914                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
915                 else return false;                        /* invalid leading byte */
916
917                 if (len > 2) return false;                /* Java limitation */
918
919                 v = (unsigned long)c & (0x3f >> len);
920                 
921                 if ((bytes -= len) < 0) return false;     /* missing bytes */
922
923                 for (i = len; i--; ) {
924                         c = *utf_ptr++;
925                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
926                                 return false;
927                         v = (v << 6) | (c & 0x3f);
928                 }
929
930                 if (v == 0) {
931                         if (len != 1) return false;           /* Java special */
932
933                 } else {
934                         /* Sun Java seems to allow overlong UTF-8 encodings */
935                         
936                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
937                                 if (!opt_liberalutf)
938                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
939                                 /* XXX change this to panic? */
940                         }
941                 }
942
943                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
944                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
945
946                 /* even these seem to be allowed */
947                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
948         }
949
950         return true;
951 }
952
953
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters, i.e. bytes below 0x20, and the two-byte encoding of
   zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Only bytes inside [utf_ptr, end_pos) are ever read, even if the
   string does not honour the assumption above and ends in a lone 0xc0.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr) return false; /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20) return false; /* disallow control characters */

		/* disallow zero; the second byte is only inspected if it still
		   belongs to the string, so nothing behind end_pos is read */

		if (c == 0xc0 && utf_ptr < end_pos && (unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
981
982 bool is_valid_name_utf(utf *u)
983 {
984         return is_valid_name(u->text,utf_end(u));
985 }
986
987
988 /* utf_show ********************************************************************
989
990    Writes the utf symbols in the utfhash to stdout and displays the
991    number of external hash chains grouped according to the chainlength
992    (for debugging purposes).
993
994 *******************************************************************************/
995
996 void utf_show(void)
997 {
998
999 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1000
1001         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1002         u4 max_chainlength = 0;      /* maximum length of the chains */
1003         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1004         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1005         u4 i;
1006
1007         printf ("UTF-HASH:\n");
1008
1009         /* show element of utf-hashtable */
1010         for (i=0; i<utf_hash.size; i++) {
1011                 utf *u = utf_hash.ptr[i];
1012                 if (u) {
1013                         printf ("SLOT %d: ", (int) i);
1014                         while (u) {
1015                                 printf ("'");
1016                                 utf_display (u);
1017                                 printf ("' ");
1018                                 u = u->hashlink;
1019                         }       
1020                         printf ("\n");
1021                 }
1022                 
1023         }
1024
1025         printf ("UTF-HASH: %d slots for %d entries\n", 
1026                         (int) utf_hash.size, (int) utf_hash.entries );
1027
1028
1029         if (utf_hash.entries == 0)
1030                 return;
1031
1032         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1033
1034         for (i=0;i<CHAIN_LIMIT;i++)
1035                 chain_count[i]=0;
1036
1037         /* count numbers of hashchains according to their length */
1038         for (i=0; i<utf_hash.size; i++) {
1039                   
1040                 utf *u = (utf*) utf_hash.ptr[i];
1041                 u4 chain_length = 0;
1042
1043                 /* determine chainlength */
1044                 while (u) {
1045                         u = u->hashlink;
1046                         chain_length++;
1047                 }
1048
1049                 /* update sum of all chainlengths */
1050                 sum_chainlength+=chain_length;
1051
1052                 /* determine the maximum length of the chains */
1053                 if (chain_length>max_chainlength)
1054                         max_chainlength = chain_length;
1055
1056                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1057                 if (chain_length>=CHAIN_LIMIT) {
1058                         beyond_limit+=chain_length;
1059                         chain_length=CHAIN_LIMIT-1;
1060                 }
1061
1062                 /* update number of hashchains of current length */
1063                 chain_count[chain_length]++;
1064         }
1065
1066         /* display results */  
1067         for (i=1;i<CHAIN_LIMIT-1;i++) 
1068                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1069           
1070         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1071
1072
1073         printf("max. chainlength:%5d\n",max_chainlength);
1074
1075         /* avg. chainlength = sum of chainlengths / number of chains */
1076         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1077 }
1078
1079
1080 /*
1081  * These are local overrides for various environment variables in Emacs.
1082  * Please do not remove this and leave it at the end of the file, where
1083  * Emacs will automagically detect them.
1084  * ---------------------------------------------------------------------
1085  * Local variables:
1086  * mode: c
1087  * indent-tabs-mode: t
1088  * c-basic-offset: 4
1089  * tab-width: 4
1090  * End:
1091  */