made array_packagename harder to spoof
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf.c - utf functions
2
3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
6    Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23    02111-1307, USA.
24
25    Contact: cacao@complang.tuwien.ac.at
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 2097 2005-03-27 18:59:15Z edwin $
34
35 */
36
37
38 #include <string.h>
39
40 #include "mm/memory.h"
41 #include "vm/exceptions.h"
42 #include "vm/options.h"
43 #include "vm/statistics.h"
44 #include "vm/tables.h"
45 #include "vm/utf8.h"
46
47
48 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
49
50
51 /* utf-symbols for pointer comparison of frequently used strings **************/
52
53 utf *utf_java_lang_Object;              /* java/lang/Object                   */
54
55 utf *utf_java_lang_Class;
56 utf *utf_java_lang_ClassLoader;
57 utf *utf_java_lang_Cloneable;
58 utf *utf_java_lang_SecurityManager;
59 utf *utf_java_lang_String;
60 utf *utf_java_lang_System;
61 utf *utf_java_io_Serializable;
62
63 utf *utf_java_lang_Throwable;
64 utf *utf_java_lang_VMThrowable;
65 utf *utf_java_lang_Exception;
66 utf *utf_java_lang_Error;
67 utf *utf_java_lang_OutOfMemoryError;
68
69 utf* utf_java_lang_Void;
70 utf* utf_java_lang_Boolean;
71 utf* utf_java_lang_Byte;
72 utf* utf_java_lang_Character;
73 utf* utf_java_lang_Short;
74 utf* utf_java_lang_Integer;
75 utf* utf_java_lang_Long;
76 utf* utf_java_lang_Float;
77 utf* utf_java_lang_Double;
78
79 utf *utf_java_util_Vector;
80
81 utf *utf_InnerClasses;                  /* InnerClasses                       */
82 utf *utf_ConstantValue;                 /* ConstantValue                      */
83 utf *utf_Code;                          /* Code                               */
84 utf *utf_Exceptions;                    /* Exceptions                         */
85 utf *utf_LineNumberTable;               /* LineNumberTable                    */
86 utf *utf_SourceFile;                    /* SourceFile                         */
87
88 utf *utf_init;                          /* <init>                             */
89 utf *utf_clinit;                        /* <clinit>                           */
90 utf *utf_finalize;                      /* finalize                           */
91
92 utf *utf_printStackTrace;
93 utf *utf_fillInStackTrace;
94 utf *utf_loadClass;
95
96 utf *utf_void__void;                    /* ()V                                */
97 utf *utf_boolean__void;                 /* (Z)V                               */
98 utf *utf_byte__void;                    /* (B)V                               */
99 utf *utf_char__void;                    /* (C)V                               */
100 utf *utf_short__void;                   /* (S)V                               */
101 utf *utf_int__void;                     /* (I)V                               */
102 utf *utf_long__void;                    /* (J)V                               */
103 utf *utf_float__void;                   /* (F)V                               */
104 utf *utf_double__void;                  /* (D)V                               */
105 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
106 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
107 utf *utf_java_lang_String__java_lang_Class;
108 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
109
110 utf *array_packagename;
111
112
113 /* utf_init ********************************************************************
114
115    Initializes the utf8 subsystem.
116
117 *******************************************************************************/
118
119 void utf8_init(void)
120 {
121         /* create utf-symbols for pointer comparison of frequently used strings */
122
123         utf_java_lang_Object           = utf_new_char("java/lang/Object");
124
125         utf_java_lang_Class            = utf_new_char("java/lang/Class");
126         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
127         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
128         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
129         utf_java_lang_String           = utf_new_char("java/lang/String");
130         utf_java_lang_System           = utf_new_char("java/lang/System");
131         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
132
133         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
134         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
135         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
136         utf_java_lang_Error            = utf_new_char("java/lang/Error");
137         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
138
139         utf_java_lang_Void             = utf_new_char("java/lang/Void");
140         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
141         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
142         utf_java_lang_Character        = utf_new_char("java/lang/Character");
143         utf_java_lang_Short            = utf_new_char("java/lang/Short");
144         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
145         utf_java_lang_Long             = utf_new_char("java/lang/Long");
146         utf_java_lang_Float            = utf_new_char("java/lang/Float");
147         utf_java_lang_Double           = utf_new_char("java/lang/Double");
148
149         utf_java_util_Vector           = utf_new_char("java/util/Vector");
150
151         utf_InnerClasses               = utf_new_char("InnerClasses");
152         utf_ConstantValue              = utf_new_char("ConstantValue");
153         utf_Code                       = utf_new_char("Code");
154         utf_Exceptions                 = utf_new_char("Exceptions");
155         utf_LineNumberTable            = utf_new_char("LineNumberTable");
156         utf_SourceFile                 = utf_new_char("SourceFile");
157
158         utf_init                           = utf_new_char("<init>");
159         utf_clinit                         = utf_new_char("<clinit>");
160         utf_finalize                   = utf_new_char("finalize");
161
162         utf_printStackTrace            = utf_new_char("printStackTrace");
163         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
164         utf_loadClass                  = utf_new_char("loadClass");
165
166         utf_void__void                 = utf_new_char("()V");
167         utf_boolean__void              = utf_new_char("(Z)V");
168         utf_byte__void                 = utf_new_char("(B)V");
169         utf_char__void                 = utf_new_char("(C)V");
170         utf_short__void                = utf_new_char("(S)V");
171         utf_int__void                  = utf_new_char("(I)V");
172         utf_long__void                 = utf_new_char("(J)V");
173         utf_float__void                = utf_new_char("(F)V");
174         utf_double__void               = utf_new_char("(D)V");
175         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
176         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
177
178         utf_java_lang_String__java_lang_Class =
179                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
180
181         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
182         array_packagename              = utf_new_char("\t<the array package>");
183 }
184
185
186 /* utf_hashkey *****************************************************************
187
188    The hashkey is computed from the utf-text by using up to 8
189    characters.  For utf-symbols longer than 15 characters 3 characters
190    are taken from the beginning and the end, 2 characters are taken
191    from the middle.
192
193 *******************************************************************************/
194
195 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
196 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
197
198 u4 utf_hashkey(const char *text, u4 length)
199 {
200         const char *start_pos = text;       /* pointer to utf text                */
201         u4 a;
202
203         switch (length) {
204         case 0: /* empty string */
205                 return 0;
206
207         case 1: return fbs(0);
208         case 2: return fbs(0) ^ nbs(3);
209         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
210         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
211         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
212         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
213         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
214         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
215
216         case 9:
217                 a = fbs(0);
218                 a ^= nbs(1);
219                 a ^= nbs(2);
220                 text++;
221                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
222
223         case 10:
224                 a = fbs(0);
225                 text++;
226                 a ^= nbs(2);
227                 a ^= nbs(3);
228                 a ^= nbs(4);
229                 text++;
230                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
231
232         case 11:
233                 a = fbs(0);
234                 text++;
235                 a ^= nbs(2);
236                 a ^= nbs(3);
237                 a ^= nbs(4);
238                 text++;
239                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
240
241         case 12:
242                 a = fbs(0);
243                 text += 2;
244                 a ^= nbs(2);
245                 a ^= nbs(3);
246                 text++;
247                 a ^= nbs(5);
248                 a ^= nbs(6);
249                 a ^= nbs(7);
250                 text++;
251                 return a ^ nbs(9) ^ nbs(10);
252
253         case 13:
254                 a = fbs(0);
255                 a ^= nbs(1);
256                 text++;
257                 a ^= nbs(3);
258                 a ^= nbs(4);
259                 text += 2;      
260                 a ^= nbs(7);
261                 a ^= nbs(8);
262                 text += 2;
263                 return a ^ nbs(9) ^ nbs(10);
264
265         case 14:
266                 a = fbs(0);
267                 text += 2;      
268                 a ^= nbs(3);
269                 a ^= nbs(4);
270                 text += 2;      
271                 a ^= nbs(7);
272                 a ^= nbs(8);
273                 text += 2;
274                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
275
276         case 15:
277                 a = fbs(0);
278                 text += 2;      
279                 a ^= nbs(3);
280                 a ^= nbs(4);
281                 text += 2;      
282                 a ^= nbs(7);
283                 a ^= nbs(8);
284                 text += 2;
285                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
286
287         default:  /* 3 characters from beginning */
288                 a = fbs(0);
289                 text += 2;
290                 a ^= nbs(3);
291                 a ^= nbs(4);
292
293                 /* 2 characters from middle */
294                 text = start_pos + (length / 2);
295                 a ^= fbs(5);
296                 text += 2;
297                 a ^= nbs(6);    
298
299                 /* 3 characters from end */
300                 text = start_pos + length - 4;
301
302                 a ^= fbs(7);
303                 text++;
304
305                 return a ^ nbs(10) ^ nbs(11);
306     }
307 }
308
309
310 /* utf_hashkey *****************************************************************
311
312    Compute the hashkey of a unicode string.
313
314 *******************************************************************************/
315
316 u4 unicode_hashkey(u2 *text, u2 len)
317 {
318         return utf_hashkey((char *) text, len);
319 }
320
321
322 /* utf_new *********************************************************************
323
324    Creates a new utf-symbol, the text of the symbol is passed as a
325    u1-array. The function searches the utf-hashtable for a utf-symbol
326    with this text. On success the element returned, otherwise a new
327    hashtable element is created.
328
329    If the number of entries in the hashtable exceeds twice the size of
330    the hashtable slots a reorganization of the hashtable is done and
331    the utf symbols are copied to a new hashtable with doubled size.
332
333 *******************************************************************************/
334
335 utf *utf_new_intern(const char *text, u2 length);
336
337 utf *utf_new(const char *text, u2 length)
338 {
339     utf *r;
340
341 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
342     tables_lock();
343 #endif
344
345     r = utf_new_intern(text, length);
346
347 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
348     tables_unlock();
349 #endif
350
351     return r;
352 }
353
354
355 utf *utf_new_intern(const char *text, u2 length)
356 {
357         u4 key;                             /* hashkey computed from utf-text     */
358         u4 slot;                            /* slot in hashtable                  */
359         utf *u;                             /* hashtable element                  */
360         u2 i;
361
362 #ifdef STATISTICS
363         if (opt_stat)
364                 count_utf_new++;
365 #endif
366
367         key  = utf_hashkey(text, length);
368         slot = key & (utf_hash.size - 1);
369         u    = utf_hash.ptr[slot];
370
371         /* search external hash chain for utf-symbol */
372         while (u) {
373                 if (u->blength == length) {
374
375                         /* compare text of hashtable elements */
376                         for (i = 0; i < length; i++)
377                                 if (text[i] != u->text[i]) goto nomatch;
378                         
379 #ifdef STATISTICS
380                         if (opt_stat)
381                                 count_utf_new_found++;
382 #endif
383
384                         /* symbol found in hashtable */
385                         return u;
386                 }
387         nomatch:
388                 u = u->hashlink; /* next element in external chain */
389         }
390
391 #ifdef STATISTICS
392         if (opt_stat)
393                 count_utf_len += sizeof(utf) + length;
394 #endif
395
396         /* location in hashtable found, create new utf element */
397         u = NEW(utf);
398         u->blength  = length;               /* length in bytes of utfstring       */
399         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
400         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
401         memcpy(u->text, text, length);      /* copy utf-text                      */
402         u->text[length] = '\0';
403         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
404
405         utf_hash.entries++;                 /* update number of entries           */
406
407         if (utf_hash.entries > (utf_hash.size * 2)) {
408
409         /* reorganization of hashtable, average length of 
410            the external chains is approx. 2                */  
411
412                 u4 i;
413                 utf *u;
414                 hashtable newhash; /* the new hashtable */
415
416                 /* create new hashtable, double the size */
417                 init_hashtable(&newhash, utf_hash.size * 2);
418                 newhash.entries = utf_hash.entries;
419
420 #ifdef STATISTICS
421                 if (opt_stat)
422                         count_utf_len += sizeof(utf*) * utf_hash.size;
423 #endif
424
425                 /* transfer elements to new hashtable */
426                 for (i = 0; i < utf_hash.size; i++) {
427                         u = (utf *) utf_hash.ptr[i];
428                         while (u) {
429                                 utf *nextu = u->hashlink;
430                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
431                                                 
432                                 u->hashlink = (utf *) newhash.ptr[slot];
433                                 newhash.ptr[slot] = u;
434
435                                 /* follow link in external hash chain */
436                                 u = nextu;
437                         }
438                 }
439         
440                 /* dispose old table */
441                 MFREE(utf_hash.ptr, void*, utf_hash.size);
442                 utf_hash = newhash;
443         }
444
445         return u;
446 }
447
448
449 /* utf_new_u2 ******************************************************************
450
451    Make utf symbol from u2 array, if isclassname is true '.' is
452    replaced by '/'.
453
454 *******************************************************************************/
455
456 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
457 {
458         char *buffer;                   /* memory buffer for  unicode characters  */
459         char *pos;                      /* pointer to current position in buffer  */
460         u4 left;                        /* unicode characters left                */
461         u4 buflength;                   /* utf length in bytes of the u2 array    */
462         utf *result;                    /* resulting utf-string                   */
463         int i;          
464
465         /* determine utf length in bytes and allocate memory */
466
467         buflength = u2_utflength(unicode_pos, unicode_length); 
468         buffer    = MNEW(char, buflength);
469  
470         left = buflength;
471         pos  = buffer;
472
473         for (i = 0; i++ < unicode_length; unicode_pos++) {
474                 /* next unicode character */
475                 u2 c = *unicode_pos;
476                 
477                 if ((c != 0) && (c < 0x80)) {
478                         /* 1 character */       
479                         left--;
480                 if ((int) left < 0) break;
481                         /* convert classname */
482                         if (isclassname && c == '.')
483                                 *pos++ = '/';
484                         else
485                                 *pos++ = (char) c;
486
487                 } else if (c < 0x800) {             
488                         /* 2 characters */                              
489                 unsigned char high = c >> 6;
490                 unsigned char low  = c & 0x3F;
491                         left = left - 2;
492                 if ((int) left < 0) break;
493                 *pos++ = high | 0xC0; 
494                 *pos++ = low  | 0x80;     
495
496                 } else {         
497                 /* 3 characters */                              
498                 char low  = c & 0x3f;
499                 char mid  = (c >> 6) & 0x3F;
500                 char high = c >> 12;
501                         left = left - 3;
502                 if ((int) left < 0) break;
503                 *pos++ = high | 0xE0; 
504                 *pos++ = mid  | 0x80;  
505                 *pos++ = low  | 0x80;   
506                 }
507         }
508         
509         /* insert utf-string into symbol-table */
510         result = utf_new(buffer,buflength);
511
512         MFREE(buffer, char, buflength);
513
514         return result;
515 }
516
517
518 /* utf_new_char ****************************************************************
519
520    Creates a new utf symbol, the text for this symbol is passed as a
521    c-string ( = char* ).
522
523 *******************************************************************************/
524
525 utf *utf_new_char(const char *text)
526 {
527         return utf_new(text, strlen(text));
528 }
529
530
531 /* utf_new_char_classname ******************************************************
532
533    Creates a new utf symbol, the text for this symbol is passed as a
534    c-string ( = char* ) "." characters are going to be replaced by
535    "/". Since the above function is used often, this is a separte
536    function, instead of an if.
537
538 *******************************************************************************/
539
540 utf *utf_new_char_classname(const char *text)
541 {
542         if (strchr(text, '.')) {
543                 char *txt = strdup(text);
544                 char *end = txt + strlen(txt);
545                 char *c;
546                 utf *tmpRes;
547
548                 for (c = txt; c < end; c++)
549                         if (*c == '.') *c = '/';
550
551                 tmpRes = utf_new(txt, strlen(txt));
552                 FREE(txt, 0);
553
554                 return tmpRes;
555
556         } else
557                 return utf_new(text, strlen(text));
558 }
559
560
561 /* utf_nextu2 ******************************************************************
562
563    Read the next unicode character from the utf string and increment
564    the utf-string pointer accordingly.
565
566 *******************************************************************************/
567
568 u2 utf_nextu2(char **utf_ptr)
569 {
570     /* uncompressed unicode character */
571     u2 unicode_char = 0;
572     /* current position in utf text */  
573     unsigned char *utf = (unsigned char *) (*utf_ptr);
574     /* bytes representing the unicode character */
575     unsigned char ch1, ch2, ch3;
576     /* number of bytes used to represent the unicode character */
577     int len = 0;
578         
579     switch ((ch1 = utf[0]) >> 4) {
580         default: /* 1 byte */
581                 (*utf_ptr)++;
582                 return (u2) ch1;
583         case 0xC: 
584         case 0xD: /* 2 bytes */
585                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
586                         unsigned char high = ch1 & 0x1F;
587                         unsigned char low  = ch2 & 0x3F;
588                         unicode_char = (high << 6) + low;
589                         len = 2;
590                 }
591                 break;
592
593         case 0xE: /* 2 or 3 bytes */
594                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
595                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
596                                 unsigned char low  = ch3 & 0x3f;
597                                 unsigned char mid  = ch2 & 0x3f;
598                                 unsigned char high = ch1 & 0x0f;
599                                 unicode_char = (((high << 6) + mid) << 6) + low;
600                                 len = 3;
601                         } else
602                                 len = 2;                                           
603                 }
604                 break;
605     }
606
607     /* update position in utf-text */
608     *utf_ptr = (char *) (utf + len);
609
610     return unicode_char;
611 }
612
613
614 /* utf_strlen ******************************************************************
615
616    Determine number of unicode characters in the utf string.
617
618 *******************************************************************************/
619
620 u4 utf_strlen(utf *u)
621 {
622         char *endpos;                       /* points behind utf string           */
623         char *utf_ptr;                      /* current position in utf text       */
624         u4 len = 0;                         /* number of unicode characters       */
625
626         if (!u) {
627                 *exceptionptr = new_nullpointerexception();
628                 return 0;
629         }
630
631         endpos = utf_end(u);
632         utf_ptr = u->text;
633
634         while (utf_ptr < endpos) {
635                 len++;
636                 /* next unicode character */
637                 utf_nextu2(&utf_ptr);
638         }
639
640         if (utf_ptr != endpos)
641                 /* string ended abruptly */
642                 throw_cacao_exception_exit(string_java_lang_InternalError,
643                                                                    "Illegal utf8 string");
644
645         return len;
646 }
647
648
649 /* u2_utflength ****************************************************************
650
651    Returns the utf length in bytes of a u2 array.
652
653 *******************************************************************************/
654
655 u4 u2_utflength(u2 *text, u4 u2_length)
656 {
657         u4 result_len = 0;                  /* utf length in bytes                */
658         u2 ch;                              /* current unicode character          */
659         u4 len;
660         
661         for (len = 0; len < u2_length; len++) {
662                 /* next unicode character */
663                 ch = *text++;
664           
665                 /* determine bytes required to store unicode character as utf */
666                 if (ch && (ch < 0x80)) 
667                         result_len++;
668                 else if (ch < 0x800)
669                         result_len += 2;        
670                 else 
671                         result_len += 3;        
672         }
673
674     return result_len;
675 }
676
677
678 /* utf_display *****************************************************************
679
680    Write utf symbol to stdout (for debugging purposes).
681
682 *******************************************************************************/
683
684 void utf_display(utf *u)
685 {
686         char *endpos;                       /* points behind utf string           */
687         char *utf_ptr;                      /* current position in utf text       */
688
689         if (!u) {
690                 printf("NULL");
691                 fflush(stdout);
692                 return;
693         }
694
695         endpos = utf_end(u);
696         utf_ptr = u->text;
697
698         while (utf_ptr < endpos) {
699                 /* read next unicode character */                
700                 u2 c = utf_nextu2(&utf_ptr);
701                 if (c >= 32 && c <= 127) printf("%c", c);
702                 else printf("?");
703         }
704
705         fflush(stdout);
706 }
707
708
709 /* utf_display_classname *******************************************************
710
711    Write utf symbol to stdout with `/' converted to `.' (for debugging
712    purposes).
713
714 *******************************************************************************/
715
716 void utf_display_classname(utf *u)
717 {
718         char *endpos;                       /* points behind utf string           */
719         char *utf_ptr;                      /* current position in utf text       */
720
721         if (!u) {
722                 printf("NULL");
723                 fflush(stdout);
724                 return;
725         }
726
727         endpos = utf_end(u);
728         utf_ptr = u->text;
729
730         while (utf_ptr < endpos) {
731                 /* read next unicode character */                
732                 u2 c = utf_nextu2(&utf_ptr);
733                 if (c == '/') c = '.';
734                 if (c >= 32 && c <= 127) printf("%c", c);
735                 else printf("?");
736         }
737
738         fflush(stdout);
739 }
740
741
742 /* utf_sprint ******************************************************************
743         
744    Write utf symbol into c-string (for debugging purposes).
745
746 *******************************************************************************/
747
748 void utf_sprint(char *buffer, utf *u)
749 {
750         char *endpos;                       /* points behind utf string           */
751         char *utf_ptr;                      /* current position in utf text       */
752         u2 pos = 0;                         /* position in c-string               */
753
754         if (!u) {
755                 memcpy(buffer, "NULL", 5);      /* 4 chars + terminating \0           */
756                 return;
757         }
758
759         endpos = utf_end(u);
760         utf_ptr = u->text;
761
762         while (utf_ptr < endpos) 
763                 /* copy next unicode character */       
764                 buffer[pos++] = utf_nextu2(&utf_ptr);
765
766         /* terminate string */
767         buffer[pos] = '\0';
768 }
769
770
771 /* utf_sprint_classname ********************************************************
772         
773    Write utf symbol into c-string with `/' converted to `.' (for debugging
774    purposes).
775
776 *******************************************************************************/
777
778 void utf_sprint_classname(char *buffer, utf *u)
779 {
780         char *endpos;                       /* points behind utf string           */
781         char *utf_ptr;                      /* current position in utf text       */
782         u2 pos = 0;                         /* position in c-string               */
783
784         if (!u) {
785                 memcpy(buffer, "NULL", 5);      /* 4 chars + terminating \0           */
786                 return;
787         }
788
789         endpos = utf_end(u);
790         utf_ptr = u->text;
791
792         while (utf_ptr < endpos) {
793                 /* copy next unicode character */       
794                 u2 c = utf_nextu2(&utf_ptr);
795                 if (c == '/') c = '.';
796                 buffer[pos++] = c;
797         }
798
799         /* terminate string */
800         buffer[pos] = '\0';
801 }
802
803
804 /* utf_fprint ******************************************************************
805         
806    Write utf symbol into file.
807
808 *******************************************************************************/
809
810 void utf_fprint(FILE *file, utf *u)
811 {
812         char *endpos;                       /* points behind utf string           */
813         char *utf_ptr;                      /* current position in utf text       */
814
815         if (!u)
816                 return;
817
818         endpos = utf_end(u);
819         utf_ptr = u->text;
820
821         while (utf_ptr < endpos) { 
822                 /* read next unicode character */                
823                 u2 c = utf_nextu2(&utf_ptr);                            
824
825                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
826                 else fprintf(file, "?");
827         }
828 }
829
830
831 /* utf_fprint_classname ********************************************************
832         
833    Write utf symbol into file with `/' converted to `.'.
834
835 *******************************************************************************/
836
837 void utf_fprint_classname(FILE *file, utf *u)
838 {
839         char *endpos;                       /* points behind utf string           */
840         char *utf_ptr;                      /* current position in utf text       */
841
842     if (!u)
843                 return;
844
845         endpos = utf_end(u);
846         utf_ptr = u->text;
847
848         while (utf_ptr < endpos) { 
849                 /* read next unicode character */                
850                 u2 c = utf_nextu2(&utf_ptr);                            
851                 if (c == '/') c = '.';
852
853                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
854                 else fprintf(file, "?");
855         }
856 }
857
858
859 /* is_valid_utf ****************************************************************
860
861    Return true if the given string is a valid UTF-8 string.
862
863    utf_ptr...points to first character
864    end_pos...points after last character
865
866 *******************************************************************************/
867
868 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
869
870 bool is_valid_utf(char *utf_ptr, char *end_pos)
871 {
872         int bytes;
873         int len,i;
874         char c;
875         unsigned long v;
876
877         if (end_pos < utf_ptr) return false;
878         bytes = end_pos - utf_ptr;
879         while (bytes--) {
880                 c = *utf_ptr++;
881
882                 if (!c) return false;                     /* 0x00 is not allowed */
883                 if ((c & 0x80) == 0) continue;            /* ASCII */
884
885                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
886                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
887                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
888                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
889                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
890                 else return false;                        /* invalid leading byte */
891
892                 if (len > 2) return false;                /* Java limitation */
893
894                 v = (unsigned long)c & (0x3f >> len);
895                 
896                 if ((bytes -= len) < 0) return false;     /* missing bytes */
897
898                 for (i = len; i--; ) {
899                         c = *utf_ptr++;
900                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
901                                 return false;
902                         v = (v << 6) | (c & 0x3f);
903                 }
904
905                 if (v == 0) {
906                         if (len != 1) return false;           /* Java special */
907
908                 } else {
909                         /* Sun Java seems to allow overlong UTF-8 encodings */
910                         
911                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
912                                 if (!opt_liberalutf)
913                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
914                                 /* XXX change this to panic? */
915                         }
916                 }
917
918                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
919                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
920
921                 /* even these seem to be allowed */
922                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
923         }
924
925         return true;
926 }
927
928
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the two-byte encoding of zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf! Bytes at or
   behind end_pos are never read, even if the string ends with a
   dangling 0xc0 byte.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                       /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20)
			return false;                   /* disallow control characters */

		/* disallow zero (0xc0 0x80); the second byte is only inspected
		   if it still belongs to the string */

		if (c == 0xc0 && utf_ptr < end_pos &&
			(unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
956
957 bool is_valid_name_utf(utf *u)
958 {
959         return is_valid_name(u->text,utf_end(u));
960 }
961
962
963 /* utf_show ********************************************************************
964
965    Writes the utf symbols in the utfhash to stdout and displays the
966    number of external hash chains grouped according to the chainlength
967    (for debugging purposes).
968
969 *******************************************************************************/
970
971 void utf_show(void)
972 {
973
974 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
975
976         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
977         u4 max_chainlength = 0;      /* maximum length of the chains */
978         u4 sum_chainlength = 0;      /* sum of the chainlengths */
979         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
980         u4 i;
981
982         printf ("UTF-HASH:\n");
983
984         /* show element of utf-hashtable */
985         for (i=0; i<utf_hash.size; i++) {
986                 utf *u = utf_hash.ptr[i];
987                 if (u) {
988                         printf ("SLOT %d: ", (int) i);
989                         while (u) {
990                                 printf ("'");
991                                 utf_display (u);
992                                 printf ("' ");
993                                 u = u->hashlink;
994                         }       
995                         printf ("\n");
996                 }
997                 
998         }
999
1000         printf ("UTF-HASH: %d slots for %d entries\n", 
1001                         (int) utf_hash.size, (int) utf_hash.entries );
1002
1003
1004         if (utf_hash.entries == 0)
1005                 return;
1006
1007         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1008
1009         for (i=0;i<CHAIN_LIMIT;i++)
1010                 chain_count[i]=0;
1011
1012         /* count numbers of hashchains according to their length */
1013         for (i=0; i<utf_hash.size; i++) {
1014                   
1015                 utf *u = (utf*) utf_hash.ptr[i];
1016                 u4 chain_length = 0;
1017
1018                 /* determine chainlength */
1019                 while (u) {
1020                         u = u->hashlink;
1021                         chain_length++;
1022                 }
1023
1024                 /* update sum of all chainlengths */
1025                 sum_chainlength+=chain_length;
1026
1027                 /* determine the maximum length of the chains */
1028                 if (chain_length>max_chainlength)
1029                         max_chainlength = chain_length;
1030
1031                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1032                 if (chain_length>=CHAIN_LIMIT) {
1033                         beyond_limit+=chain_length;
1034                         chain_length=CHAIN_LIMIT-1;
1035                 }
1036
1037                 /* update number of hashchains of current length */
1038                 chain_count[chain_length]++;
1039         }
1040
1041         /* display results */  
1042         for (i=1;i<CHAIN_LIMIT-1;i++) 
1043                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1044           
1045         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1046
1047
1048         printf("max. chainlength:%5d\n",max_chainlength);
1049
1050         /* avg. chainlength = sum of chainlengths / number of chains */
1051         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1052 }
1053
1054
1055 /*
1056  * These are local overrides for various environment variables in Emacs.
1057  * Please do not remove this and leave it at the end of the file, where
1058  * Emacs will automagically detect them.
1059  * ---------------------------------------------------------------------
1060  * Local variables:
1061  * mode: c
1062  * indent-tabs-mode: t
1063  * c-basic-offset: 4
1064  * tab-width: 4
1065  * End:
1066  */