Use proper class load and link functions.
[cacao.git] / src / vm / utf8.c
1 /* src/vm/utf8.c - utf functions
2
3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
6    Institut f. Computersprachen - TU Wien
7
8    This file is part of CACAO.
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License as
12    published by the Free Software Foundation; either version 2, or (at
13    your option) any later version.
14
15    This program is distributed in the hope that it will be useful, but
16    WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23    02111-1307, USA.
24
25    Contact: cacao@complang.tuwien.ac.at
26
27    Authors: Reinhard Grafl
28
29    Changes: Mark Probst
30             Andreas Krall
31             Christian Thalinger
32
33    $Id: utf8.c 2148 2005-03-30 16:49:40Z twisti $
34
35 */
36
37
38 #include <string.h>
39
40 #include "mm/memory.h"
41 #include "vm/exceptions.h"
42 #include "vm/options.h"
43 #include "vm/statistics.h"
44 #include "vm/tables.h"
45 #include "vm/utf8.h"
46
47
48 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
49
50
51 /* utf-symbols for pointer comparison of frequently used strings **************/
52
53 utf *utf_java_lang_Object;              /* java/lang/Object                   */
54
55 utf *utf_java_lang_Class;
56 utf *utf_java_lang_ClassLoader;
57 utf *utf_java_lang_Cloneable;
58 utf *utf_java_lang_SecurityManager;
59 utf *utf_java_lang_String;
60 utf *utf_java_lang_System;
61 utf *utf_java_io_Serializable;
62
63 utf *utf_java_lang_Throwable;
64 utf *utf_java_lang_VMThrowable;
65 utf *utf_java_lang_Exception;
66 utf *utf_java_lang_Error;
67 utf *utf_java_lang_OutOfMemoryError;
68
69 utf* utf_java_lang_Void;
70 utf* utf_java_lang_Boolean;
71 utf* utf_java_lang_Byte;
72 utf* utf_java_lang_Character;
73 utf* utf_java_lang_Short;
74 utf* utf_java_lang_Integer;
75 utf* utf_java_lang_Long;
76 utf* utf_java_lang_Float;
77 utf* utf_java_lang_Double;
78
79 utf *utf_java_util_Vector;
80
81 utf *utf_InnerClasses;                  /* InnerClasses                       */
82 utf *utf_ConstantValue;                 /* ConstantValue                      */
83 utf *utf_Code;                          /* Code                               */
84 utf *utf_Exceptions;                    /* Exceptions                         */
85 utf *utf_LineNumberTable;               /* LineNumberTable                    */
86 utf *utf_SourceFile;                    /* SourceFile                         */
87
88 utf *utf_init;                          /* <init>                             */
89 utf *utf_clinit;                        /* <clinit>                           */
90 utf *utf_finalize;                      /* finalize                           */
91
92 utf *utf_printStackTrace;
93 utf *utf_fillInStackTrace;
94 utf *utf_loadClass;
95
96 utf *utf_void__void;                    /* ()V                                */
97 utf *utf_boolean__void;                 /* (Z)V                               */
98 utf *utf_byte__void;                    /* (B)V                               */
99 utf *utf_char__void;                    /* (C)V                               */
100 utf *utf_short__void;                   /* (S)V                               */
101 utf *utf_int__void;                     /* (I)V                               */
102 utf *utf_long__void;                    /* (J)V                               */
103 utf *utf_float__void;                   /* (F)V                               */
104 utf *utf_double__void;                  /* (D)V                               */
105 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
106 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
107 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
108 utf *utf_java_lang_String__java_lang_Class;
109 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
110
111 utf *array_packagename;
112
113
114 /* utf_init ********************************************************************
115
116    Initializes the utf8 subsystem.
117
118 *******************************************************************************/
119
120 void utf8_init(void)
121 {
122         /* create utf-symbols for pointer comparison of frequently used strings */
123
124         utf_java_lang_Object           = utf_new_char("java/lang/Object");
125
126         utf_java_lang_Class            = utf_new_char("java/lang/Class");
127         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
128         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
129         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
130         utf_java_lang_String           = utf_new_char("java/lang/String");
131         utf_java_lang_System           = utf_new_char("java/lang/System");
132         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
133
134         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
135         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
136         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
137         utf_java_lang_Error            = utf_new_char("java/lang/Error");
138         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
139
140         utf_java_lang_Void             = utf_new_char("java/lang/Void");
141         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
142         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
143         utf_java_lang_Character        = utf_new_char("java/lang/Character");
144         utf_java_lang_Short            = utf_new_char("java/lang/Short");
145         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
146         utf_java_lang_Long             = utf_new_char("java/lang/Long");
147         utf_java_lang_Float            = utf_new_char("java/lang/Float");
148         utf_java_lang_Double           = utf_new_char("java/lang/Double");
149
150         utf_java_util_Vector           = utf_new_char("java/util/Vector");
151
152         utf_InnerClasses               = utf_new_char("InnerClasses");
153         utf_ConstantValue              = utf_new_char("ConstantValue");
154         utf_Code                       = utf_new_char("Code");
155         utf_Exceptions                 = utf_new_char("Exceptions");
156         utf_LineNumberTable            = utf_new_char("LineNumberTable");
157         utf_SourceFile                 = utf_new_char("SourceFile");
158
159         utf_init                           = utf_new_char("<init>");
160         utf_clinit                         = utf_new_char("<clinit>");
161         utf_finalize                   = utf_new_char("finalize");
162
163         utf_printStackTrace            = utf_new_char("printStackTrace");
164         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
165         utf_loadClass                  = utf_new_char("loadClass");
166
167         utf_void__void                 = utf_new_char("()V");
168         utf_boolean__void              = utf_new_char("(Z)V");
169         utf_byte__void                 = utf_new_char("(B)V");
170         utf_char__void                 = utf_new_char("(C)V");
171         utf_short__void                = utf_new_char("(S)V");
172         utf_int__void                  = utf_new_char("(I)V");
173         utf_long__void                 = utf_new_char("(J)V");
174         utf_float__void                = utf_new_char("(F)V");
175         utf_double__void               = utf_new_char("(D)V");
176         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
177         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
178         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
179
180         utf_java_lang_String__java_lang_Class =
181                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
182
183         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
184
185         array_packagename              = utf_new_char("\t<the array package>");
186 }
187
188
189 /* utf_hashkey *****************************************************************
190
191    The hashkey is computed from the utf-text by using up to 8
192    characters.  For utf-symbols longer than 15 characters 3 characters
193    are taken from the beginning and the end, 2 characters are taken
194    from the middle.
195
196 *******************************************************************************/
197
198 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
199 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
200
201 u4 utf_hashkey(const char *text, u4 length)
202 {
203         const char *start_pos = text;       /* pointer to utf text                */
204         u4 a;
205
206         switch (length) {
207         case 0: /* empty string */
208                 return 0;
209
210         case 1: return fbs(0);
211         case 2: return fbs(0) ^ nbs(3);
212         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
213         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
214         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
215         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
216         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
217         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
218
219         case 9:
220                 a = fbs(0);
221                 a ^= nbs(1);
222                 a ^= nbs(2);
223                 text++;
224                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
225
226         case 10:
227                 a = fbs(0);
228                 text++;
229                 a ^= nbs(2);
230                 a ^= nbs(3);
231                 a ^= nbs(4);
232                 text++;
233                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
234
235         case 11:
236                 a = fbs(0);
237                 text++;
238                 a ^= nbs(2);
239                 a ^= nbs(3);
240                 a ^= nbs(4);
241                 text++;
242                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
243
244         case 12:
245                 a = fbs(0);
246                 text += 2;
247                 a ^= nbs(2);
248                 a ^= nbs(3);
249                 text++;
250                 a ^= nbs(5);
251                 a ^= nbs(6);
252                 a ^= nbs(7);
253                 text++;
254                 return a ^ nbs(9) ^ nbs(10);
255
256         case 13:
257                 a = fbs(0);
258                 a ^= nbs(1);
259                 text++;
260                 a ^= nbs(3);
261                 a ^= nbs(4);
262                 text += 2;      
263                 a ^= nbs(7);
264                 a ^= nbs(8);
265                 text += 2;
266                 return a ^ nbs(9) ^ nbs(10);
267
268         case 14:
269                 a = fbs(0);
270                 text += 2;      
271                 a ^= nbs(3);
272                 a ^= nbs(4);
273                 text += 2;      
274                 a ^= nbs(7);
275                 a ^= nbs(8);
276                 text += 2;
277                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
278
279         case 15:
280                 a = fbs(0);
281                 text += 2;      
282                 a ^= nbs(3);
283                 a ^= nbs(4);
284                 text += 2;      
285                 a ^= nbs(7);
286                 a ^= nbs(8);
287                 text += 2;
288                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
289
290         default:  /* 3 characters from beginning */
291                 a = fbs(0);
292                 text += 2;
293                 a ^= nbs(3);
294                 a ^= nbs(4);
295
296                 /* 2 characters from middle */
297                 text = start_pos + (length / 2);
298                 a ^= fbs(5);
299                 text += 2;
300                 a ^= nbs(6);    
301
302                 /* 3 characters from end */
303                 text = start_pos + length - 4;
304
305                 a ^= fbs(7);
306                 text++;
307
308                 return a ^ nbs(10) ^ nbs(11);
309     }
310 }
311
312
313 /* utf_hashkey *****************************************************************
314
315    Compute the hashkey of a unicode string.
316
317 *******************************************************************************/
318
319 u4 unicode_hashkey(u2 *text, u2 len)
320 {
321         return utf_hashkey((char *) text, len);
322 }
323
324
325 /* utf_new *********************************************************************
326
327    Creates a new utf-symbol, the text of the symbol is passed as a
328    u1-array. The function searches the utf-hashtable for a utf-symbol
329    with this text. On success the element returned, otherwise a new
330    hashtable element is created.
331
332    If the number of entries in the hashtable exceeds twice the size of
333    the hashtable slots a reorganization of the hashtable is done and
334    the utf symbols are copied to a new hashtable with doubled size.
335
336 *******************************************************************************/
337
338 utf *utf_new_intern(const char *text, u2 length);
339
340 utf *utf_new(const char *text, u2 length)
341 {
342     utf *r;
343
344 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
345     tables_lock();
346 #endif
347
348     r = utf_new_intern(text, length);
349
350 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
351     tables_unlock();
352 #endif
353
354     return r;
355 }
356
357
358 utf *utf_new_intern(const char *text, u2 length)
359 {
360         u4 key;                             /* hashkey computed from utf-text     */
361         u4 slot;                            /* slot in hashtable                  */
362         utf *u;                             /* hashtable element                  */
363         u2 i;
364
365 #ifdef STATISTICS
366         if (opt_stat)
367                 count_utf_new++;
368 #endif
369
370         key  = utf_hashkey(text, length);
371         slot = key & (utf_hash.size - 1);
372         u    = utf_hash.ptr[slot];
373
374         /* search external hash chain for utf-symbol */
375         while (u) {
376                 if (u->blength == length) {
377
378                         /* compare text of hashtable elements */
379                         for (i = 0; i < length; i++)
380                                 if (text[i] != u->text[i]) goto nomatch;
381                         
382 #ifdef STATISTICS
383                         if (opt_stat)
384                                 count_utf_new_found++;
385 #endif
386
387                         /* symbol found in hashtable */
388                         return u;
389                 }
390         nomatch:
391                 u = u->hashlink; /* next element in external chain */
392         }
393
394 #ifdef STATISTICS
395         if (opt_stat)
396                 count_utf_len += sizeof(utf) + length;
397 #endif
398
399         /* location in hashtable found, create new utf element */
400         u = NEW(utf);
401         u->blength  = length;               /* length in bytes of utfstring       */
402         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
403         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
404         memcpy(u->text, text, length);      /* copy utf-text                      */
405         u->text[length] = '\0';
406         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
407
408         utf_hash.entries++;                 /* update number of entries           */
409
410         if (utf_hash.entries > (utf_hash.size * 2)) {
411
412         /* reorganization of hashtable, average length of 
413            the external chains is approx. 2                */  
414
415                 u4 i;
416                 utf *u;
417                 hashtable newhash; /* the new hashtable */
418
419                 /* create new hashtable, double the size */
420                 init_hashtable(&newhash, utf_hash.size * 2);
421                 newhash.entries = utf_hash.entries;
422
423 #ifdef STATISTICS
424                 if (opt_stat)
425                         count_utf_len += sizeof(utf*) * utf_hash.size;
426 #endif
427
428                 /* transfer elements to new hashtable */
429                 for (i = 0; i < utf_hash.size; i++) {
430                         u = (utf *) utf_hash.ptr[i];
431                         while (u) {
432                                 utf *nextu = u->hashlink;
433                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
434                                                 
435                                 u->hashlink = (utf *) newhash.ptr[slot];
436                                 newhash.ptr[slot] = u;
437
438                                 /* follow link in external hash chain */
439                                 u = nextu;
440                         }
441                 }
442         
443                 /* dispose old table */
444                 MFREE(utf_hash.ptr, void*, utf_hash.size);
445                 utf_hash = newhash;
446         }
447
448         return u;
449 }
450
451
452 /* utf_new_u2 ******************************************************************
453
454    Make utf symbol from u2 array, if isclassname is true '.' is
455    replaced by '/'.
456
457 *******************************************************************************/
458
459 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
460 {
461         char *buffer;                   /* memory buffer for  unicode characters  */
462         char *pos;                      /* pointer to current position in buffer  */
463         u4 left;                        /* unicode characters left                */
464         u4 buflength;                   /* utf length in bytes of the u2 array    */
465         utf *result;                    /* resulting utf-string                   */
466         int i;          
467
468         /* determine utf length in bytes and allocate memory */
469
470         buflength = u2_utflength(unicode_pos, unicode_length); 
471         buffer    = MNEW(char, buflength);
472  
473         left = buflength;
474         pos  = buffer;
475
476         for (i = 0; i++ < unicode_length; unicode_pos++) {
477                 /* next unicode character */
478                 u2 c = *unicode_pos;
479                 
480                 if ((c != 0) && (c < 0x80)) {
481                         /* 1 character */       
482                         left--;
483                 if ((int) left < 0) break;
484                         /* convert classname */
485                         if (isclassname && c == '.')
486                                 *pos++ = '/';
487                         else
488                                 *pos++ = (char) c;
489
490                 } else if (c < 0x800) {             
491                         /* 2 characters */                              
492                 unsigned char high = c >> 6;
493                 unsigned char low  = c & 0x3F;
494                         left = left - 2;
495                 if ((int) left < 0) break;
496                 *pos++ = high | 0xC0; 
497                 *pos++ = low  | 0x80;     
498
499                 } else {         
500                 /* 3 characters */                              
501                 char low  = c & 0x3f;
502                 char mid  = (c >> 6) & 0x3F;
503                 char high = c >> 12;
504                         left = left - 3;
505                 if ((int) left < 0) break;
506                 *pos++ = high | 0xE0; 
507                 *pos++ = mid  | 0x80;  
508                 *pos++ = low  | 0x80;   
509                 }
510         }
511         
512         /* insert utf-string into symbol-table */
513         result = utf_new(buffer,buflength);
514
515         MFREE(buffer, char, buflength);
516
517         return result;
518 }
519
520
521 /* utf_new_char ****************************************************************
522
523    Creates a new utf symbol, the text for this symbol is passed as a
524    c-string ( = char* ).
525
526 *******************************************************************************/
527
528 utf *utf_new_char(const char *text)
529 {
530         return utf_new(text, strlen(text));
531 }
532
533
534 /* utf_new_char_classname ******************************************************
535
536    Creates a new utf symbol, the text for this symbol is passed as a
537    c-string ( = char* ) "." characters are going to be replaced by
538    "/". Since the above function is used often, this is a separte
539    function, instead of an if.
540
541 *******************************************************************************/
542
543 utf *utf_new_char_classname(const char *text)
544 {
545         if (strchr(text, '.')) {
546                 char *txt = strdup(text);
547                 char *end = txt + strlen(txt);
548                 char *c;
549                 utf *tmpRes;
550
551                 for (c = txt; c < end; c++)
552                         if (*c == '.') *c = '/';
553
554                 tmpRes = utf_new(txt, strlen(txt));
555                 FREE(txt, 0);
556
557                 return tmpRes;
558
559         } else
560                 return utf_new(text, strlen(text));
561 }
562
563
564 /* utf_nextu2 ******************************************************************
565
566    Read the next unicode character from the utf string and increment
567    the utf-string pointer accordingly.
568
569 *******************************************************************************/
570
571 u2 utf_nextu2(char **utf_ptr)
572 {
573     /* uncompressed unicode character */
574     u2 unicode_char = 0;
575     /* current position in utf text */  
576     unsigned char *utf = (unsigned char *) (*utf_ptr);
577     /* bytes representing the unicode character */
578     unsigned char ch1, ch2, ch3;
579     /* number of bytes used to represent the unicode character */
580     int len = 0;
581         
582     switch ((ch1 = utf[0]) >> 4) {
583         default: /* 1 byte */
584                 (*utf_ptr)++;
585                 return (u2) ch1;
586         case 0xC: 
587         case 0xD: /* 2 bytes */
588                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
589                         unsigned char high = ch1 & 0x1F;
590                         unsigned char low  = ch2 & 0x3F;
591                         unicode_char = (high << 6) + low;
592                         len = 2;
593                 }
594                 break;
595
596         case 0xE: /* 2 or 3 bytes */
597                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
598                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
599                                 unsigned char low  = ch3 & 0x3f;
600                                 unsigned char mid  = ch2 & 0x3f;
601                                 unsigned char high = ch1 & 0x0f;
602                                 unicode_char = (((high << 6) + mid) << 6) + low;
603                                 len = 3;
604                         } else
605                                 len = 2;                                           
606                 }
607                 break;
608     }
609
610     /* update position in utf-text */
611     *utf_ptr = (char *) (utf + len);
612
613     return unicode_char;
614 }
615
616
617 /* utf_strlen ******************************************************************
618
619    Determine number of unicode characters in the utf string.
620
621 *******************************************************************************/
622
623 u4 utf_strlen(utf *u)
624 {
625         char *endpos;                       /* points behind utf string           */
626         char *utf_ptr;                      /* current position in utf text       */
627         u4 len = 0;                         /* number of unicode characters       */
628
629         if (!u) {
630                 *exceptionptr = new_nullpointerexception();
631                 return 0;
632         }
633
634         endpos = utf_end(u);
635         utf_ptr = u->text;
636
637         while (utf_ptr < endpos) {
638                 len++;
639                 /* next unicode character */
640                 utf_nextu2(&utf_ptr);
641         }
642
643         if (utf_ptr != endpos)
644                 /* string ended abruptly */
645                 throw_cacao_exception_exit(string_java_lang_InternalError,
646                                                                    "Illegal utf8 string");
647
648         return len;
649 }
650
651
652 /* u2_utflength ****************************************************************
653
654    Returns the utf length in bytes of a u2 array.
655
656 *******************************************************************************/
657
658 u4 u2_utflength(u2 *text, u4 u2_length)
659 {
660         u4 result_len = 0;                  /* utf length in bytes                */
661         u2 ch;                              /* current unicode character          */
662         u4 len;
663         
664         for (len = 0; len < u2_length; len++) {
665                 /* next unicode character */
666                 ch = *text++;
667           
668                 /* determine bytes required to store unicode character as utf */
669                 if (ch && (ch < 0x80)) 
670                         result_len++;
671                 else if (ch < 0x800)
672                         result_len += 2;        
673                 else 
674                         result_len += 3;        
675         }
676
677     return result_len;
678 }
679
680
681 /* utf_display *****************************************************************
682
683    Write utf symbol to stdout (for debugging purposes).
684
685 *******************************************************************************/
686
687 void utf_display(utf *u)
688 {
689         char *endpos;                       /* points behind utf string           */
690         char *utf_ptr;                      /* current position in utf text       */
691
692         if (!u) {
693                 printf("NULL");
694                 fflush(stdout);
695                 return;
696         }
697
698         endpos = utf_end(u);
699         utf_ptr = u->text;
700
701         while (utf_ptr < endpos) {
702                 /* read next unicode character */                
703                 u2 c = utf_nextu2(&utf_ptr);
704                 if (c >= 32 && c <= 127) printf("%c", c);
705                 else printf("?");
706         }
707
708         fflush(stdout);
709 }
710
711
712 /* utf_display_classname *******************************************************
713
714    Write utf symbol to stdout with `/' converted to `.' (for debugging
715    purposes).
716
717 *******************************************************************************/
718
719 void utf_display_classname(utf *u)
720 {
721         char *endpos;                       /* points behind utf string           */
722         char *utf_ptr;                      /* current position in utf text       */
723
724         if (!u) {
725                 printf("NULL");
726                 fflush(stdout);
727                 return;
728         }
729
730         endpos = utf_end(u);
731         utf_ptr = u->text;
732
733         while (utf_ptr < endpos) {
734                 /* read next unicode character */                
735                 u2 c = utf_nextu2(&utf_ptr);
736                 if (c == '/') c = '.';
737                 if (c >= 32 && c <= 127) printf("%c", c);
738                 else printf("?");
739         }
740
741         fflush(stdout);
742 }
743
744
745 /* utf_sprint ******************************************************************
746         
747    Write utf symbol into c-string (for debugging purposes).
748
749 *******************************************************************************/
750
751 void utf_sprint(char *buffer, utf *u)
752 {
753         char *endpos;                       /* points behind utf string           */
754         char *utf_ptr;                      /* current position in utf text       */
755         u2 pos = 0;                         /* position in c-string               */
756
757         if (!u) {
758                 strcpy(buffer, "NULL");
759                 return;
760         }
761
762         endpos = utf_end(u);
763         utf_ptr = u->text;
764
765         while (utf_ptr < endpos) 
766                 /* copy next unicode character */       
767                 buffer[pos++] = utf_nextu2(&utf_ptr);
768
769         /* terminate string */
770         buffer[pos] = '\0';
771 }
772
773
774 /* utf_sprint_classname ********************************************************
775         
776    Write utf symbol into c-string with `/' converted to `.' (for debugging
777    purposes).
778
779 *******************************************************************************/
780
781 void utf_sprint_classname(char *buffer, utf *u)
782 {
783         char *endpos;                       /* points behind utf string           */
784         char *utf_ptr;                      /* current position in utf text       */
785         u2 pos = 0;                         /* position in c-string               */
786
787         if (!u) {
788                 strcpy(buffer, "NULL");
789                 return;
790         }
791
792         endpos = utf_end(u);
793         utf_ptr = u->text;
794
795         while (utf_ptr < endpos) {
796                 /* copy next unicode character */       
797                 u2 c = utf_nextu2(&utf_ptr);
798                 if (c == '/') c = '.';
799                 buffer[pos++] = c;
800         }
801
802         /* terminate string */
803         buffer[pos] = '\0';
804 }
805
806
807 /* utf_strcat ******************************************************************
808         
809    Like libc strcat, but uses an utf8 string.
810
811 *******************************************************************************/
812
813 void utf_strcat(char *buffer, utf *u)
814 {
815         utf_sprint(buffer + strlen(buffer), u);
816 }
817
818
819 /* utf_strcat_classname ********************************************************
820         
821    Like libc strcat, but uses an utf8 string.
822
823 *******************************************************************************/
824
825 void utf_strcat_classname(char *buffer, utf *u)
826 {
827         utf_sprint_classname(buffer + strlen(buffer), u);
828 }
829
830
831 /* utf_fprint ******************************************************************
832         
833    Write utf symbol into file.
834
835 *******************************************************************************/
836
837 void utf_fprint(FILE *file, utf *u)
838 {
839         char *endpos;                       /* points behind utf string           */
840         char *utf_ptr;                      /* current position in utf text       */
841
842         if (!u)
843                 return;
844
845         endpos = utf_end(u);
846         utf_ptr = u->text;
847
848         while (utf_ptr < endpos) { 
849                 /* read next unicode character */                
850                 u2 c = utf_nextu2(&utf_ptr);                            
851
852                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
853                 else fprintf(file, "?");
854         }
855 }
856
857
858 /* utf_fprint_classname ********************************************************
859         
860    Write utf symbol into file with `/' converted to `.'.
861
862 *******************************************************************************/
863
864 void utf_fprint_classname(FILE *file, utf *u)
865 {
866         char *endpos;                       /* points behind utf string           */
867         char *utf_ptr;                      /* current position in utf text       */
868
869     if (!u)
870                 return;
871
872         endpos = utf_end(u);
873         utf_ptr = u->text;
874
875         while (utf_ptr < endpos) { 
876                 /* read next unicode character */                
877                 u2 c = utf_nextu2(&utf_ptr);                            
878                 if (c == '/') c = '.';
879
880                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
881                 else fprintf(file, "?");
882         }
883 }
884
885
886 /* is_valid_utf ****************************************************************
887
888    Return true if the given string is a valid UTF-8 string.
889
890    utf_ptr...points to first character
891    end_pos...points after last character
892
893 *******************************************************************************/
894
895 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
896
897 bool is_valid_utf(char *utf_ptr, char *end_pos)
898 {
899         int bytes;
900         int len,i;
901         char c;
902         unsigned long v;
903
904         if (end_pos < utf_ptr) return false;
905         bytes = end_pos - utf_ptr;
906         while (bytes--) {
907                 c = *utf_ptr++;
908
909                 if (!c) return false;                     /* 0x00 is not allowed */
910                 if ((c & 0x80) == 0) continue;            /* ASCII */
911
912                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
913                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
914                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
915                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
916                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
917                 else return false;                        /* invalid leading byte */
918
919                 if (len > 2) return false;                /* Java limitation */
920
921                 v = (unsigned long)c & (0x3f >> len);
922                 
923                 if ((bytes -= len) < 0) return false;     /* missing bytes */
924
925                 for (i = len; i--; ) {
926                         c = *utf_ptr++;
927                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
928                                 return false;
929                         v = (v << 6) | (c & 0x3f);
930                 }
931
932                 if (v == 0) {
933                         if (len != 1) return false;           /* Java special */
934
935                 } else {
936                         /* Sun Java seems to allow overlong UTF-8 encodings */
937                         
938                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
939                                 if (!opt_liberalutf)
940                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
941                                 /* XXX change this to panic? */
942                         }
943                 }
944
945                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
946                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
947
948                 /* even these seem to be allowed */
949                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
950         }
951
952         return true;
953 }
954
955
956 /* is_valid_name ***************************************************************
957
958    Return true if the given string may be used as a class/field/method
959    name. (Currently this only disallows empty strings and control
960    characters.)
961
962    NOTE: The string is assumed to have passed is_valid_utf!
963
964    utf_ptr...points to first character
965    end_pos...points after last character
966
967 *******************************************************************************/
968
bool is_valid_name(char *utf_ptr, char *end_pos)
{
	/* disallow empty names */
	if (end_pos <= utf_ptr)
		return false;

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		/* disallow control characters */
		if (c < 0x20)
			return false;

		/* Disallow an encoded zero (0xc0 0x80). The second byte is only
		   inspected when it still lies inside the given range, so a
		   trailing 0xc0 never causes a read at or behind end_pos. */
		if (c == 0xc0 && utf_ptr < end_pos && (unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
983
984 bool is_valid_name_utf(utf *u)
985 {
986         return is_valid_name(u->text,utf_end(u));
987 }
988
989
990 /* utf_show ********************************************************************
991
992    Writes the utf symbols in the utfhash to stdout and displays the
993    number of external hash chains grouped according to the chainlength
994    (for debugging purposes).
995
996 *******************************************************************************/
997
998 void utf_show(void)
999 {
1000
1001 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1002
1003         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1004         u4 max_chainlength = 0;      /* maximum length of the chains */
1005         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1006         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1007         u4 i;
1008
1009         printf ("UTF-HASH:\n");
1010
1011         /* show element of utf-hashtable */
1012         for (i=0; i<utf_hash.size; i++) {
1013                 utf *u = utf_hash.ptr[i];
1014                 if (u) {
1015                         printf ("SLOT %d: ", (int) i);
1016                         while (u) {
1017                                 printf ("'");
1018                                 utf_display (u);
1019                                 printf ("' ");
1020                                 u = u->hashlink;
1021                         }       
1022                         printf ("\n");
1023                 }
1024                 
1025         }
1026
1027         printf ("UTF-HASH: %d slots for %d entries\n", 
1028                         (int) utf_hash.size, (int) utf_hash.entries );
1029
1030
1031         if (utf_hash.entries == 0)
1032                 return;
1033
1034         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1035
1036         for (i=0;i<CHAIN_LIMIT;i++)
1037                 chain_count[i]=0;
1038
1039         /* count numbers of hashchains according to their length */
1040         for (i=0; i<utf_hash.size; i++) {
1041                   
1042                 utf *u = (utf*) utf_hash.ptr[i];
1043                 u4 chain_length = 0;
1044
1045                 /* determine chainlength */
1046                 while (u) {
1047                         u = u->hashlink;
1048                         chain_length++;
1049                 }
1050
1051                 /* update sum of all chainlengths */
1052                 sum_chainlength+=chain_length;
1053
1054                 /* determine the maximum length of the chains */
1055                 if (chain_length>max_chainlength)
1056                         max_chainlength = chain_length;
1057
1058                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1059                 if (chain_length>=CHAIN_LIMIT) {
1060                         beyond_limit+=chain_length;
1061                         chain_length=CHAIN_LIMIT-1;
1062                 }
1063
1064                 /* update number of hashchains of current length */
1065                 chain_count[chain_length]++;
1066         }
1067
1068         /* display results */  
1069         for (i=1;i<CHAIN_LIMIT-1;i++) 
1070                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1071           
1072         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1073
1074
1075         printf("max. chainlength:%5d\n",max_chainlength);
1076
1077         /* avg. chainlength = sum of chainlengths / number of chains */
1078         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1079 }
1080
1081
1082 /*
1083  * These are local overrides for various environment variables in Emacs.
1084  * Please do not remove this and leave it at the end of the file, where
1085  * Emacs will automagically detect them.
1086  * ---------------------------------------------------------------------
1087  * Local variables:
1088  * mode: c
1089  * indent-tabs-mode: t
1090  * c-basic-offset: 4
1091  * tab-width: 4
1092  * End:
1093  */