src/vm/utf8.c

   1 /* src/vm/utf8.c - utf8 string functions
   2
   3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
   4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
   5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
   6    Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  23    02111-1307, USA.
  24
  25    Contact: cacao@complang.tuwien.ac.at
  26
  27    Authors: Reinhard Grafl
  28
  29    Changes: Mark Probst
  30             Andreas Krall
  31             Christian Thalinger
  32
  33    $Id: utf8.c 3262 2005-09-21 20:02:49Z twisti $
  34
  35 */
  36
  37
  38 #include <string.h>
  39
  40 #include "mm/memory.h"
  41 #include "vm/exceptions.h"
  42 #include "vm/options.h"
  43 #include "vm/statistics.h"
  44 #include "vm/stringlocal.h"
  45 #include "vm/tables.h"
  46 #include "vm/utf8.h"
  47
  48
  49 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
  50
  51
  52 /* utf-symbols for pointer comparison of frequently used strings **************/
  53 /* every symbol below is assigned in utf8_init()                              */
  54 utf *utf_java_lang_Object;              /* java/lang/Object                   */
  55
  56 utf *utf_java_lang_Class;               /* java/lang/Class                    */
  57 utf *utf_java_lang_ClassLoader;         /* java/lang/ClassLoader              */
  58 utf *utf_java_lang_Cloneable;           /* java/lang/Cloneable                */
  59 utf *utf_java_lang_SecurityManager;     /* java/lang/SecurityManager          */
  60 utf *utf_java_lang_String;              /* java/lang/String                   */
  61 utf *utf_java_lang_System;              /* java/lang/System                   */
  62 utf *utf_java_lang_ThreadGroup;         /* java/lang/ThreadGroup              */
  63 utf *utf_java_io_Serializable;          /* java/io/Serializable               */
  64
  65 utf *utf_java_lang_Throwable;           /* string_java_lang_Throwable         */
  66 utf *utf_java_lang_VMThrowable;         /* string_java_lang_VMThrowable       */
  67 utf *utf_java_lang_Error;               /* string_java_lang_Error             */
  68 utf *utf_java_lang_Exception;           /* string_java_lang_Exception         */
  69 utf *utf_java_lang_NoClassDefFoundError; /* string_java_lang_NoClassDefFoundError */
  70 utf *utf_java_lang_OutOfMemoryError;    /* string_java_lang_OutOfMemoryError  */
  71 utf *utf_java_lang_ClassNotFoundException; /* string_java_lang_ClassNotFoundException */
  72
  73 utf* utf_java_lang_Void;                /* java/lang/Void                     */
  74 utf* utf_java_lang_Boolean;             /* java/lang/Boolean                  */
  75 utf* utf_java_lang_Byte;                /* java/lang/Byte                     */
  76 utf* utf_java_lang_Character;           /* java/lang/Character                */
  77 utf* utf_java_lang_Short;               /* java/lang/Short                    */
  78 utf* utf_java_lang_Integer;             /* java/lang/Integer                  */
  79 utf* utf_java_lang_Long;                /* java/lang/Long                     */
  80 utf* utf_java_lang_Float;               /* java/lang/Float                    */
  81 utf* utf_java_lang_Double;              /* java/lang/Double                   */
  82
  83 utf *utf_java_lang_StackTraceElement;   /* java/lang/StackTraceElement        */
  84 utf *utf_java_lang_reflect_Constructor; /* java/lang/reflect/Constructor      */
  85 utf *utf_java_lang_reflect_Field;       /* java/lang/reflect/Field            */
  86 utf *utf_java_lang_reflect_Method;      /* java/lang/reflect/Method           */
  87 utf *utf_java_util_Vector;              /* java/util/Vector                   */
  88
  89 utf *utf_InnerClasses;                  /* InnerClasses                       */
  90 utf *utf_ConstantValue;                 /* ConstantValue                      */
  91 utf *utf_Code;                          /* Code                               */
  92 utf *utf_Exceptions;                    /* Exceptions                         */
  93 utf *utf_LineNumberTable;               /* LineNumberTable                    */
  94 utf *utf_SourceFile;                    /* SourceFile                         */
  95
  96 utf *utf_init;                          /* <init>                             */
  97 utf *utf_clinit;                        /* <clinit>                           */
  98 utf *utf_clone;                         /* clone                              */
  99 utf *utf_finalize;                      /* finalize                           */
 100 utf *utf_run;                           /* run                                */
 101
 102 utf *utf_add;                           /* add                                */
 103
 104 utf *utf_fillInStackTrace;              /* fillInStackTrace                   */
 105 utf *utf_getSystemClassLoader;          /* getSystemClassLoader               */
 106 utf *utf_loadClass;                     /* loadClass                          */
 107 utf *utf_printStackTrace;               /* printStackTrace                    */
 108
 109 utf *utf_void__void;                    /* ()V                                */
 110 utf *utf_boolean__void;                 /* (Z)V                               */
 111 utf *utf_byte__void;                    /* (B)V                               */
 112 utf *utf_char__void;                    /* (C)V                               */
 113 utf *utf_short__void;                   /* (S)V                               */
 114 utf *utf_int__void;                     /* (I)V                               */
 115 utf *utf_long__void;                    /* (J)V                               */
 116 utf *utf_float__void;                   /* (F)V                               */
 117 utf *utf_double__void;                  /* (D)V                               */
 118
 119 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
 120 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 121 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 122 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 123 utf *utf_java_lang_String__java_lang_Class; /* (Ljava/lang/String;)Ljava/lang/Class; */
 124 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 125
 126 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
 127
 128 utf *array_packagename;                 /* "\t<the array package>"            */
 129
 130
 131 /* utf_init ********************************************************************
 132
 133    Initializes the utf8 subsystem.
 134
 135 *******************************************************************************/
 136
 137 void utf8_init(void)
 138 {
 139         /* create utf-symbols for pointer comparison of frequently used strings */
 140
 141         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 142
 143         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 144         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 145         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 146         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 147         utf_java_lang_String           = utf_new_char("java/lang/String");
 148         utf_java_lang_System           = utf_new_char("java/lang/System");
 149         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 150         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 151
 152         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
 153         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
 154         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
 155         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
 156
 157         utf_java_lang_NoClassDefFoundError =
 158                 utf_new_char(string_java_lang_NoClassDefFoundError);
 159
 160         utf_java_lang_OutOfMemoryError =
 161                 utf_new_char(string_java_lang_OutOfMemoryError);
 162
 163         utf_java_lang_ClassNotFoundException =
 164                 utf_new_char(string_java_lang_ClassNotFoundException);
 165
 166         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 167         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 168         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 169         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 170         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 171         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 172         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 173         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 174         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 175
 176         utf_java_lang_StackTraceElement =
 177                 utf_new_char("java/lang/StackTraceElement");
 178
 179         utf_java_lang_reflect_Constructor =
 180                 utf_new_char("java/lang/reflect/Constructor");
 181
 182         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
 183         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
 184         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 185
 186         utf_InnerClasses               = utf_new_char("InnerClasses");
 187         utf_ConstantValue              = utf_new_char("ConstantValue");
 188         utf_Code                       = utf_new_char("Code");
 189         utf_Exceptions                 = utf_new_char("Exceptions");
 190         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 191         utf_SourceFile                 = utf_new_char("SourceFile");
 192
 193         utf_init                           = utf_new_char("<init>");
 194         utf_clinit                         = utf_new_char("<clinit>");
 195         utf_clone                      = utf_new_char("clone");
 196         utf_finalize                   = utf_new_char("finalize");
 197         utf_run                        = utf_new_char("run");
 198
 199         utf_add                        = utf_new_char("add");
 200
 201         utf_printStackTrace            = utf_new_char("printStackTrace");
 202         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 203         utf_loadClass                  = utf_new_char("loadClass");
 204         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
 205
 206         utf_void__void                 = utf_new_char("()V");
 207         utf_boolean__void              = utf_new_char("(Z)V");
 208         utf_byte__void                 = utf_new_char("(B)V");
 209         utf_char__void                 = utf_new_char("(C)V");
 210         utf_short__void                = utf_new_char("(S)V");
 211         utf_int__void                  = utf_new_char("(I)V");
 212         utf_long__void                 = utf_new_char("(J)V");
 213         utf_float__void                = utf_new_char("(F)V");
 214         utf_double__void               = utf_new_char("(D)V");
 215         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 216         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 217
 218         utf_void__java_lang_ClassLoader =
 219                 utf_new_char("()Ljava/lang/ClassLoader;");
 220
 221         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 222
 223         utf_java_lang_String__java_lang_Class =
 224                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 225
 226         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 227
 228         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 229
 230         array_packagename              = utf_new_char("\t<the array package>");
 231 }
 232
 233
 234 /* utf_hashkey *****************************************************************
 235
 236    The hashkey is computed from the utf-text by using up to 8
 237    characters.  For utf-symbols longer than 15 characters 3 characters
 238    are taken from the beginning and the end, 2 characters are taken
 239    from the middle.
 240
 241 *******************************************************************************/
 242
 243 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 244 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 245
 246 u4 utf_hashkey(const char *text, u4 length)
 247 {
 248         const char *start_pos = text;       /* pointer to utf text                */
 249         u4 a;
 250
 251         switch (length) {
 252         case 0: /* empty string */
 253                 return 0;
 254
 255         case 1: return fbs(0);
 256         case 2: return fbs(0) ^ nbs(3);
 257         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 258         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 259         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 260         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 261         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 262         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 263
 264         case 9:
 265                 a = fbs(0);
 266                 a ^= nbs(1);
 267                 a ^= nbs(2);
 268                 text++;
 269                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 270
 271         case 10:
 272                 a = fbs(0);
 273                 text++;
 274                 a ^= nbs(2);
 275                 a ^= nbs(3);
 276                 a ^= nbs(4);
 277                 text++;
 278                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 279
 280         case 11:
 281                 a = fbs(0);
 282                 text++;
 283                 a ^= nbs(2);
 284                 a ^= nbs(3);
 285                 a ^= nbs(4);
 286                 text++;
 287                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 288
 289         case 12:
 290                 a = fbs(0);
 291                 text += 2;
 292                 a ^= nbs(2);
 293                 a ^= nbs(3);
 294                 text++;
 295                 a ^= nbs(5);
 296                 a ^= nbs(6);
 297                 a ^= nbs(7);
 298                 text++;
 299                 return a ^ nbs(9) ^ nbs(10);
 300
 301         case 13:
 302                 a = fbs(0);
 303                 a ^= nbs(1);
 304                 text++;
 305                 a ^= nbs(3);
 306                 a ^= nbs(4);
 307                 text += 2;
 308                 a ^= nbs(7);
 309                 a ^= nbs(8);
 310                 text += 2;
 311                 return a ^ nbs(9) ^ nbs(10);
 312
 313         case 14:
 314                 a = fbs(0);
 315                 text += 2;
 316                 a ^= nbs(3);
 317                 a ^= nbs(4);
 318                 text += 2;
 319                 a ^= nbs(7);
 320                 a ^= nbs(8);
 321                 text += 2;
 322                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 323
 324         case 15:
 325                 a = fbs(0);
 326                 text += 2;
 327                 a ^= nbs(3);
 328                 a ^= nbs(4);
 329                 text += 2;
 330                 a ^= nbs(7);
 331                 a ^= nbs(8);
 332                 text += 2;
 333                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 334
 335         default:  /* 3 characters from beginning */
 336                 a = fbs(0);
 337                 text += 2;
 338                 a ^= nbs(3);
 339                 a ^= nbs(4);
 340
 341                 /* 2 characters from middle */
 342                 text = start_pos + (length / 2);
 343                 a ^= fbs(5);
 344                 text += 2;
 345                 a ^= nbs(6);
 346
 347                 /* 3 characters from end */
 348                 text = start_pos + length - 4;
 349
 350                 a ^= fbs(7);
 351                 text++;
 352
 353                 return a ^ nbs(10) ^ nbs(11);
 354     }
 355 }
 356
 357
 358 /* utf_hashkey *****************************************************************
 359
 360    Compute the hashkey of a unicode string.
 361
 362 *******************************************************************************/
 363
 364 u4 unicode_hashkey(u2 *text, u2 len)
 365 {
 366         return utf_hashkey((char *) text, len);
 367 }
 368
 369
 370 /* utf_new *********************************************************************
 371
 372    Creates a new utf-symbol, the text of the symbol is passed as a
 373    u1-array. The function searches the utf-hashtable for a utf-symbol
 374    with this text. On success the element returned, otherwise a new
 375    hashtable element is created.
 376
 377    If the number of entries in the hashtable exceeds twice the size of
 378    the hashtable slots a reorganization of the hashtable is done and
 379    the utf symbols are copied to a new hashtable with doubled size.
 380
 381 *******************************************************************************/
 382
 383 utf *utf_new_intern(const char *text, u2 length);
 384
 385 utf *utf_new(const char *text, u2 length)
 386 {
 387     utf *r;
 388
 389 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 390     tables_lock();
 391 #endif
 392
 393     r = utf_new_intern(text, length);
 394
 395 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 396     tables_unlock();
 397 #endif
 398
 399     return r;
 400 }
 401
 402
 403 utf *utf_new_intern(const char *text, u2 length)
 404 {
 405         u4 key;                             /* hashkey computed from utf-text     */
 406         u4 slot;                            /* slot in hashtable                  */
 407         utf *u;                             /* hashtable element                  */
 408         u2 i;
 409
 410 #ifdef STATISTICS
 411         if (opt_stat)
 412                 count_utf_new++;
 413 #endif
 414
 415         key  = utf_hashkey(text, length);
 416         slot = key & (utf_hash.size - 1);
 417         u    = utf_hash.ptr[slot];
 418
 419         /* search external hash chain for utf-symbol */
 420         while (u) {
 421                 if (u->blength == length) {
 422
 423                         /* compare text of hashtable elements */
 424                         for (i = 0; i < length; i++)
 425                                 if (text[i] != u->text[i]) goto nomatch;
 426
 427 #ifdef STATISTICS
 428                         if (opt_stat)
 429                                 count_utf_new_found++;
 430 #endif
 431
 432                         /* symbol found in hashtable */
 433                         return u;
 434                 }
 435         nomatch:
 436                 u = u->hashlink; /* next element in external chain */
 437         }
 438
 439 #ifdef STATISTICS
 440         if (opt_stat)
 441                 count_utf_len += sizeof(utf) + length;
 442 #endif
 443
 444         /* location in hashtable found, create new utf element */
 445         u = NEW(utf);
 446         u->blength  = length;               /* length in bytes of utfstring       */
 447         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
 448         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 449         memcpy(u->text, text, length);      /* copy utf-text                      */
 450         u->text[length] = '\0';
 451         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
 452
 453         utf_hash.entries++;                 /* update number of entries           */
 454
 455         if (utf_hash.entries > (utf_hash.size * 2)) {
 456
 457         /* reorganization of hashtable, average length of
 458            the external chains is approx. 2                */
 459
 460                 u4 i;
 461                 utf *u;
 462                 hashtable newhash; /* the new hashtable */
 463
 464                 /* create new hashtable, double the size */
 465                 init_hashtable(&newhash, utf_hash.size * 2);
 466                 newhash.entries = utf_hash.entries;
 467
 468 #ifdef STATISTICS
 469                 if (opt_stat)
 470                         count_utf_len += sizeof(utf*) * utf_hash.size;
 471 #endif
 472
 473                 /* transfer elements to new hashtable */
 474                 for (i = 0; i < utf_hash.size; i++) {
 475                         u = (utf *) utf_hash.ptr[i];
 476                         while (u) {
 477                                 utf *nextu = u->hashlink;
 478                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
 479
 480                                 u->hashlink = (utf *) newhash.ptr[slot];
 481                                 newhash.ptr[slot] = u;
 482
 483                                 /* follow link in external hash chain */
 484                                 u = nextu;
 485                         }
 486                 }
 487
 488                 /* dispose old table */
 489                 MFREE(utf_hash.ptr, void*, utf_hash.size);
 490                 utf_hash = newhash;
 491         }
 492
 493         return u;
 494 }
 495
 496
 497 /* utf_new_u2 ******************************************************************
 498
 499    Make utf symbol from u2 array, if isclassname is true '.' is
 500    replaced by '/'.
 501
 502 *******************************************************************************/
 503
 504 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 505 {
 506         char *buffer;                   /* memory buffer for  unicode characters  */
 507         char *pos;                      /* pointer to current position in buffer  */
 508         u4 left;                        /* unicode characters left                */
 509         u4 buflength;                   /* utf length in bytes of the u2 array    */
 510         utf *result;                    /* resulting utf-string                   */
 511         int i;
 512
 513         /* determine utf length in bytes and allocate memory */
 514
 515         buflength = u2_utflength(unicode_pos, unicode_length);
 516         buffer    = MNEW(char, buflength);
 517
 518         left = buflength;
 519         pos  = buffer;
 520
 521         for (i = 0; i++ < unicode_length; unicode_pos++) {
 522                 /* next unicode character */
 523                 u2 c = *unicode_pos;
 524
 525                 if ((c != 0) && (c < 0x80)) {
 526                         /* 1 character */
 527                         left--;
 528                 if ((int) left < 0) break;
 529                         /* convert classname */
 530                         if (isclassname && c == '.')
 531                                 *pos++ = '/';
 532                         else
 533                                 *pos++ = (char) c;
 534
 535                 } else if (c < 0x800) {
 536                         /* 2 characters */
 537                 unsigned char high = c >> 6;
 538                 unsigned char low  = c & 0x3F;
 539                         left = left - 2;
 540                 if ((int) left < 0) break;
 541                 *pos++ = high | 0xC0;
 542                 *pos++ = low  | 0x80;
 543
 544                 } else {
 545                 /* 3 characters */
 546                 char low  = c & 0x3f;
 547                 char mid  = (c >> 6) & 0x3F;
 548                 char high = c >> 12;
 549                         left = left - 3;
 550                 if ((int) left < 0) break;
 551                 *pos++ = high | 0xE0;
 552                 *pos++ = mid  | 0x80;
 553                 *pos++ = low  | 0x80;
 554                 }
 555         }
 556
 557         /* insert utf-string into symbol-table */
 558         result = utf_new(buffer,buflength);
 559
 560         MFREE(buffer, char, buflength);
 561
 562         return result;
 563 }
 564
 565
 566 /* utf_new_char ****************************************************************
 567
 568    Creates a new utf symbol, the text for this symbol is passed as a
 569    c-string ( = char* ).
 570
 571 *******************************************************************************/
 572
 573 utf *utf_new_char(const char *text)
 574 {
 575         return utf_new(text, strlen(text));
 576 }
 577
 578
 579 /* utf_new_char_classname ******************************************************
 580
 581    Creates a new utf symbol, the text for this symbol is passed as a
 582    c-string ( = char* ) "." characters are going to be replaced by
 583    "/". Since the above function is used often, this is a separte
 584    function, instead of an if.
 585
 586 *******************************************************************************/
 587
 588 utf *utf_new_char_classname(const char *text)
 589 {
 590         if (strchr(text, '.')) {
 591                 char *txt = strdup(text);
 592                 char *end = txt + strlen(txt);
 593                 char *c;
 594                 utf *tmpRes;
 595
 596                 for (c = txt; c < end; c++)
 597                         if (*c == '.') *c = '/';
 598
 599                 tmpRes = utf_new(txt, strlen(txt));
 600                 FREE(txt, 0);
 601
 602                 return tmpRes;
 603
 604         } else
 605                 return utf_new(text, strlen(text));
 606 }
 607
 608
 609 /* utf_nextu2 ******************************************************************
 610
 611    Read the next unicode character from the utf string and increment
 612    the utf-string pointer accordingly.
 613
 614 *******************************************************************************/
 615
 616 u2 utf_nextu2(char **utf_ptr)
 617 {
 618     /* uncompressed unicode character */
 619     u2 unicode_char = 0;
 620     /* current position in utf text */
 621     unsigned char *utf = (unsigned char *) (*utf_ptr);
 622     /* bytes representing the unicode character */
 623     unsigned char ch1, ch2, ch3;
 624     /* number of bytes used to represent the unicode character */
 625     int len = 0;
 626
 627     switch ((ch1 = utf[0]) >> 4) {
 628         default: /* 1 byte */
 629                 (*utf_ptr)++;
 630                 return (u2) ch1;
 631         case 0xC:
 632         case 0xD: /* 2 bytes */
 633                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 634                         unsigned char high = ch1 & 0x1F;
 635                         unsigned char low  = ch2 & 0x3F;
 636                         unicode_char = (high << 6) + low;
 637                         len = 2;
 638                 }
 639                 break;
 640
 641         case 0xE: /* 2 or 3 bytes */
 642                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 643                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 644                                 unsigned char low  = ch3 & 0x3f;
 645                                 unsigned char mid  = ch2 & 0x3f;
 646                                 unsigned char high = ch1 & 0x0f;
 647                                 unicode_char = (((high << 6) + mid) << 6) + low;
 648                                 len = 3;
 649                         } else
 650                                 len = 2;
 651                 }
 652                 break;
 653     }
 654
 655     /* update position in utf-text */
 656     *utf_ptr = (char *) (utf + len);
 657
 658     return unicode_char;
 659 }
 660
 661
 662 /* utf_strlen ******************************************************************
 663
 664    Determine number of unicode characters in the utf string.
 665
 666 *******************************************************************************/
 667
 668 u4 utf_strlen(utf *u)
 669 {
 670         char *endpos;                       /* points behind utf string           */
 671         char *utf_ptr;                      /* current position in utf text       */
 672         u4 len = 0;                         /* number of unicode characters       */
 673
 674         if (!u) {
 675                 *exceptionptr = new_nullpointerexception();
 676                 return 0;
 677         }
 678
 679         endpos = UTF_END(u);
 680         utf_ptr = u->text;
 681
 682         while (utf_ptr < endpos) {
 683                 len++;
 684                 /* next unicode character */
 685                 utf_nextu2(&utf_ptr);
 686         }
 687
 688         if (utf_ptr != endpos)
 689                 /* string ended abruptly */
 690                 throw_cacao_exception_exit(string_java_lang_InternalError,
 691                                                                    "Illegal utf8 string");
 692
 693         return len;
 694 }
 695
 696
 697 /* u2_utflength ****************************************************************
 698
 699    Returns the utf length in bytes of a u2 array.
 700
 701 *******************************************************************************/
 702
 703 u4 u2_utflength(u2 *text, u4 u2_length)
 704 {
 705         u4 result_len = 0;                  /* utf length in bytes                */
 706         u2 ch;                              /* current unicode character          */
 707         u4 len;
 708
 709         for (len = 0; len < u2_length; len++) {
 710                 /* next unicode character */
 711                 ch = *text++;
 712
 713                 /* determine bytes required to store unicode character as utf */
 714                 if (ch && (ch < 0x80))
 715                         result_len++;
 716                 else if (ch < 0x800)
 717                         result_len += 2;
 718                 else
 719                         result_len += 3;
 720         }
 721
 722     return result_len;
 723 }
 724
 725
 726 /* utf_display *****************************************************************
 727
 728    Write utf symbol to stdout (for debugging purposes).
 729
 730 *******************************************************************************/
 731
 732 void utf_display(utf *u)
 733 {
 734         char *endpos;                       /* points behind utf string           */
 735         char *utf_ptr;                      /* current position in utf text       */
 736
 737         if (!u) {
 738                 printf("NULL");
 739                 fflush(stdout);
 740                 return;
 741         }
 742
 743         endpos = UTF_END(u);
 744         utf_ptr = u->text;
 745
 746         while (utf_ptr < endpos) {
 747                 /* read next unicode character */
 748                 u2 c = utf_nextu2(&utf_ptr);
 749                 if (c >= 32 && c <= 127) printf("%c", c);
 750                 else printf("?");
 751         }
 752
 753         fflush(stdout);
 754 }
 755
 756
 757 /* utf_display_classname *******************************************************
 758
 759    Write utf symbol to stdout with `/' converted to `.' (for debugging
 760    purposes).
 761
 762 *******************************************************************************/
 763
 764 void utf_display_classname(utf *u)
 765 {
 766         char *endpos;                       /* points behind utf string           */
 767         char *utf_ptr;                      /* current position in utf text       */
 768
 769         if (!u) {
 770                 printf("NULL");
 771                 fflush(stdout);
 772                 return;
 773         }
 774
 775         endpos = UTF_END(u);
 776         utf_ptr = u->text;
 777
 778         while (utf_ptr < endpos) {
 779                 /* read next unicode character */
 780                 u2 c = utf_nextu2(&utf_ptr);
 781                 if (c == '/') c = '.';
 782                 if (c >= 32 && c <= 127) printf("%c", c);
 783                 else printf("?");
 784         }
 785
 786         fflush(stdout);
 787 }
 788
 789
 790 /* utf_sprint ******************************************************************
 791
 792    Write utf symbol into c-string (for debugging purposes).
 793
 794 *******************************************************************************/
 795
 796 void utf_sprint(char *buffer, utf *u)
 797 {
 798         char *endpos;                       /* points behind utf string           */
 799         char *utf_ptr;                      /* current position in utf text       */
 800         u2 pos = 0;                         /* position in c-string               */
 801
 802         if (!u) {
 803                 strcpy(buffer, "NULL");
 804                 return;
 805         }
 806
 807         endpos = UTF_END(u);
 808         utf_ptr = u->text;
 809
 810         while (utf_ptr < endpos)
 811                 /* copy next unicode character */
 812                 buffer[pos++] = utf_nextu2(&utf_ptr);
 813
 814         /* terminate string */
 815         buffer[pos] = '\0';
 816 }
 817
 818
 819 /* utf_sprint_classname ********************************************************
 820
 821    Write utf symbol into c-string with `/' converted to `.' (for debugging
 822    purposes).
 823
 824 *******************************************************************************/
 825
 826 void utf_sprint_classname(char *buffer, utf *u)
 827 {
 828         char *endpos;                       /* points behind utf string           */
 829         char *utf_ptr;                      /* current position in utf text       */
 830         u2 pos = 0;                         /* position in c-string               */
 831
 832         if (!u) {
 833                 strcpy(buffer, "NULL");
 834                 return;
 835         }
 836
 837         endpos = UTF_END(u);
 838         utf_ptr = u->text;
 839
 840         while (utf_ptr < endpos) {
 841                 /* copy next unicode character */
 842                 u2 c = utf_nextu2(&utf_ptr);
 843                 if (c == '/') c = '.';
 844                 buffer[pos++] = c;
 845         }
 846
 847         /* terminate string */
 848         buffer[pos] = '\0';
 849 }
 850
 851
 852 /* utf_strcat ******************************************************************
 853
 854    Like libc strcat, but uses an utf8 string.
 855
 856 *******************************************************************************/
 857
 858 void utf_strcat(char *buffer, utf *u)
 859 {
 860         utf_sprint(buffer + strlen(buffer), u);
 861 }
 862
 863
 864 /* utf_strcat_classname ********************************************************
 865
 866    Like libc strcat, but uses an utf8 string.
 867
 868 *******************************************************************************/
 869
 870 void utf_strcat_classname(char *buffer, utf *u)
 871 {
 872         utf_sprint_classname(buffer + strlen(buffer), u);
 873 }
 874
 875
 876 /* utf_fprint ******************************************************************
 877
 878    Write utf symbol into file.
 879
 880 *******************************************************************************/
 881
 882 void utf_fprint(FILE *file, utf *u)
 883 {
 884         char *endpos;                       /* points behind utf string           */
 885         char *utf_ptr;                      /* current position in utf text       */
 886
 887         if (!u)
 888                 return;
 889
 890         endpos = UTF_END(u);
 891         utf_ptr = u->text;
 892
 893         while (utf_ptr < endpos) {
 894                 /* read next unicode character */
 895                 u2 c = utf_nextu2(&utf_ptr);
 896
 897                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 898                 else fprintf(file, "?");
 899         }
 900 }
 901
 902
 903 /* utf_fprint_classname ********************************************************
 904
 905    Write utf symbol into file with `/' converted to `.'.
 906
 907 *******************************************************************************/
 908
 909 void utf_fprint_classname(FILE *file, utf *u)
 910 {
 911         char *endpos;                       /* points behind utf string           */
 912         char *utf_ptr;                      /* current position in utf text       */
 913
 914     if (!u)
 915                 return;
 916
 917         endpos = UTF_END(u);
 918         utf_ptr = u->text;
 919
 920         while (utf_ptr < endpos) {
 921                 /* read next unicode character */
 922                 u2 c = utf_nextu2(&utf_ptr);
 923                 if (c == '/') c = '.';
 924
 925                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 926                 else fprintf(file, "?");
 927         }
 928 }
 929
 930
 931 /* is_valid_utf ****************************************************************
 932
 933    Return true if the given string is a valid UTF-8 string.
 934
 935    utf_ptr...points to first character
 936    end_pos...points after last character
 937
 938 *******************************************************************************/
 939
 940 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
 941
 942 bool is_valid_utf(char *utf_ptr, char *end_pos)
 943 {
 944         int bytes;
 945         int len,i;
 946         char c;
 947         unsigned long v;
 948
 949         if (end_pos < utf_ptr) return false;
 950         bytes = end_pos - utf_ptr;
 951         while (bytes--) {
 952                 c = *utf_ptr++;
 953
 954                 if (!c) return false;                     /* 0x00 is not allowed */
 955                 if ((c & 0x80) == 0) continue;            /* ASCII */
 956
 957                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
 958                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
 959                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
 960                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
 961                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
 962                 else return false;                        /* invalid leading byte */
 963
 964                 if (len > 2) return false;                /* Java limitation */
 965
 966                 v = (unsigned long)c & (0x3f >> len);
 967
 968                 if ((bytes -= len) < 0) return false;     /* missing bytes */
 969
 970                 for (i = len; i--; ) {
 971                         c = *utf_ptr++;
 972                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
 973                                 return false;
 974                         v = (v << 6) | (c & 0x3f);
 975                 }
 976
 977                 if (v == 0) {
 978                         if (len != 1) return false;           /* Java special */
 979
 980                 } else {
 981                         /* Sun Java seems to allow overlong UTF-8 encodings */
 982
 983                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
 984                                 if (!opt_liberalutf)
 985                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
 986                                 /* XXX change this to exception? */
 987                         }
 988                 }
 989
 990                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
 991                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
 992
 993                 /* even these seem to be allowed */
 994                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
 995         }
 996
 997         return true;
 998 }
 999
1000
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters, i.e. bytes below 0x20, and the two-byte encoding of zero,
   0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf! Bytes at or
   behind end_pos are never examined, even if the string ends with a
   dangling 0xc0 lead byte.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
        if (end_pos <= utf_ptr)
                return false;                   /* disallow empty names */

        while (utf_ptr < end_pos) {
                unsigned char c = (unsigned char) *utf_ptr++;

                if (c < 0x20)
                        return false;           /* disallow control characters */

                /* disallow zero; the second byte is only looked at if it */
                /* still belongs to the string                            */
                if (c == 0xc0 && utf_ptr < end_pos &&
                        (unsigned char) *utf_ptr == 0x80)
                        return false;
        }

        return true;
}
1028
1029 bool is_valid_name_utf(utf *u)
1030 {
1031         return is_valid_name(u->text, UTF_END(u));
1032 }
1033
1034
1035 /* utf_show ********************************************************************
1036
1037    Writes the utf symbols in the utfhash to stdout and displays the
1038    number of external hash chains grouped according to the chainlength
1039    (for debugging purposes).
1040
1041 *******************************************************************************/
1042
1043 void utf_show(void)
1044 {
1045
1046 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1047
1048         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1049         u4 max_chainlength = 0;      /* maximum length of the chains */
1050         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1051         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1052         u4 i;
1053
1054         printf ("UTF-HASH:\n");
1055
1056         /* show element of utf-hashtable */
1057         for (i=0; i<utf_hash.size; i++) {
1058                 utf *u = utf_hash.ptr[i];
1059                 if (u) {
1060                         printf ("SLOT %d: ", (int) i);
1061                         while (u) {
1062                                 printf ("'");
1063                                 utf_display (u);
1064                                 printf ("' ");
1065                                 u = u->hashlink;
1066                         }
1067                         printf ("\n");
1068                 }
1069
1070         }
1071
1072         printf ("UTF-HASH: %d slots for %d entries\n",
1073                         (int) utf_hash.size, (int) utf_hash.entries );
1074
1075
1076         if (utf_hash.entries == 0)
1077                 return;
1078
1079         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1080
1081         for (i=0;i<CHAIN_LIMIT;i++)
1082                 chain_count[i]=0;
1083
1084         /* count numbers of hashchains according to their length */
1085         for (i=0; i<utf_hash.size; i++) {
1086
1087                 utf *u = (utf*) utf_hash.ptr[i];
1088                 u4 chain_length = 0;
1089
1090                 /* determine chainlength */
1091                 while (u) {
1092                         u = u->hashlink;
1093                         chain_length++;
1094                 }
1095
1096                 /* update sum of all chainlengths */
1097                 sum_chainlength+=chain_length;
1098
1099                 /* determine the maximum length of the chains */
1100                 if (chain_length>max_chainlength)
1101                         max_chainlength = chain_length;
1102
1103                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1104                 if (chain_length>=CHAIN_LIMIT) {
1105                         beyond_limit+=chain_length;
1106                         chain_length=CHAIN_LIMIT-1;
1107                 }
1108
1109                 /* update number of hashchains of current length */
1110                 chain_count[chain_length]++;
1111         }
1112
1113         /* display results */
1114         for (i=1;i<CHAIN_LIMIT-1;i++)
1115                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1116
1117         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1118
1119
1120         printf("max. chainlength:%5d\n",max_chainlength);
1121
1122         /* avg. chainlength = sum of chainlengths / number of chains */
1123         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1124 }
1125
1126
1127 /*
1128  * These are local overrides for various environment variables in Emacs.
1129  * Please do not remove this and leave it at the end of the file, where
1130  * Emacs will automagically detect them.
1131  * ---------------------------------------------------------------------
1132  * Local variables:
1133  * mode: c
1134  * indent-tabs-mode: t
1135  * c-basic-offset: 4
1136  * tab-width: 4
1137  * End:
1138  */