src/vm/utf8.c

   1 /* src/vm/utf8.c - utf functions
   2
   3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
   4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
   5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
   6    Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  23    02111-1307, USA.
  24
  25    Contact: cacao@complang.tuwien.ac.at
  26
  27    Authors: Reinhard Grafl
  28
  29    Changes: Mark Probst
  30             Andreas Krall
  31             Christian Thalinger
  32
  33    $Id: utf8.c 2277 2005-04-12 19:48:03Z twisti $
  34
  35 */
  36
  37
  38 #include <string.h>
  39
  40 #include "mm/memory.h"
  41 #include "vm/exceptions.h"
  42 #include "vm/options.h"
  43 #include "vm/statistics.h"
  44 #include "vm/tables.h"
  45 #include "vm/utf8.h"
  46
  47
  48 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
  49
  50
  51 /* utf-symbols for pointer comparison of frequently used strings **************/
  52
  53 utf *utf_java_lang_Object;              /* java/lang/Object                   */
  54
  55 utf *utf_java_lang_Class;
  56 utf *utf_java_lang_ClassLoader;
  57 utf *utf_java_lang_Cloneable;
  58 utf *utf_java_lang_SecurityManager;
  59 utf *utf_java_lang_String;
  60 utf *utf_java_lang_System;
  61 utf *utf_java_io_Serializable;
  62
  63 utf *utf_java_lang_Throwable;
  64 utf *utf_java_lang_VMThrowable;
  65 utf *utf_java_lang_Error;
  66 utf *utf_java_lang_Exception;
  67 utf *utf_java_lang_NoClassDefFoundError;
  68 utf *utf_java_lang_OutOfMemoryError;
  69 utf *utf_java_lang_ClassNotFoundException;
  70
  71 utf* utf_java_lang_Void;
  72 utf* utf_java_lang_Boolean;
  73 utf* utf_java_lang_Byte;
  74 utf* utf_java_lang_Character;
  75 utf* utf_java_lang_Short;
  76 utf* utf_java_lang_Integer;
  77 utf* utf_java_lang_Long;
  78 utf* utf_java_lang_Float;
  79 utf* utf_java_lang_Double;
  80
  81 utf *utf_java_util_Vector;
  82
  83 utf *utf_InnerClasses;                  /* InnerClasses                       */
  84 utf *utf_ConstantValue;                 /* ConstantValue                      */
  85 utf *utf_Code;                          /* Code                               */
  86 utf *utf_Exceptions;                    /* Exceptions                         */
  87 utf *utf_LineNumberTable;               /* LineNumberTable                    */
  88 utf *utf_SourceFile;                    /* SourceFile                         */
  89
  90 utf *utf_init;                          /* <init>                             */
  91 utf *utf_clinit;                        /* <clinit>                           */
  92 utf *utf_finalize;                      /* finalize                           */
  93
  94 utf *utf_printStackTrace;
  95 utf *utf_fillInStackTrace;
  96 utf *utf_loadClass;
  97
  98 utf *utf_void__void;                    /* ()V                                */
  99 utf *utf_boolean__void;                 /* (Z)V                               */
 100 utf *utf_byte__void;                    /* (B)V                               */
 101 utf *utf_char__void;                    /* (C)V                               */
 102 utf *utf_short__void;                   /* (S)V                               */
 103 utf *utf_int__void;                     /* (I)V                               */
 104 utf *utf_long__void;                    /* (J)V                               */
 105 utf *utf_float__void;                   /* (F)V                               */
 106 utf *utf_double__void;                  /* (D)V                               */
 107 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 108 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 109 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 110 utf *utf_java_lang_String__java_lang_Class;
 111 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 112
 113 utf *array_packagename;
 114
 115
 116 /* utf_init ********************************************************************
 117
 118    Initializes the utf8 subsystem.
 119
 120 *******************************************************************************/
 121
 122 void utf8_init(void)
 123 {
 124         /* create utf-symbols for pointer comparison of frequently used strings */
 125
 126         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 127
 128         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 129         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 130         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 131         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 132         utf_java_lang_String           = utf_new_char("java/lang/String");
 133         utf_java_lang_System           = utf_new_char("java/lang/System");
 134         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 135
 136         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
 137         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
 138         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
 139         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
 140
 141         utf_java_lang_NoClassDefFoundError =
 142                 utf_new_char(string_java_lang_NoClassDefFoundError);
 143
 144         utf_java_lang_OutOfMemoryError =
 145                 utf_new_char(string_java_lang_OutOfMemoryError);
 146
 147         utf_java_lang_ClassNotFoundException =
 148                 utf_new_char(string_java_lang_ClassNotFoundException);
 149
 150         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 151         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 152         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 153         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 154         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 155         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 156         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 157         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 158         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 159
 160         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 161
 162         utf_InnerClasses               = utf_new_char("InnerClasses");
 163         utf_ConstantValue              = utf_new_char("ConstantValue");
 164         utf_Code                       = utf_new_char("Code");
 165         utf_Exceptions                 = utf_new_char("Exceptions");
 166         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 167         utf_SourceFile                 = utf_new_char("SourceFile");
 168
 169         utf_init                           = utf_new_char("<init>");
 170         utf_clinit                         = utf_new_char("<clinit>");
 171         utf_finalize                   = utf_new_char("finalize");
 172
 173         utf_printStackTrace            = utf_new_char("printStackTrace");
 174         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 175         utf_loadClass                  = utf_new_char("loadClass");
 176
 177         utf_void__void                 = utf_new_char("()V");
 178         utf_boolean__void              = utf_new_char("(Z)V");
 179         utf_byte__void                 = utf_new_char("(B)V");
 180         utf_char__void                 = utf_new_char("(C)V");
 181         utf_short__void                = utf_new_char("(S)V");
 182         utf_int__void                  = utf_new_char("(I)V");
 183         utf_long__void                 = utf_new_char("(J)V");
 184         utf_float__void                = utf_new_char("(F)V");
 185         utf_double__void               = utf_new_char("(D)V");
 186         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 187         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 188         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 189
 190         utf_java_lang_String__java_lang_Class =
 191                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 192
 193         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 194
 195         array_packagename              = utf_new_char("\t<the array package>");
 196 }
 197
 198
 199 /* utf_hashkey *****************************************************************
 200
 201    The hashkey is computed from the utf-text by using up to 8
 202    characters.  For utf-symbols longer than 15 characters 3 characters
 203    are taken from the beginning and the end, 2 characters are taken
 204    from the middle.
 205
 206 *******************************************************************************/
 207
 208 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 209 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 210
 211 u4 utf_hashkey(const char *text, u4 length)
 212 {
 213         const char *start_pos = text;       /* pointer to utf text                */
 214         u4 a;
 215
 216         switch (length) {
 217         case 0: /* empty string */
 218                 return 0;
 219
 220         case 1: return fbs(0);
 221         case 2: return fbs(0) ^ nbs(3);
 222         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 223         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 224         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 225         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 226         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 227         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 228
 229         case 9:
 230                 a = fbs(0);
 231                 a ^= nbs(1);
 232                 a ^= nbs(2);
 233                 text++;
 234                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 235
 236         case 10:
 237                 a = fbs(0);
 238                 text++;
 239                 a ^= nbs(2);
 240                 a ^= nbs(3);
 241                 a ^= nbs(4);
 242                 text++;
 243                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 244
 245         case 11:
 246                 a = fbs(0);
 247                 text++;
 248                 a ^= nbs(2);
 249                 a ^= nbs(3);
 250                 a ^= nbs(4);
 251                 text++;
 252                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 253
 254         case 12:
 255                 a = fbs(0);
 256                 text += 2;
 257                 a ^= nbs(2);
 258                 a ^= nbs(3);
 259                 text++;
 260                 a ^= nbs(5);
 261                 a ^= nbs(6);
 262                 a ^= nbs(7);
 263                 text++;
 264                 return a ^ nbs(9) ^ nbs(10);
 265
 266         case 13:
 267                 a = fbs(0);
 268                 a ^= nbs(1);
 269                 text++;
 270                 a ^= nbs(3);
 271                 a ^= nbs(4);
 272                 text += 2;
 273                 a ^= nbs(7);
 274                 a ^= nbs(8);
 275                 text += 2;
 276                 return a ^ nbs(9) ^ nbs(10);
 277
 278         case 14:
 279                 a = fbs(0);
 280                 text += 2;
 281                 a ^= nbs(3);
 282                 a ^= nbs(4);
 283                 text += 2;
 284                 a ^= nbs(7);
 285                 a ^= nbs(8);
 286                 text += 2;
 287                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 288
 289         case 15:
 290                 a = fbs(0);
 291                 text += 2;
 292                 a ^= nbs(3);
 293                 a ^= nbs(4);
 294                 text += 2;
 295                 a ^= nbs(7);
 296                 a ^= nbs(8);
 297                 text += 2;
 298                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 299
 300         default:  /* 3 characters from beginning */
 301                 a = fbs(0);
 302                 text += 2;
 303                 a ^= nbs(3);
 304                 a ^= nbs(4);
 305
 306                 /* 2 characters from middle */
 307                 text = start_pos + (length / 2);
 308                 a ^= fbs(5);
 309                 text += 2;
 310                 a ^= nbs(6);
 311
 312                 /* 3 characters from end */
 313                 text = start_pos + length - 4;
 314
 315                 a ^= fbs(7);
 316                 text++;
 317
 318                 return a ^ nbs(10) ^ nbs(11);
 319     }
 320 }
 321
 322
 323 /* utf_hashkey *****************************************************************
 324
 325    Compute the hashkey of a unicode string.
 326
 327 *******************************************************************************/
 328
 329 u4 unicode_hashkey(u2 *text, u2 len)
 330 {
 331         return utf_hashkey((char *) text, len);
 332 }
 333
 334
 335 /* utf_new *********************************************************************
 336
 337    Creates a new utf-symbol, the text of the symbol is passed as a
 338    u1-array. The function searches the utf-hashtable for a utf-symbol
 339    with this text. On success the element returned, otherwise a new
 340    hashtable element is created.
 341
 342    If the number of entries in the hashtable exceeds twice the size of
 343    the hashtable slots a reorganization of the hashtable is done and
 344    the utf symbols are copied to a new hashtable with doubled size.
 345
 346 *******************************************************************************/
 347
 348 utf *utf_new_intern(const char *text, u2 length);
 349
 350 utf *utf_new(const char *text, u2 length)
 351 {
 352     utf *r;
 353
 354 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 355     tables_lock();
 356 #endif
 357
 358     r = utf_new_intern(text, length);
 359
 360 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 361     tables_unlock();
 362 #endif
 363
 364     return r;
 365 }
 366
 367
 368 utf *utf_new_intern(const char *text, u2 length)
 369 {
 370         u4 key;                             /* hashkey computed from utf-text     */
 371         u4 slot;                            /* slot in hashtable                  */
 372         utf *u;                             /* hashtable element                  */
 373         u2 i;
 374
 375 #ifdef STATISTICS
 376         if (opt_stat)
 377                 count_utf_new++;
 378 #endif
 379
 380         key  = utf_hashkey(text, length);
 381         slot = key & (utf_hash.size - 1);
 382         u    = utf_hash.ptr[slot];
 383
 384         /* search external hash chain for utf-symbol */
 385         while (u) {
 386                 if (u->blength == length) {
 387
 388                         /* compare text of hashtable elements */
 389                         for (i = 0; i < length; i++)
 390                                 if (text[i] != u->text[i]) goto nomatch;
 391
 392 #ifdef STATISTICS
 393                         if (opt_stat)
 394                                 count_utf_new_found++;
 395 #endif
 396
 397                         /* symbol found in hashtable */
 398                         return u;
 399                 }
 400         nomatch:
 401                 u = u->hashlink; /* next element in external chain */
 402         }
 403
 404 #ifdef STATISTICS
 405         if (opt_stat)
 406                 count_utf_len += sizeof(utf) + length;
 407 #endif
 408
 409         /* location in hashtable found, create new utf element */
 410         u = NEW(utf);
 411         u->blength  = length;               /* length in bytes of utfstring       */
 412         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
 413         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 414         memcpy(u->text, text, length);      /* copy utf-text                      */
 415         u->text[length] = '\0';
 416         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
 417
 418         utf_hash.entries++;                 /* update number of entries           */
 419
 420         if (utf_hash.entries > (utf_hash.size * 2)) {
 421
 422         /* reorganization of hashtable, average length of
 423            the external chains is approx. 2                */
 424
 425                 u4 i;
 426                 utf *u;
 427                 hashtable newhash; /* the new hashtable */
 428
 429                 /* create new hashtable, double the size */
 430                 init_hashtable(&newhash, utf_hash.size * 2);
 431                 newhash.entries = utf_hash.entries;
 432
 433 #ifdef STATISTICS
 434                 if (opt_stat)
 435                         count_utf_len += sizeof(utf*) * utf_hash.size;
 436 #endif
 437
 438                 /* transfer elements to new hashtable */
 439                 for (i = 0; i < utf_hash.size; i++) {
 440                         u = (utf *) utf_hash.ptr[i];
 441                         while (u) {
 442                                 utf *nextu = u->hashlink;
 443                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
 444
 445                                 u->hashlink = (utf *) newhash.ptr[slot];
 446                                 newhash.ptr[slot] = u;
 447
 448                                 /* follow link in external hash chain */
 449                                 u = nextu;
 450                         }
 451                 }
 452
 453                 /* dispose old table */
 454                 MFREE(utf_hash.ptr, void*, utf_hash.size);
 455                 utf_hash = newhash;
 456         }
 457
 458         return u;
 459 }
 460
 461
 462 /* utf_new_u2 ******************************************************************
 463
 464    Make utf symbol from u2 array, if isclassname is true '.' is
 465    replaced by '/'.
 466
 467 *******************************************************************************/
 468
 469 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 470 {
 471         char *buffer;                   /* memory buffer for  unicode characters  */
 472         char *pos;                      /* pointer to current position in buffer  */
 473         u4 left;                        /* unicode characters left                */
 474         u4 buflength;                   /* utf length in bytes of the u2 array    */
 475         utf *result;                    /* resulting utf-string                   */
 476         int i;
 477
 478         /* determine utf length in bytes and allocate memory */
 479
 480         buflength = u2_utflength(unicode_pos, unicode_length);
 481         buffer    = MNEW(char, buflength);
 482
 483         left = buflength;
 484         pos  = buffer;
 485
 486         for (i = 0; i++ < unicode_length; unicode_pos++) {
 487                 /* next unicode character */
 488                 u2 c = *unicode_pos;
 489
 490                 if ((c != 0) && (c < 0x80)) {
 491                         /* 1 character */
 492                         left--;
 493                 if ((int) left < 0) break;
 494                         /* convert classname */
 495                         if (isclassname && c == '.')
 496                                 *pos++ = '/';
 497                         else
 498                                 *pos++ = (char) c;
 499
 500                 } else if (c < 0x800) {
 501                         /* 2 characters */
 502                 unsigned char high = c >> 6;
 503                 unsigned char low  = c & 0x3F;
 504                         left = left - 2;
 505                 if ((int) left < 0) break;
 506                 *pos++ = high | 0xC0;
 507                 *pos++ = low  | 0x80;
 508
 509                 } else {
 510                 /* 3 characters */
 511                 char low  = c & 0x3f;
 512                 char mid  = (c >> 6) & 0x3F;
 513                 char high = c >> 12;
 514                         left = left - 3;
 515                 if ((int) left < 0) break;
 516                 *pos++ = high | 0xE0;
 517                 *pos++ = mid  | 0x80;
 518                 *pos++ = low  | 0x80;
 519                 }
 520         }
 521
 522         /* insert utf-string into symbol-table */
 523         result = utf_new(buffer,buflength);
 524
 525         MFREE(buffer, char, buflength);
 526
 527         return result;
 528 }
 529
 530
 531 /* utf_new_char ****************************************************************
 532
 533    Creates a new utf symbol, the text for this symbol is passed as a
 534    c-string ( = char* ).
 535
 536 *******************************************************************************/
 537
 538 utf *utf_new_char(const char *text)
 539 {
 540         return utf_new(text, strlen(text));
 541 }
 542
 543
 544 /* utf_new_char_classname ******************************************************
 545
 546    Creates a new utf symbol, the text for this symbol is passed as a
 547    c-string ( = char* ) "." characters are going to be replaced by
 548    "/". Since the above function is used often, this is a separte
 549    function, instead of an if.
 550
 551 *******************************************************************************/
 552
 553 utf *utf_new_char_classname(const char *text)
 554 {
 555         if (strchr(text, '.')) {
 556                 char *txt = strdup(text);
 557                 char *end = txt + strlen(txt);
 558                 char *c;
 559                 utf *tmpRes;
 560
 561                 for (c = txt; c < end; c++)
 562                         if (*c == '.') *c = '/';
 563
 564                 tmpRes = utf_new(txt, strlen(txt));
 565                 FREE(txt, 0);
 566
 567                 return tmpRes;
 568
 569         } else
 570                 return utf_new(text, strlen(text));
 571 }
 572
 573
 574 /* utf_nextu2 ******************************************************************
 575
 576    Read the next unicode character from the utf string and increment
 577    the utf-string pointer accordingly.
 578
 579 *******************************************************************************/
 580
 581 u2 utf_nextu2(char **utf_ptr)
 582 {
 583     /* uncompressed unicode character */
 584     u2 unicode_char = 0;
 585     /* current position in utf text */
 586     unsigned char *utf = (unsigned char *) (*utf_ptr);
 587     /* bytes representing the unicode character */
 588     unsigned char ch1, ch2, ch3;
 589     /* number of bytes used to represent the unicode character */
 590     int len = 0;
 591
 592     switch ((ch1 = utf[0]) >> 4) {
 593         default: /* 1 byte */
 594                 (*utf_ptr)++;
 595                 return (u2) ch1;
 596         case 0xC:
 597         case 0xD: /* 2 bytes */
 598                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 599                         unsigned char high = ch1 & 0x1F;
 600                         unsigned char low  = ch2 & 0x3F;
 601                         unicode_char = (high << 6) + low;
 602                         len = 2;
 603                 }
 604                 break;
 605
 606         case 0xE: /* 2 or 3 bytes */
 607                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 608                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 609                                 unsigned char low  = ch3 & 0x3f;
 610                                 unsigned char mid  = ch2 & 0x3f;
 611                                 unsigned char high = ch1 & 0x0f;
 612                                 unicode_char = (((high << 6) + mid) << 6) + low;
 613                                 len = 3;
 614                         } else
 615                                 len = 2;
 616                 }
 617                 break;
 618     }
 619
 620     /* update position in utf-text */
 621     *utf_ptr = (char *) (utf + len);
 622
 623     return unicode_char;
 624 }
 625
 626
 627 /* utf_strlen ******************************************************************
 628
 629    Determine number of unicode characters in the utf string.
 630
 631 *******************************************************************************/
 632
 633 u4 utf_strlen(utf *u)
 634 {
 635         char *endpos;                       /* points behind utf string           */
 636         char *utf_ptr;                      /* current position in utf text       */
 637         u4 len = 0;                         /* number of unicode characters       */
 638
 639         if (!u) {
 640                 *exceptionptr = new_nullpointerexception();
 641                 return 0;
 642         }
 643
 644         endpos = utf_end(u);
 645         utf_ptr = u->text;
 646
 647         while (utf_ptr < endpos) {
 648                 len++;
 649                 /* next unicode character */
 650                 utf_nextu2(&utf_ptr);
 651         }
 652
 653         if (utf_ptr != endpos)
 654                 /* string ended abruptly */
 655                 throw_cacao_exception_exit(string_java_lang_InternalError,
 656                                                                    "Illegal utf8 string");
 657
 658         return len;
 659 }
 660
 661
 662 /* u2_utflength ****************************************************************
 663
 664    Returns the utf length in bytes of a u2 array.
 665
 666 *******************************************************************************/
 667
 668 u4 u2_utflength(u2 *text, u4 u2_length)
 669 {
 670         u4 result_len = 0;                  /* utf length in bytes                */
 671         u2 ch;                              /* current unicode character          */
 672         u4 len;
 673
 674         for (len = 0; len < u2_length; len++) {
 675                 /* next unicode character */
 676                 ch = *text++;
 677
 678                 /* determine bytes required to store unicode character as utf */
 679                 if (ch && (ch < 0x80))
 680                         result_len++;
 681                 else if (ch < 0x800)
 682                         result_len += 2;
 683                 else
 684                         result_len += 3;
 685         }
 686
 687     return result_len;
 688 }
 689
 690
 691 /* utf_display *****************************************************************
 692
 693    Write utf symbol to stdout (for debugging purposes).
 694
 695 *******************************************************************************/
 696
 697 void utf_display(utf *u)
 698 {
 699         char *endpos;                       /* points behind utf string           */
 700         char *utf_ptr;                      /* current position in utf text       */
 701
 702         if (!u) {
 703                 printf("NULL");
 704                 fflush(stdout);
 705                 return;
 706         }
 707
 708         endpos = utf_end(u);
 709         utf_ptr = u->text;
 710
 711         while (utf_ptr < endpos) {
 712                 /* read next unicode character */
 713                 u2 c = utf_nextu2(&utf_ptr);
 714                 if (c >= 32 && c <= 127) printf("%c", c);
 715                 else printf("?");
 716         }
 717
 718         fflush(stdout);
 719 }
 720
 721
 722 /* utf_display_classname *******************************************************
 723
 724    Write utf symbol to stdout with `/' converted to `.' (for debugging
 725    purposes).
 726
 727 *******************************************************************************/
 728
 729 void utf_display_classname(utf *u)
 730 {
 731         char *endpos;                       /* points behind utf string           */
 732         char *utf_ptr;                      /* current position in utf text       */
 733
 734         if (!u) {
 735                 printf("NULL");
 736                 fflush(stdout);
 737                 return;
 738         }
 739
 740         endpos = utf_end(u);
 741         utf_ptr = u->text;
 742
 743         while (utf_ptr < endpos) {
 744                 /* read next unicode character */
 745                 u2 c = utf_nextu2(&utf_ptr);
 746                 if (c == '/') c = '.';
 747                 if (c >= 32 && c <= 127) printf("%c", c);
 748                 else printf("?");
 749         }
 750
 751         fflush(stdout);
 752 }
 753
 754
 755 /* utf_sprint ******************************************************************
 756
 757    Write utf symbol into c-string (for debugging purposes).
 758
 759 *******************************************************************************/
 760
 761 void utf_sprint(char *buffer, utf *u)
 762 {
 763         char *endpos;                       /* points behind utf string           */
 764         char *utf_ptr;                      /* current position in utf text       */
 765         u2 pos = 0;                         /* position in c-string               */
 766
 767         if (!u) {
 768                 strcpy(buffer, "NULL");
 769                 return;
 770         }
 771
 772         endpos = utf_end(u);
 773         utf_ptr = u->text;
 774
 775         while (utf_ptr < endpos)
 776                 /* copy next unicode character */
 777                 buffer[pos++] = utf_nextu2(&utf_ptr);
 778
 779         /* terminate string */
 780         buffer[pos] = '\0';
 781 }
 782
 783
 784 /* utf_sprint_classname ********************************************************
 785
 786    Write utf symbol into c-string with `/' converted to `.' (for debugging
 787    purposes).
 788
 789 *******************************************************************************/
 790
 791 void utf_sprint_classname(char *buffer, utf *u)
 792 {
 793         char *endpos;                       /* points behind utf string           */
 794         char *utf_ptr;                      /* current position in utf text       */
 795         u2 pos = 0;                         /* position in c-string               */
 796
 797         if (!u) {
 798                 strcpy(buffer, "NULL");
 799                 return;
 800         }
 801
 802         endpos = utf_end(u);
 803         utf_ptr = u->text;
 804
 805         while (utf_ptr < endpos) {
 806                 /* copy next unicode character */
 807                 u2 c = utf_nextu2(&utf_ptr);
 808                 if (c == '/') c = '.';
 809                 buffer[pos++] = c;
 810         }
 811
 812         /* terminate string */
 813         buffer[pos] = '\0';
 814 }
 815
 816
 817 /* utf_strcat ******************************************************************
 818
 819    Like libc strcat, but uses an utf8 string.
 820
 821 *******************************************************************************/
 822
 823 void utf_strcat(char *buffer, utf *u)
 824 {
 825         utf_sprint(buffer + strlen(buffer), u);
 826 }
 827
 828
 829 /* utf_strcat_classname ********************************************************
 830
 831    Like libc strcat, but uses an utf8 string.
 832
 833 *******************************************************************************/
 834
 835 void utf_strcat_classname(char *buffer, utf *u)
 836 {
 837         utf_sprint_classname(buffer + strlen(buffer), u);
 838 }
 839
 840
 841 /* utf_fprint ******************************************************************
 842
 843    Write utf symbol into file.
 844
 845 *******************************************************************************/
 846
 847 void utf_fprint(FILE *file, utf *u)
 848 {
 849         char *endpos;                       /* points behind utf string           */
 850         char *utf_ptr;                      /* current position in utf text       */
 851
 852         if (!u)
 853                 return;
 854
 855         endpos = utf_end(u);
 856         utf_ptr = u->text;
 857
 858         while (utf_ptr < endpos) {
 859                 /* read next unicode character */
 860                 u2 c = utf_nextu2(&utf_ptr);
 861
 862                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 863                 else fprintf(file, "?");
 864         }
 865 }
 866
 867
 868 /* utf_fprint_classname ********************************************************
 869
 870    Write utf symbol into file with `/' converted to `.'.
 871
 872 *******************************************************************************/
 873
 874 void utf_fprint_classname(FILE *file, utf *u)
 875 {
 876         char *endpos;                       /* points behind utf string           */
 877         char *utf_ptr;                      /* current position in utf text       */
 878
 879     if (!u)
 880                 return;
 881
 882         endpos = utf_end(u);
 883         utf_ptr = u->text;
 884
 885         while (utf_ptr < endpos) {
 886                 /* read next unicode character */
 887                 u2 c = utf_nextu2(&utf_ptr);
 888                 if (c == '/') c = '.';
 889
 890                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 891                 else fprintf(file, "?");
 892         }
 893 }
 894
 895
 896 /* is_valid_utf ****************************************************************
 897
 898    Return true if the given string is a valid UTF-8 string.
 899
 900    utf_ptr...points to first character
 901    end_pos...points after last character
 902
 903 *******************************************************************************/
 904
 905 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
 906
 907 bool is_valid_utf(char *utf_ptr, char *end_pos)
 908 {
 909         int bytes;
 910         int len,i;
 911         char c;
 912         unsigned long v;
 913
 914         if (end_pos < utf_ptr) return false;
 915         bytes = end_pos - utf_ptr;
 916         while (bytes--) {
 917                 c = *utf_ptr++;
 918
 919                 if (!c) return false;                     /* 0x00 is not allowed */
 920                 if ((c & 0x80) == 0) continue;            /* ASCII */
 921
 922                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
 923                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
 924                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
 925                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
 926                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
 927                 else return false;                        /* invalid leading byte */
 928
 929                 if (len > 2) return false;                /* Java limitation */
 930
 931                 v = (unsigned long)c & (0x3f >> len);
 932
 933                 if ((bytes -= len) < 0) return false;     /* missing bytes */
 934
 935                 for (i = len; i--; ) {
 936                         c = *utf_ptr++;
 937                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
 938                                 return false;
 939                         v = (v << 6) | (c & 0x3f);
 940                 }
 941
 942                 if (v == 0) {
 943                         if (len != 1) return false;           /* Java special */
 944
 945                 } else {
 946                         /* Sun Java seems to allow overlong UTF-8 encodings */
 947
 948                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
 949                                 if (!opt_liberalutf)
 950                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
 951                                 /* XXX change this to panic? */
 952                         }
 953                 }
 954
 955                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
 956                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
 957
 958                 /* even these seem to be allowed */
 959                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
 960         }
 961
 962         return true;
 963 }
 964
 965
 966 /* is_valid_name ***************************************************************
 967
 968    Return true if the given string may be used as a class/field/method
 969    name. (Currently this only disallows empty strings and control
 970    characters.)
 971
 972    NOTE: The string is assumed to have passed is_valid_utf!
 973
 974    utf_ptr...points to first character
 975    end_pos...points after last character
 976
 977 *******************************************************************************/
 978
 979 bool is_valid_name(char *utf_ptr, char *end_pos)
 980 {
 981         if (end_pos <= utf_ptr) return false; /* disallow empty names */
 982
 983         while (utf_ptr < end_pos) {
 984                 unsigned char c = *utf_ptr++;
 985
 986                 if (c < 0x20) return false; /* disallow control characters */
 987                 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80)  /* disallow zero */
 988                         return false;
 989         }
 990
 991         return true;
 992 }
 993
 994 bool is_valid_name_utf(utf *u)
 995 {
 996         return is_valid_name(u->text,utf_end(u));
 997 }
 998
 999
1000 /* utf_show ********************************************************************
1001
1002    Writes the utf symbols in the utfhash to stdout and displays the
1003    number of external hash chains grouped according to the chainlength
1004    (for debugging purposes).
1005
1006 *******************************************************************************/
1007
1008 void utf_show(void)
1009 {
1010
1011 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1012
1013         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1014         u4 max_chainlength = 0;      /* maximum length of the chains */
1015         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1016         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1017         u4 i;
1018
1019         printf ("UTF-HASH:\n");
1020
1021         /* show element of utf-hashtable */
1022         for (i=0; i<utf_hash.size; i++) {
1023                 utf *u = utf_hash.ptr[i];
1024                 if (u) {
1025                         printf ("SLOT %d: ", (int) i);
1026                         while (u) {
1027                                 printf ("'");
1028                                 utf_display (u);
1029                                 printf ("' ");
1030                                 u = u->hashlink;
1031                         }
1032                         printf ("\n");
1033                 }
1034
1035         }
1036
1037         printf ("UTF-HASH: %d slots for %d entries\n",
1038                         (int) utf_hash.size, (int) utf_hash.entries );
1039
1040
1041         if (utf_hash.entries == 0)
1042                 return;
1043
1044         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1045
1046         for (i=0;i<CHAIN_LIMIT;i++)
1047                 chain_count[i]=0;
1048
1049         /* count numbers of hashchains according to their length */
1050         for (i=0; i<utf_hash.size; i++) {
1051
1052                 utf *u = (utf*) utf_hash.ptr[i];
1053                 u4 chain_length = 0;
1054
1055                 /* determine chainlength */
1056                 while (u) {
1057                         u = u->hashlink;
1058                         chain_length++;
1059                 }
1060
1061                 /* update sum of all chainlengths */
1062                 sum_chainlength+=chain_length;
1063
1064                 /* determine the maximum length of the chains */
1065                 if (chain_length>max_chainlength)
1066                         max_chainlength = chain_length;
1067
1068                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1069                 if (chain_length>=CHAIN_LIMIT) {
1070                         beyond_limit+=chain_length;
1071                         chain_length=CHAIN_LIMIT-1;
1072                 }
1073
1074                 /* update number of hashchains of current length */
1075                 chain_count[chain_length]++;
1076         }
1077
1078         /* display results */
1079         for (i=1;i<CHAIN_LIMIT-1;i++)
1080                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1081
1082         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1083
1084
1085         printf("max. chainlength:%5d\n",max_chainlength);
1086
1087         /* avg. chainlength = sum of chainlengths / number of chains */
1088         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1089 }
1090
1091
1092 /*
1093  * These are local overrides for various environment variables in Emacs.
1094  * Please do not remove this and leave it at the end of the file, where
1095  * Emacs will automagically detect them.
1096  * ---------------------------------------------------------------------
1097  * Local variables:
1098  * mode: c
1099  * indent-tabs-mode: t
1100  * c-basic-offset: 4
1101  * tab-width: 4
1102  * End:
1103  */