src/vm/utf8.c

    1 /* src/vm/utf8.c - utf functions
   2
   3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
   4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
   5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
   6    Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  23    02111-1307, USA.
  24
  25    Contact: cacao@complang.tuwien.ac.at
  26
  27    Authors: Reinhard Grafl
  28
  29    Changes: Mark Probst
  30             Andreas Krall
  31             Christian Thalinger
  32
  33    $Id: utf8.c 2427 2005-05-01 12:27:54Z jowenn $
  34
  35 */
  36
  37
  38 #include <string.h>
  39
  40 #include "mm/memory.h"
  41 #include "vm/exceptions.h"
  42 #include "vm/options.h"
  43 #include "vm/statistics.h"
  44 #include "vm/tables.h"
  45 #include "vm/utf8.h"
  46
  47
  48 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
  49
  50
  51 /* utf-symbols for pointer comparison of frequently used strings **************/
  52
  53 utf *utf_java_lang_Object;              /* java/lang/Object                   */
  54
  55 utf *utf_java_lang_Class;
  56 utf *utf_java_lang_ClassLoader;
  57 utf *utf_java_lang_Cloneable;
  58 utf *utf_java_lang_SecurityManager;
  59 utf *utf_java_lang_String;
  60 utf *utf_java_lang_System;
  61 utf *utf_java_lang_ThreadGroup;
  62 utf *utf_java_io_Serializable;
  63
  64 utf *utf_java_lang_Throwable;
  65 utf *utf_java_lang_VMThrowable;
  66 utf *utf_java_lang_Error;
  67 utf *utf_java_lang_Exception;
  68 utf *utf_java_lang_NoClassDefFoundError;
  69 utf *utf_java_lang_OutOfMemoryError;
  70 utf *utf_java_lang_ClassNotFoundException;
  71
  72 utf* utf_java_lang_Void;
  73 utf* utf_java_lang_Boolean;
  74 utf* utf_java_lang_Byte;
  75 utf* utf_java_lang_Character;
  76 utf* utf_java_lang_Short;
  77 utf* utf_java_lang_Integer;
  78 utf* utf_java_lang_Long;
  79 utf* utf_java_lang_Float;
  80 utf* utf_java_lang_Double;
  81
  82 utf *utf_java_util_Vector;
  83 utf *utf_java_lang_reflect_Constructor;
  84 utf *utf_java_lang_reflect_Method;
  85
  86
  87 utf *utf_InnerClasses;                  /* InnerClasses                       */
  88 utf *utf_ConstantValue;                 /* ConstantValue                      */
  89 utf *utf_Code;                          /* Code                               */
  90 utf *utf_Exceptions;                    /* Exceptions                         */
  91 utf *utf_LineNumberTable;               /* LineNumberTable                    */
  92 utf *utf_SourceFile;                    /* SourceFile                         */
  93
  94 utf *utf_init;                          /* <init>                             */
  95 utf *utf_clinit;                        /* <clinit>                           */
  96 utf *utf_finalize;                      /* finalize                           */
  97
  98 utf *utf_printStackTrace;
  99 utf *utf_fillInStackTrace;
 100 utf *utf_loadClass;
 101
 102 utf *utf_void__void;                    /* ()V                                */
 103 utf *utf_boolean__void;                 /* (Z)V                               */
 104 utf *utf_byte__void;                    /* (B)V                               */
 105 utf *utf_char__void;                    /* (C)V                               */
 106 utf *utf_short__void;                   /* (S)V                               */
 107 utf *utf_int__void;                     /* (I)V                               */
 108 utf *utf_long__void;                    /* (J)V                               */
 109 utf *utf_float__void;                   /* (F)V                               */
 110 utf *utf_double__void;                  /* (D)V                               */
 111 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 112 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 113 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 114 utf *utf_java_lang_String__java_lang_Class;
 115 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 116
 117 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
 118
 119 utf *array_packagename;
 120
 121
 122 /* utf_init ********************************************************************
 123
 124    Initializes the utf8 subsystem.
 125
 126 *******************************************************************************/
 127
 128 void utf8_init(void)
 129 {
 130         /* create utf-symbols for pointer comparison of frequently used strings */
 131
 132         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 133
 134         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 135         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 136         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 137         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 138         utf_java_lang_String           = utf_new_char("java/lang/String");
 139         utf_java_lang_System           = utf_new_char("java/lang/System");
 140         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 141         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 142
 143         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
 144         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
 145         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
 146         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
 147
 148         utf_java_lang_NoClassDefFoundError =
 149                 utf_new_char(string_java_lang_NoClassDefFoundError);
 150
 151         utf_java_lang_OutOfMemoryError =
 152                 utf_new_char(string_java_lang_OutOfMemoryError);
 153
 154         utf_java_lang_ClassNotFoundException =
 155                 utf_new_char(string_java_lang_ClassNotFoundException);
 156
 157         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 158         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 159         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 160         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 161         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 162         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 163         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 164         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 165         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 166
 167         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 168         utf_java_lang_reflect_Constructor = utf_new_char("java/lang/reflect/Constructor");
 169         utf_java_lang_reflect_Method      = utf_new_char("java/lang/reflect/Method");
 170
 171         utf_InnerClasses               = utf_new_char("InnerClasses");
 172         utf_ConstantValue              = utf_new_char("ConstantValue");
 173         utf_Code                       = utf_new_char("Code");
 174         utf_Exceptions                 = utf_new_char("Exceptions");
 175         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 176         utf_SourceFile                 = utf_new_char("SourceFile");
 177
 178         utf_init                           = utf_new_char("<init>");
 179         utf_clinit                         = utf_new_char("<clinit>");
 180         utf_finalize                   = utf_new_char("finalize");
 181
 182         utf_printStackTrace            = utf_new_char("printStackTrace");
 183         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 184         utf_loadClass                  = utf_new_char("loadClass");
 185
 186         utf_void__void                 = utf_new_char("()V");
 187         utf_boolean__void              = utf_new_char("(Z)V");
 188         utf_byte__void                 = utf_new_char("(B)V");
 189         utf_char__void                 = utf_new_char("(C)V");
 190         utf_short__void                = utf_new_char("(S)V");
 191         utf_int__void                  = utf_new_char("(I)V");
 192         utf_long__void                 = utf_new_char("(J)V");
 193         utf_float__void                = utf_new_char("(F)V");
 194         utf_double__void               = utf_new_char("(D)V");
 195         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 196         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 197         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 198
 199         utf_java_lang_String__java_lang_Class =
 200                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 201
 202         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 203
 204         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 205
 206         array_packagename              = utf_new_char("\t<the array package>");
 207 }
 208
 209
 210 /* utf_hashkey *****************************************************************
 211
 212    The hashkey is computed from the utf-text by using up to 8
 213    characters.  For utf-symbols longer than 15 characters 3 characters
 214    are taken from the beginning and the end, 2 characters are taken
 215    from the middle.
 216
 217 *******************************************************************************/
 218
 219 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 220 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 221
 222 u4 utf_hashkey(const char *text, u4 length)
 223 {
 224         const char *start_pos = text;       /* pointer to utf text                */
 225         u4 a;
 226
 227         switch (length) {
 228         case 0: /* empty string */
 229                 return 0;
 230
 231         case 1: return fbs(0);
 232         case 2: return fbs(0) ^ nbs(3);
 233         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 234         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 235         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 236         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 237         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 238         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 239
 240         case 9:
 241                 a = fbs(0);
 242                 a ^= nbs(1);
 243                 a ^= nbs(2);
 244                 text++;
 245                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 246
 247         case 10:
 248                 a = fbs(0);
 249                 text++;
 250                 a ^= nbs(2);
 251                 a ^= nbs(3);
 252                 a ^= nbs(4);
 253                 text++;
 254                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 255
 256         case 11:
 257                 a = fbs(0);
 258                 text++;
 259                 a ^= nbs(2);
 260                 a ^= nbs(3);
 261                 a ^= nbs(4);
 262                 text++;
 263                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 264
 265         case 12:
 266                 a = fbs(0);
 267                 text += 2;
 268                 a ^= nbs(2);
 269                 a ^= nbs(3);
 270                 text++;
 271                 a ^= nbs(5);
 272                 a ^= nbs(6);
 273                 a ^= nbs(7);
 274                 text++;
 275                 return a ^ nbs(9) ^ nbs(10);
 276
 277         case 13:
 278                 a = fbs(0);
 279                 a ^= nbs(1);
 280                 text++;
 281                 a ^= nbs(3);
 282                 a ^= nbs(4);
 283                 text += 2;
 284                 a ^= nbs(7);
 285                 a ^= nbs(8);
 286                 text += 2;
 287                 return a ^ nbs(9) ^ nbs(10);
 288
 289         case 14:
 290                 a = fbs(0);
 291                 text += 2;
 292                 a ^= nbs(3);
 293                 a ^= nbs(4);
 294                 text += 2;
 295                 a ^= nbs(7);
 296                 a ^= nbs(8);
 297                 text += 2;
 298                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 299
 300         case 15:
 301                 a = fbs(0);
 302                 text += 2;
 303                 a ^= nbs(3);
 304                 a ^= nbs(4);
 305                 text += 2;
 306                 a ^= nbs(7);
 307                 a ^= nbs(8);
 308                 text += 2;
 309                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 310
 311         default:  /* 3 characters from beginning */
 312                 a = fbs(0);
 313                 text += 2;
 314                 a ^= nbs(3);
 315                 a ^= nbs(4);
 316
 317                 /* 2 characters from middle */
 318                 text = start_pos + (length / 2);
 319                 a ^= fbs(5);
 320                 text += 2;
 321                 a ^= nbs(6);
 322
 323                 /* 3 characters from end */
 324                 text = start_pos + length - 4;
 325
 326                 a ^= fbs(7);
 327                 text++;
 328
 329                 return a ^ nbs(10) ^ nbs(11);
 330     }
 331 }
 332
 333
 334 /* utf_hashkey *****************************************************************
 335
 336    Compute the hashkey of a unicode string.
 337
 338 *******************************************************************************/
 339
 340 u4 unicode_hashkey(u2 *text, u2 len)
 341 {
 342         return utf_hashkey((char *) text, len);
 343 }
 344
 345
 346 /* utf_new *********************************************************************
 347
 348    Creates a new utf-symbol, the text of the symbol is passed as a
 349    u1-array. The function searches the utf-hashtable for a utf-symbol
 350    with this text. On success the element returned, otherwise a new
 351    hashtable element is created.
 352
 353    If the number of entries in the hashtable exceeds twice the size of
 354    the hashtable slots a reorganization of the hashtable is done and
 355    the utf symbols are copied to a new hashtable with doubled size.
 356
 357 *******************************************************************************/
 358
 359 utf *utf_new_intern(const char *text, u2 length);
 360
 361 utf *utf_new(const char *text, u2 length)
 362 {
 363     utf *r;
 364
 365 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 366     tables_lock();
 367 #endif
 368
 369     r = utf_new_intern(text, length);
 370
 371 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 372     tables_unlock();
 373 #endif
 374
 375     return r;
 376 }
 377
 378
 379 utf *utf_new_intern(const char *text, u2 length)
 380 {
 381         u4 key;                             /* hashkey computed from utf-text     */
 382         u4 slot;                            /* slot in hashtable                  */
 383         utf *u;                             /* hashtable element                  */
 384         u2 i;
 385
 386 #ifdef STATISTICS
 387         if (opt_stat)
 388                 count_utf_new++;
 389 #endif
 390
 391         key  = utf_hashkey(text, length);
 392         slot = key & (utf_hash.size - 1);
 393         u    = utf_hash.ptr[slot];
 394
 395         /* search external hash chain for utf-symbol */
 396         while (u) {
 397                 if (u->blength == length) {
 398
 399                         /* compare text of hashtable elements */
 400                         for (i = 0; i < length; i++)
 401                                 if (text[i] != u->text[i]) goto nomatch;
 402
 403 #ifdef STATISTICS
 404                         if (opt_stat)
 405                                 count_utf_new_found++;
 406 #endif
 407
 408                         /* symbol found in hashtable */
 409                         return u;
 410                 }
 411         nomatch:
 412                 u = u->hashlink; /* next element in external chain */
 413         }
 414
 415 #ifdef STATISTICS
 416         if (opt_stat)
 417                 count_utf_len += sizeof(utf) + length;
 418 #endif
 419
 420         /* location in hashtable found, create new utf element */
 421         u = NEW(utf);
 422         u->blength  = length;               /* length in bytes of utfstring       */
 423         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
 424         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 425         memcpy(u->text, text, length);      /* copy utf-text                      */
 426         u->text[length] = '\0';
 427         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
 428
 429         utf_hash.entries++;                 /* update number of entries           */
 430
 431         if (utf_hash.entries > (utf_hash.size * 2)) {
 432
 433         /* reorganization of hashtable, average length of
 434            the external chains is approx. 2                */
 435
 436                 u4 i;
 437                 utf *u;
 438                 hashtable newhash; /* the new hashtable */
 439
 440                 /* create new hashtable, double the size */
 441                 init_hashtable(&newhash, utf_hash.size * 2);
 442                 newhash.entries = utf_hash.entries;
 443
 444 #ifdef STATISTICS
 445                 if (opt_stat)
 446                         count_utf_len += sizeof(utf*) * utf_hash.size;
 447 #endif
 448
 449                 /* transfer elements to new hashtable */
 450                 for (i = 0; i < utf_hash.size; i++) {
 451                         u = (utf *) utf_hash.ptr[i];
 452                         while (u) {
 453                                 utf *nextu = u->hashlink;
 454                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
 455
 456                                 u->hashlink = (utf *) newhash.ptr[slot];
 457                                 newhash.ptr[slot] = u;
 458
 459                                 /* follow link in external hash chain */
 460                                 u = nextu;
 461                         }
 462                 }
 463
 464                 /* dispose old table */
 465                 MFREE(utf_hash.ptr, void*, utf_hash.size);
 466                 utf_hash = newhash;
 467         }
 468
 469         return u;
 470 }
 471
 472
 473 /* utf_new_u2 ******************************************************************
 474
 475    Make utf symbol from u2 array, if isclassname is true '.' is
 476    replaced by '/'.
 477
 478 *******************************************************************************/
 479
 480 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 481 {
 482         char *buffer;                   /* memory buffer for  unicode characters  */
 483         char *pos;                      /* pointer to current position in buffer  */
 484         u4 left;                        /* unicode characters left                */
 485         u4 buflength;                   /* utf length in bytes of the u2 array    */
 486         utf *result;                    /* resulting utf-string                   */
 487         int i;
 488
 489         /* determine utf length in bytes and allocate memory */
 490
 491         buflength = u2_utflength(unicode_pos, unicode_length);
 492         buffer    = MNEW(char, buflength);
 493
 494         left = buflength;
 495         pos  = buffer;
 496
 497         for (i = 0; i++ < unicode_length; unicode_pos++) {
 498                 /* next unicode character */
 499                 u2 c = *unicode_pos;
 500
 501                 if ((c != 0) && (c < 0x80)) {
 502                         /* 1 character */
 503                         left--;
 504                 if ((int) left < 0) break;
 505                         /* convert classname */
 506                         if (isclassname && c == '.')
 507                                 *pos++ = '/';
 508                         else
 509                                 *pos++ = (char) c;
 510
 511                 } else if (c < 0x800) {
 512                         /* 2 characters */
 513                 unsigned char high = c >> 6;
 514                 unsigned char low  = c & 0x3F;
 515                         left = left - 2;
 516                 if ((int) left < 0) break;
 517                 *pos++ = high | 0xC0;
 518                 *pos++ = low  | 0x80;
 519
 520                 } else {
 521                 /* 3 characters */
 522                 char low  = c & 0x3f;
 523                 char mid  = (c >> 6) & 0x3F;
 524                 char high = c >> 12;
 525                         left = left - 3;
 526                 if ((int) left < 0) break;
 527                 *pos++ = high | 0xE0;
 528                 *pos++ = mid  | 0x80;
 529                 *pos++ = low  | 0x80;
 530                 }
 531         }
 532
 533         /* insert utf-string into symbol-table */
 534         result = utf_new(buffer,buflength);
 535
 536         MFREE(buffer, char, buflength);
 537
 538         return result;
 539 }
 540
 541
 542 /* utf_new_char ****************************************************************
 543
 544    Creates a new utf symbol, the text for this symbol is passed as a
 545    c-string ( = char* ).
 546
 547 *******************************************************************************/
 548
 549 utf *utf_new_char(const char *text)
 550 {
 551         return utf_new(text, strlen(text));
 552 }
 553
 554
 555 /* utf_new_char_classname ******************************************************
 556
 557    Creates a new utf symbol, the text for this symbol is passed as a
 558    c-string ( = char* ) "." characters are going to be replaced by
 559    "/". Since the above function is used often, this is a separte
 560    function, instead of an if.
 561
 562 *******************************************************************************/
 563
 564 utf *utf_new_char_classname(const char *text)
 565 {
 566         if (strchr(text, '.')) {
 567                 char *txt = strdup(text);
 568                 char *end = txt + strlen(txt);
 569                 char *c;
 570                 utf *tmpRes;
 571
 572                 for (c = txt; c < end; c++)
 573                         if (*c == '.') *c = '/';
 574
 575                 tmpRes = utf_new(txt, strlen(txt));
 576                 FREE(txt, 0);
 577
 578                 return tmpRes;
 579
 580         } else
 581                 return utf_new(text, strlen(text));
 582 }
 583
 584
 585 /* utf_nextu2 ******************************************************************
 586
 587    Read the next unicode character from the utf string and increment
 588    the utf-string pointer accordingly.
 589
 590 *******************************************************************************/
 591
 592 u2 utf_nextu2(char **utf_ptr)
 593 {
 594     /* uncompressed unicode character */
 595     u2 unicode_char = 0;
 596     /* current position in utf text */
 597     unsigned char *utf = (unsigned char *) (*utf_ptr);
 598     /* bytes representing the unicode character */
 599     unsigned char ch1, ch2, ch3;
 600     /* number of bytes used to represent the unicode character */
 601     int len = 0;
 602
 603     switch ((ch1 = utf[0]) >> 4) {
 604         default: /* 1 byte */
 605                 (*utf_ptr)++;
 606                 return (u2) ch1;
 607         case 0xC:
 608         case 0xD: /* 2 bytes */
 609                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 610                         unsigned char high = ch1 & 0x1F;
 611                         unsigned char low  = ch2 & 0x3F;
 612                         unicode_char = (high << 6) + low;
 613                         len = 2;
 614                 }
 615                 break;
 616
 617         case 0xE: /* 2 or 3 bytes */
 618                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 619                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 620                                 unsigned char low  = ch3 & 0x3f;
 621                                 unsigned char mid  = ch2 & 0x3f;
 622                                 unsigned char high = ch1 & 0x0f;
 623                                 unicode_char = (((high << 6) + mid) << 6) + low;
 624                                 len = 3;
 625                         } else
 626                                 len = 2;
 627                 }
 628                 break;
 629     }
 630
 631     /* update position in utf-text */
 632     *utf_ptr = (char *) (utf + len);
 633
 634     return unicode_char;
 635 }
 636
 637
 638 /* utf_strlen ******************************************************************
 639
 640    Determine number of unicode characters in the utf string.
 641
 642 *******************************************************************************/
 643
 644 u4 utf_strlen(utf *u)
 645 {
 646         char *endpos;                       /* points behind utf string           */
 647         char *utf_ptr;                      /* current position in utf text       */
 648         u4 len = 0;                         /* number of unicode characters       */
 649
 650         if (!u) {
 651                 *exceptionptr = new_nullpointerexception();
 652                 return 0;
 653         }
 654
 655         endpos = utf_end(u);
 656         utf_ptr = u->text;
 657
 658         while (utf_ptr < endpos) {
 659                 len++;
 660                 /* next unicode character */
 661                 utf_nextu2(&utf_ptr);
 662         }
 663
 664         if (utf_ptr != endpos)
 665                 /* string ended abruptly */
 666                 throw_cacao_exception_exit(string_java_lang_InternalError,
 667                                                                    "Illegal utf8 string");
 668
 669         return len;
 670 }
 671
 672
 673 /* u2_utflength ****************************************************************
 674
 675    Returns the utf length in bytes of a u2 array.
 676
 677 *******************************************************************************/
 678
 679 u4 u2_utflength(u2 *text, u4 u2_length)
 680 {
 681         u4 result_len = 0;                  /* utf length in bytes                */
 682         u2 ch;                              /* current unicode character          */
 683         u4 len;
 684
 685         for (len = 0; len < u2_length; len++) {
 686                 /* next unicode character */
 687                 ch = *text++;
 688
 689                 /* determine bytes required to store unicode character as utf */
 690                 if (ch && (ch < 0x80))
 691                         result_len++;
 692                 else if (ch < 0x800)
 693                         result_len += 2;
 694                 else
 695                         result_len += 3;
 696         }
 697
 698     return result_len;
 699 }
 700
 701
 702 /* utf_display *****************************************************************
 703
 704    Write utf symbol to stdout (for debugging purposes).
 705
 706 *******************************************************************************/
 707
 708 void utf_display(utf *u)
 709 {
 710         char *endpos;                       /* points behind utf string           */
 711         char *utf_ptr;                      /* current position in utf text       */
 712
 713         if (!u) {
 714                 printf("NULL");
 715                 fflush(stdout);
 716                 return;
 717         }
 718
 719         endpos = utf_end(u);
 720         utf_ptr = u->text;
 721
 722         while (utf_ptr < endpos) {
 723                 /* read next unicode character */
 724                 u2 c = utf_nextu2(&utf_ptr);
 725                 if (c >= 32 && c <= 127) printf("%c", c);
 726                 else printf("?");
 727         }
 728
 729         fflush(stdout);
 730 }
 731
 732
 733 /* utf_display_classname *******************************************************
 734
 735    Write utf symbol to stdout with `/' converted to `.' (for debugging
 736    purposes).
 737
 738 *******************************************************************************/
 739
 740 void utf_display_classname(utf *u)
 741 {
 742         char *endpos;                       /* points behind utf string           */
 743         char *utf_ptr;                      /* current position in utf text       */
 744
 745         if (!u) {
 746                 printf("NULL");
 747                 fflush(stdout);
 748                 return;
 749         }
 750
 751         endpos = utf_end(u);
 752         utf_ptr = u->text;
 753
 754         while (utf_ptr < endpos) {
 755                 /* read next unicode character */
 756                 u2 c = utf_nextu2(&utf_ptr);
 757                 if (c == '/') c = '.';
 758                 if (c >= 32 && c <= 127) printf("%c", c);
 759                 else printf("?");
 760         }
 761
 762         fflush(stdout);
 763 }
 764
 765
 766 /* utf_sprint ******************************************************************
 767
 768    Write utf symbol into c-string (for debugging purposes).
 769
 770 *******************************************************************************/
 771
 772 void utf_sprint(char *buffer, utf *u)
 773 {
 774         char *endpos;                       /* points behind utf string           */
 775         char *utf_ptr;                      /* current position in utf text       */
 776         u2 pos = 0;                         /* position in c-string               */
 777
 778         if (!u) {
 779                 strcpy(buffer, "NULL");
 780                 return;
 781         }
 782
 783         endpos = utf_end(u);
 784         utf_ptr = u->text;
 785
 786         while (utf_ptr < endpos)
 787                 /* copy next unicode character */
 788                 buffer[pos++] = utf_nextu2(&utf_ptr);
 789
 790         /* terminate string */
 791         buffer[pos] = '\0';
 792 }
 793
 794
 795 /* utf_sprint_classname ********************************************************
 796
 797    Write utf symbol into c-string with `/' converted to `.' (for debugging
 798    purposes).
 799
 800 *******************************************************************************/
 801
 802 void utf_sprint_classname(char *buffer, utf *u)
 803 {
 804         char *endpos;                       /* points behind utf string           */
 805         char *utf_ptr;                      /* current position in utf text       */
 806         u2 pos = 0;                         /* position in c-string               */
 807
 808         if (!u) {
 809                 strcpy(buffer, "NULL");
 810                 return;
 811         }
 812
 813         endpos = utf_end(u);
 814         utf_ptr = u->text;
 815
 816         while (utf_ptr < endpos) {
 817                 /* copy next unicode character */
 818                 u2 c = utf_nextu2(&utf_ptr);
 819                 if (c == '/') c = '.';
 820                 buffer[pos++] = c;
 821         }
 822
 823         /* terminate string */
 824         buffer[pos] = '\0';
 825 }
 826
 827
 828 /* utf_strcat ******************************************************************
 829
 830    Like libc strcat, but uses an utf8 string.
 831
 832 *******************************************************************************/
 833
 834 void utf_strcat(char *buffer, utf *u)
 835 {
 836         utf_sprint(buffer + strlen(buffer), u);
 837 }
 838
 839
 840 /* utf_strcat_classname ********************************************************
 841
 842    Like libc strcat, but uses an utf8 string.
 843
 844 *******************************************************************************/
 845
 846 void utf_strcat_classname(char *buffer, utf *u)
 847 {
 848         utf_sprint_classname(buffer + strlen(buffer), u);
 849 }
 850
 851
 852 /* utf_fprint ******************************************************************
 853
 854    Write utf symbol into file.
 855
 856 *******************************************************************************/
 857
 858 void utf_fprint(FILE *file, utf *u)
 859 {
 860         char *endpos;                       /* points behind utf string           */
 861         char *utf_ptr;                      /* current position in utf text       */
 862
 863         if (!u)
 864                 return;
 865
 866         endpos = utf_end(u);
 867         utf_ptr = u->text;
 868
 869         while (utf_ptr < endpos) {
 870                 /* read next unicode character */
 871                 u2 c = utf_nextu2(&utf_ptr);
 872
 873                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 874                 else fprintf(file, "?");
 875         }
 876 }
 877
 878
 879 /* utf_fprint_classname ********************************************************
 880
 881    Write utf symbol into file with `/' converted to `.'.
 882
 883 *******************************************************************************/
 884
 885 void utf_fprint_classname(FILE *file, utf *u)
 886 {
 887         char *endpos;                       /* points behind utf string           */
 888         char *utf_ptr;                      /* current position in utf text       */
 889
 890     if (!u)
 891                 return;
 892
 893         endpos = utf_end(u);
 894         utf_ptr = u->text;
 895
 896         while (utf_ptr < endpos) {
 897                 /* read next unicode character */
 898                 u2 c = utf_nextu2(&utf_ptr);
 899                 if (c == '/') c = '.';
 900
 901                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 902                 else fprintf(file, "?");
 903         }
 904 }
 905
 906
 907 /* is_valid_utf ****************************************************************
 908
 909    Return true if the given string is a valid UTF-8 string.
 910
 911    utf_ptr...points to first character
 912    end_pos...points after last character
 913
 914 *******************************************************************************/
 915
 916 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
 917
 918 bool is_valid_utf(char *utf_ptr, char *end_pos)
 919 {
 920         int bytes;
 921         int len,i;
 922         char c;
 923         unsigned long v;
 924
 925         if (end_pos < utf_ptr) return false;
 926         bytes = end_pos - utf_ptr;
 927         while (bytes--) {
 928                 c = *utf_ptr++;
 929
 930                 if (!c) return false;                     /* 0x00 is not allowed */
 931                 if ((c & 0x80) == 0) continue;            /* ASCII */
 932
 933                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
 934                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
 935                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
 936                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
 937                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
 938                 else return false;                        /* invalid leading byte */
 939
 940                 if (len > 2) return false;                /* Java limitation */
 941
 942                 v = (unsigned long)c & (0x3f >> len);
 943
 944                 if ((bytes -= len) < 0) return false;     /* missing bytes */
 945
 946                 for (i = len; i--; ) {
 947                         c = *utf_ptr++;
 948                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
 949                                 return false;
 950                         v = (v << 6) | (c & 0x3f);
 951                 }
 952
 953                 if (v == 0) {
 954                         if (len != 1) return false;           /* Java special */
 955
 956                 } else {
 957                         /* Sun Java seems to allow overlong UTF-8 encodings */
 958
 959                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
 960                                 if (!opt_liberalutf)
 961                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
 962                                 /* XXX change this to panic? */
 963                         }
 964                 }
 965
 966                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
 967                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
 968
 969                 /* even these seem to be allowed */
 970                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
 971         }
 972
 973         return true;
 974 }
 975
 976
 977 /* is_valid_name ***************************************************************
 978
 979    Return true if the given string may be used as a class/field/method
 980    name. (Currently this only disallows empty strings and control
 981    characters.)
 982
 983    NOTE: The string is assumed to have passed is_valid_utf!
 984
 985    utf_ptr...points to first character
 986    end_pos...points after last character
 987
 988 *******************************************************************************/
 989
 990 bool is_valid_name(char *utf_ptr, char *end_pos)
 991 {
 992         if (end_pos <= utf_ptr) return false; /* disallow empty names */
 993
 994         while (utf_ptr < end_pos) {
 995                 unsigned char c = *utf_ptr++;
 996
 997                 if (c < 0x20) return false; /* disallow control characters */
 998                 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80)  /* disallow zero */
 999                         return false;
1000         }
1001
1002         return true;
1003 }
1004
1005 bool is_valid_name_utf(utf *u)
1006 {
1007         return is_valid_name(u->text,utf_end(u));
1008 }
1009
1010
1011 /* utf_show ********************************************************************
1012
1013    Writes the utf symbols in the utfhash to stdout and displays the
1014    number of external hash chains grouped according to the chainlength
1015    (for debugging purposes).
1016
1017 *******************************************************************************/
1018
1019 void utf_show(void)
1020 {
1021
1022 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1023
1024         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1025         u4 max_chainlength = 0;      /* maximum length of the chains */
1026         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1027         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1028         u4 i;
1029
1030         printf ("UTF-HASH:\n");
1031
1032         /* show element of utf-hashtable */
1033         for (i=0; i<utf_hash.size; i++) {
1034                 utf *u = utf_hash.ptr[i];
1035                 if (u) {
1036                         printf ("SLOT %d: ", (int) i);
1037                         while (u) {
1038                                 printf ("'");
1039                                 utf_display (u);
1040                                 printf ("' ");
1041                                 u = u->hashlink;
1042                         }
1043                         printf ("\n");
1044                 }
1045
1046         }
1047
1048         printf ("UTF-HASH: %d slots for %d entries\n",
1049                         (int) utf_hash.size, (int) utf_hash.entries );
1050
1051
1052         if (utf_hash.entries == 0)
1053                 return;
1054
1055         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1056
1057         for (i=0;i<CHAIN_LIMIT;i++)
1058                 chain_count[i]=0;
1059
1060         /* count numbers of hashchains according to their length */
1061         for (i=0; i<utf_hash.size; i++) {
1062
1063                 utf *u = (utf*) utf_hash.ptr[i];
1064                 u4 chain_length = 0;
1065
1066                 /* determine chainlength */
1067                 while (u) {
1068                         u = u->hashlink;
1069                         chain_length++;
1070                 }
1071
1072                 /* update sum of all chainlengths */
1073                 sum_chainlength+=chain_length;
1074
1075                 /* determine the maximum length of the chains */
1076                 if (chain_length>max_chainlength)
1077                         max_chainlength = chain_length;
1078
1079                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1080                 if (chain_length>=CHAIN_LIMIT) {
1081                         beyond_limit+=chain_length;
1082                         chain_length=CHAIN_LIMIT-1;
1083                 }
1084
1085                 /* update number of hashchains of current length */
1086                 chain_count[chain_length]++;
1087         }
1088
1089         /* display results */
1090         for (i=1;i<CHAIN_LIMIT-1;i++)
1091                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1092
1093         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1094
1095
1096         printf("max. chainlength:%5d\n",max_chainlength);
1097
1098         /* avg. chainlength = sum of chainlengths / number of chains */
1099         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1100 }
1101
1102
1103 /*
1104  * These are local overrides for various environment variables in Emacs.
1105  * Please do not remove this and leave it at the end of the file, where
1106  * Emacs will automagically detect them.
1107  * ---------------------------------------------------------------------
1108  * Local variables:
1109  * mode: c
1110  * indent-tabs-mode: t
1111  * c-basic-offset: 4
1112  * tab-width: 4
1113  * End:
1114  */