src/vm/utf8.c

   1 /* src/vm/utf8.c - utf functions
   2
   3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
   4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
   5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
   6    Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  23    02111-1307, USA.
  24
  25    Contact: cacao@complang.tuwien.ac.at
  26
  27    Authors: Reinhard Grafl
  28
  29    Changes: Mark Probst
  30             Andreas Krall
  31             Christian Thalinger
  32
  33    $Id: utf8.c 2458 2005-05-12 23:02:07Z twisti $
  34
  35 */
  36
  37
  38 #include <string.h>
  39
  40 #include "mm/memory.h"
  41 #include "vm/exceptions.h"
  42 #include "vm/options.h"
  43 #include "vm/statistics.h"
  44 #include "vm/stringlocal.h"
  45 #include "vm/tables.h"
  46 #include "vm/utf8.h"
  47
  48
  49 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
  50
  51
  52 /* utf-symbols for pointer comparison of frequently used strings **************/
  53
  54 utf *utf_java_lang_Object;              /* java/lang/Object                   */
  55
  56 utf *utf_java_lang_Class;
  57 utf *utf_java_lang_ClassLoader;
  58 utf *utf_java_lang_Cloneable;
  59 utf *utf_java_lang_SecurityManager;
  60 utf *utf_java_lang_String;
  61 utf *utf_java_lang_System;
  62 utf *utf_java_lang_ThreadGroup;
  63 utf *utf_java_io_Serializable;
  64
  65 utf *utf_java_lang_Throwable;
  66 utf *utf_java_lang_VMThrowable;
  67 utf *utf_java_lang_Error;
  68 utf *utf_java_lang_Exception;
  69 utf *utf_java_lang_NoClassDefFoundError;
  70 utf *utf_java_lang_OutOfMemoryError;
  71 utf *utf_java_lang_ClassNotFoundException;
  72
  73 utf* utf_java_lang_Void;
  74 utf* utf_java_lang_Boolean;
  75 utf* utf_java_lang_Byte;
  76 utf* utf_java_lang_Character;
  77 utf* utf_java_lang_Short;
  78 utf* utf_java_lang_Integer;
  79 utf* utf_java_lang_Long;
  80 utf* utf_java_lang_Float;
  81 utf* utf_java_lang_Double;
  82
  83 utf *utf_java_util_Vector;
  84 utf *utf_java_lang_reflect_Constructor;
  85 utf *utf_java_lang_reflect_Method;
  86
  87
  88 utf *utf_InnerClasses;                  /* InnerClasses                       */
  89 utf *utf_ConstantValue;                 /* ConstantValue                      */
  90 utf *utf_Code;                          /* Code                               */
  91 utf *utf_Exceptions;                    /* Exceptions                         */
  92 utf *utf_LineNumberTable;               /* LineNumberTable                    */
  93 utf *utf_SourceFile;                    /* SourceFile                         */
  94
  95 utf *utf_init;                          /* <init>                             */
  96 utf *utf_clinit;                        /* <clinit>                           */
  97 utf *utf_finalize;                      /* finalize                           */
  98
  99 utf *utf_printStackTrace;
 100 utf *utf_fillInStackTrace;
 101 utf *utf_loadClass;
 102
 103 utf *utf_void__void;                    /* ()V                                */
 104 utf *utf_boolean__void;                 /* (Z)V                               */
 105 utf *utf_byte__void;                    /* (B)V                               */
 106 utf *utf_char__void;                    /* (C)V                               */
 107 utf *utf_short__void;                   /* (S)V                               */
 108 utf *utf_int__void;                     /* (I)V                               */
 109 utf *utf_long__void;                    /* (J)V                               */
 110 utf *utf_float__void;                   /* (F)V                               */
 111 utf *utf_double__void;                  /* (D)V                               */
 112 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 113 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 114 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 115 utf *utf_java_lang_String__java_lang_Class;
 116 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 117
 118 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
 119
 120 utf *array_packagename;
 121
 122
 123 /* utf_init ********************************************************************
 124
 125    Initializes the utf8 subsystem.
 126
 127 *******************************************************************************/
 128
 129 void utf8_init(void)
 130 {
 131         /* create utf-symbols for pointer comparison of frequently used strings */
 132
 133         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 134
 135         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 136         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 137         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 138         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 139         utf_java_lang_String           = utf_new_char("java/lang/String");
 140         utf_java_lang_System           = utf_new_char("java/lang/System");
 141         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 142         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 143
 144         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
 145         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
 146         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
 147         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
 148
 149         utf_java_lang_NoClassDefFoundError =
 150                 utf_new_char(string_java_lang_NoClassDefFoundError);
 151
 152         utf_java_lang_OutOfMemoryError =
 153                 utf_new_char(string_java_lang_OutOfMemoryError);
 154
 155         utf_java_lang_ClassNotFoundException =
 156                 utf_new_char(string_java_lang_ClassNotFoundException);
 157
 158         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 159         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 160         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 161         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 162         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 163         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 164         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 165         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 166         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 167
 168         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 169         utf_java_lang_reflect_Constructor = utf_new_char("java/lang/reflect/Constructor");
 170         utf_java_lang_reflect_Method      = utf_new_char("java/lang/reflect/Method");
 171
 172         utf_InnerClasses               = utf_new_char("InnerClasses");
 173         utf_ConstantValue              = utf_new_char("ConstantValue");
 174         utf_Code                       = utf_new_char("Code");
 175         utf_Exceptions                 = utf_new_char("Exceptions");
 176         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 177         utf_SourceFile                 = utf_new_char("SourceFile");
 178
 179         utf_init                           = utf_new_char("<init>");
 180         utf_clinit                         = utf_new_char("<clinit>");
 181         utf_finalize                   = utf_new_char("finalize");
 182
 183         utf_printStackTrace            = utf_new_char("printStackTrace");
 184         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 185         utf_loadClass                  = utf_new_char("loadClass");
 186
 187         utf_void__void                 = utf_new_char("()V");
 188         utf_boolean__void              = utf_new_char("(Z)V");
 189         utf_byte__void                 = utf_new_char("(B)V");
 190         utf_char__void                 = utf_new_char("(C)V");
 191         utf_short__void                = utf_new_char("(S)V");
 192         utf_int__void                  = utf_new_char("(I)V");
 193         utf_long__void                 = utf_new_char("(J)V");
 194         utf_float__void                = utf_new_char("(F)V");
 195         utf_double__void               = utf_new_char("(D)V");
 196         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 197         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 198         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 199
 200         utf_java_lang_String__java_lang_Class =
 201                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 202
 203         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 204
 205         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 206
 207         array_packagename              = utf_new_char("\t<the array package>");
 208 }
 209
 210
 211 /* utf_hashkey *****************************************************************
 212
 213    The hashkey is computed from the utf-text by using up to 8
 214    characters.  For utf-symbols longer than 15 characters 3 characters
 215    are taken from the beginning and the end, 2 characters are taken
 216    from the middle.
 217
 218 *******************************************************************************/
 219
 220 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 221 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 222
 223 u4 utf_hashkey(const char *text, u4 length)
 224 {
 225         const char *start_pos = text;       /* pointer to utf text                */
 226         u4 a;
 227
 228         switch (length) {
 229         case 0: /* empty string */
 230                 return 0;
 231
 232         case 1: return fbs(0);
 233         case 2: return fbs(0) ^ nbs(3);
 234         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 235         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 236         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 237         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 238         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 239         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 240
 241         case 9:
 242                 a = fbs(0);
 243                 a ^= nbs(1);
 244                 a ^= nbs(2);
 245                 text++;
 246                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 247
 248         case 10:
 249                 a = fbs(0);
 250                 text++;
 251                 a ^= nbs(2);
 252                 a ^= nbs(3);
 253                 a ^= nbs(4);
 254                 text++;
 255                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 256
 257         case 11:
 258                 a = fbs(0);
 259                 text++;
 260                 a ^= nbs(2);
 261                 a ^= nbs(3);
 262                 a ^= nbs(4);
 263                 text++;
 264                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 265
 266         case 12:
 267                 a = fbs(0);
 268                 text += 2;
 269                 a ^= nbs(2);
 270                 a ^= nbs(3);
 271                 text++;
 272                 a ^= nbs(5);
 273                 a ^= nbs(6);
 274                 a ^= nbs(7);
 275                 text++;
 276                 return a ^ nbs(9) ^ nbs(10);
 277
 278         case 13:
 279                 a = fbs(0);
 280                 a ^= nbs(1);
 281                 text++;
 282                 a ^= nbs(3);
 283                 a ^= nbs(4);
 284                 text += 2;
 285                 a ^= nbs(7);
 286                 a ^= nbs(8);
 287                 text += 2;
 288                 return a ^ nbs(9) ^ nbs(10);
 289
 290         case 14:
 291                 a = fbs(0);
 292                 text += 2;
 293                 a ^= nbs(3);
 294                 a ^= nbs(4);
 295                 text += 2;
 296                 a ^= nbs(7);
 297                 a ^= nbs(8);
 298                 text += 2;
 299                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 300
 301         case 15:
 302                 a = fbs(0);
 303                 text += 2;
 304                 a ^= nbs(3);
 305                 a ^= nbs(4);
 306                 text += 2;
 307                 a ^= nbs(7);
 308                 a ^= nbs(8);
 309                 text += 2;
 310                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 311
 312         default:  /* 3 characters from beginning */
 313                 a = fbs(0);
 314                 text += 2;
 315                 a ^= nbs(3);
 316                 a ^= nbs(4);
 317
 318                 /* 2 characters from middle */
 319                 text = start_pos + (length / 2);
 320                 a ^= fbs(5);
 321                 text += 2;
 322                 a ^= nbs(6);
 323
 324                 /* 3 characters from end */
 325                 text = start_pos + length - 4;
 326
 327                 a ^= fbs(7);
 328                 text++;
 329
 330                 return a ^ nbs(10) ^ nbs(11);
 331     }
 332 }
 333
 334
 335 /* utf_hashkey *****************************************************************
 336
 337    Compute the hashkey of a unicode string.
 338
 339 *******************************************************************************/
 340
 341 u4 unicode_hashkey(u2 *text, u2 len)
 342 {
 343         return utf_hashkey((char *) text, len);
 344 }
 345
 346
 347 /* utf_new *********************************************************************
 348
 349    Creates a new utf-symbol, the text of the symbol is passed as a
 350    u1-array. The function searches the utf-hashtable for a utf-symbol
 351    with this text. On success the element returned, otherwise a new
 352    hashtable element is created.
 353
 354    If the number of entries in the hashtable exceeds twice the size of
 355    the hashtable slots a reorganization of the hashtable is done and
 356    the utf symbols are copied to a new hashtable with doubled size.
 357
 358 *******************************************************************************/
 359
 360 utf *utf_new_intern(const char *text, u2 length);
 361
 362 utf *utf_new(const char *text, u2 length)
 363 {
 364     utf *r;
 365
 366 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 367     tables_lock();
 368 #endif
 369
 370     r = utf_new_intern(text, length);
 371
 372 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 373     tables_unlock();
 374 #endif
 375
 376     return r;
 377 }
 378
 379
 380 utf *utf_new_intern(const char *text, u2 length)
 381 {
 382         u4 key;                             /* hashkey computed from utf-text     */
 383         u4 slot;                            /* slot in hashtable                  */
 384         utf *u;                             /* hashtable element                  */
 385         u2 i;
 386
 387 #ifdef STATISTICS
 388         if (opt_stat)
 389                 count_utf_new++;
 390 #endif
 391
 392         key  = utf_hashkey(text, length);
 393         slot = key & (utf_hash.size - 1);
 394         u    = utf_hash.ptr[slot];
 395
 396         /* search external hash chain for utf-symbol */
 397         while (u) {
 398                 if (u->blength == length) {
 399
 400                         /* compare text of hashtable elements */
 401                         for (i = 0; i < length; i++)
 402                                 if (text[i] != u->text[i]) goto nomatch;
 403
 404 #ifdef STATISTICS
 405                         if (opt_stat)
 406                                 count_utf_new_found++;
 407 #endif
 408
 409                         /* symbol found in hashtable */
 410                         return u;
 411                 }
 412         nomatch:
 413                 u = u->hashlink; /* next element in external chain */
 414         }
 415
 416 #ifdef STATISTICS
 417         if (opt_stat)
 418                 count_utf_len += sizeof(utf) + length;
 419 #endif
 420
 421         /* location in hashtable found, create new utf element */
 422         u = NEW(utf);
 423         u->blength  = length;               /* length in bytes of utfstring       */
 424         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
 425         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 426         memcpy(u->text, text, length);      /* copy utf-text                      */
 427         u->text[length] = '\0';
 428         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
 429
 430         utf_hash.entries++;                 /* update number of entries           */
 431
 432         if (utf_hash.entries > (utf_hash.size * 2)) {
 433
 434         /* reorganization of hashtable, average length of
 435            the external chains is approx. 2                */
 436
 437                 u4 i;
 438                 utf *u;
 439                 hashtable newhash; /* the new hashtable */
 440
 441                 /* create new hashtable, double the size */
 442                 init_hashtable(&newhash, utf_hash.size * 2);
 443                 newhash.entries = utf_hash.entries;
 444
 445 #ifdef STATISTICS
 446                 if (opt_stat)
 447                         count_utf_len += sizeof(utf*) * utf_hash.size;
 448 #endif
 449
 450                 /* transfer elements to new hashtable */
 451                 for (i = 0; i < utf_hash.size; i++) {
 452                         u = (utf *) utf_hash.ptr[i];
 453                         while (u) {
 454                                 utf *nextu = u->hashlink;
 455                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
 456
 457                                 u->hashlink = (utf *) newhash.ptr[slot];
 458                                 newhash.ptr[slot] = u;
 459
 460                                 /* follow link in external hash chain */
 461                                 u = nextu;
 462                         }
 463                 }
 464
 465                 /* dispose old table */
 466                 MFREE(utf_hash.ptr, void*, utf_hash.size);
 467                 utf_hash = newhash;
 468         }
 469
 470         return u;
 471 }
 472
 473
 474 /* utf_new_u2 ******************************************************************
 475
 476    Make utf symbol from u2 array, if isclassname is true '.' is
 477    replaced by '/'.
 478
 479 *******************************************************************************/
 480
 481 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 482 {
 483         char *buffer;                   /* memory buffer for  unicode characters  */
 484         char *pos;                      /* pointer to current position in buffer  */
 485         u4 left;                        /* unicode characters left                */
 486         u4 buflength;                   /* utf length in bytes of the u2 array    */
 487         utf *result;                    /* resulting utf-string                   */
 488         int i;
 489
 490         /* determine utf length in bytes and allocate memory */
 491
 492         buflength = u2_utflength(unicode_pos, unicode_length);
 493         buffer    = MNEW(char, buflength);
 494
 495         left = buflength;
 496         pos  = buffer;
 497
 498         for (i = 0; i++ < unicode_length; unicode_pos++) {
 499                 /* next unicode character */
 500                 u2 c = *unicode_pos;
 501
 502                 if ((c != 0) && (c < 0x80)) {
 503                         /* 1 character */
 504                         left--;
 505                 if ((int) left < 0) break;
 506                         /* convert classname */
 507                         if (isclassname && c == '.')
 508                                 *pos++ = '/';
 509                         else
 510                                 *pos++ = (char) c;
 511
 512                 } else if (c < 0x800) {
 513                         /* 2 characters */
 514                 unsigned char high = c >> 6;
 515                 unsigned char low  = c & 0x3F;
 516                         left = left - 2;
 517                 if ((int) left < 0) break;
 518                 *pos++ = high | 0xC0;
 519                 *pos++ = low  | 0x80;
 520
 521                 } else {
 522                 /* 3 characters */
 523                 char low  = c & 0x3f;
 524                 char mid  = (c >> 6) & 0x3F;
 525                 char high = c >> 12;
 526                         left = left - 3;
 527                 if ((int) left < 0) break;
 528                 *pos++ = high | 0xE0;
 529                 *pos++ = mid  | 0x80;
 530                 *pos++ = low  | 0x80;
 531                 }
 532         }
 533
 534         /* insert utf-string into symbol-table */
 535         result = utf_new(buffer,buflength);
 536
 537         MFREE(buffer, char, buflength);
 538
 539         return result;
 540 }
 541
 542
 543 /* utf_new_char ****************************************************************
 544
 545    Creates a new utf symbol, the text for this symbol is passed as a
 546    c-string ( = char* ).
 547
 548 *******************************************************************************/
 549
 550 utf *utf_new_char(const char *text)
 551 {
 552         return utf_new(text, strlen(text));
 553 }
 554
 555
 556 /* utf_new_char_classname ******************************************************
 557
 558    Creates a new utf symbol, the text for this symbol is passed as a
 559    c-string ( = char* ) "." characters are going to be replaced by
 560    "/". Since the above function is used often, this is a separte
 561    function, instead of an if.
 562
 563 *******************************************************************************/
 564
 565 utf *utf_new_char_classname(const char *text)
 566 {
 567         if (strchr(text, '.')) {
 568                 char *txt = strdup(text);
 569                 char *end = txt + strlen(txt);
 570                 char *c;
 571                 utf *tmpRes;
 572
 573                 for (c = txt; c < end; c++)
 574                         if (*c == '.') *c = '/';
 575
 576                 tmpRes = utf_new(txt, strlen(txt));
 577                 FREE(txt, 0);
 578
 579                 return tmpRes;
 580
 581         } else
 582                 return utf_new(text, strlen(text));
 583 }
 584
 585
 586 /* utf_nextu2 ******************************************************************
 587
 588    Read the next unicode character from the utf string and increment
 589    the utf-string pointer accordingly.
 590
 591 *******************************************************************************/
 592
 593 u2 utf_nextu2(char **utf_ptr)
 594 {
 595     /* uncompressed unicode character */
 596     u2 unicode_char = 0;
 597     /* current position in utf text */
 598     unsigned char *utf = (unsigned char *) (*utf_ptr);
 599     /* bytes representing the unicode character */
 600     unsigned char ch1, ch2, ch3;
 601     /* number of bytes used to represent the unicode character */
 602     int len = 0;
 603
 604     switch ((ch1 = utf[0]) >> 4) {
 605         default: /* 1 byte */
 606                 (*utf_ptr)++;
 607                 return (u2) ch1;
 608         case 0xC:
 609         case 0xD: /* 2 bytes */
 610                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 611                         unsigned char high = ch1 & 0x1F;
 612                         unsigned char low  = ch2 & 0x3F;
 613                         unicode_char = (high << 6) + low;
 614                         len = 2;
 615                 }
 616                 break;
 617
 618         case 0xE: /* 2 or 3 bytes */
 619                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 620                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 621                                 unsigned char low  = ch3 & 0x3f;
 622                                 unsigned char mid  = ch2 & 0x3f;
 623                                 unsigned char high = ch1 & 0x0f;
 624                                 unicode_char = (((high << 6) + mid) << 6) + low;
 625                                 len = 3;
 626                         } else
 627                                 len = 2;
 628                 }
 629                 break;
 630     }
 631
 632     /* update position in utf-text */
 633     *utf_ptr = (char *) (utf + len);
 634
 635     return unicode_char;
 636 }
 637
 638
 639 /* utf_strlen ******************************************************************
 640
 641    Determine number of unicode characters in the utf string.
 642
 643 *******************************************************************************/
 644
 645 u4 utf_strlen(utf *u)
 646 {
 647         char *endpos;                       /* points behind utf string           */
 648         char *utf_ptr;                      /* current position in utf text       */
 649         u4 len = 0;                         /* number of unicode characters       */
 650
 651         if (!u) {
 652                 *exceptionptr = new_nullpointerexception();
 653                 return 0;
 654         }
 655
 656         endpos = utf_end(u);
 657         utf_ptr = u->text;
 658
 659         while (utf_ptr < endpos) {
 660                 len++;
 661                 /* next unicode character */
 662                 utf_nextu2(&utf_ptr);
 663         }
 664
 665         if (utf_ptr != endpos)
 666                 /* string ended abruptly */
 667                 throw_cacao_exception_exit(string_java_lang_InternalError,
 668                                                                    "Illegal utf8 string");
 669
 670         return len;
 671 }
 672
 673
 674 /* u2_utflength ****************************************************************
 675
 676    Returns the utf length in bytes of a u2 array.
 677
 678 *******************************************************************************/
 679
 680 u4 u2_utflength(u2 *text, u4 u2_length)
 681 {
 682         u4 result_len = 0;                  /* utf length in bytes                */
 683         u2 ch;                              /* current unicode character          */
 684         u4 len;
 685
 686         for (len = 0; len < u2_length; len++) {
 687                 /* next unicode character */
 688                 ch = *text++;
 689
 690                 /* determine bytes required to store unicode character as utf */
 691                 if (ch && (ch < 0x80))
 692                         result_len++;
 693                 else if (ch < 0x800)
 694                         result_len += 2;
 695                 else
 696                         result_len += 3;
 697         }
 698
 699     return result_len;
 700 }
 701
 702
 703 /* utf_display *****************************************************************
 704
 705    Write utf symbol to stdout (for debugging purposes).
 706
 707 *******************************************************************************/
 708
 709 void utf_display(utf *u)
 710 {
 711         char *endpos;                       /* points behind utf string           */
 712         char *utf_ptr;                      /* current position in utf text       */
 713
 714         if (!u) {
 715                 printf("NULL");
 716                 fflush(stdout);
 717                 return;
 718         }
 719
 720         endpos = utf_end(u);
 721         utf_ptr = u->text;
 722
 723         while (utf_ptr < endpos) {
 724                 /* read next unicode character */
 725                 u2 c = utf_nextu2(&utf_ptr);
 726                 if (c >= 32 && c <= 127) printf("%c", c);
 727                 else printf("?");
 728         }
 729
 730         fflush(stdout);
 731 }
 732
 733
 734 /* utf_display_classname *******************************************************
 735
 736    Write utf symbol to stdout with `/' converted to `.' (for debugging
 737    purposes).
 738
 739 *******************************************************************************/
 740
 741 void utf_display_classname(utf *u)
 742 {
 743         char *endpos;                       /* points behind utf string           */
 744         char *utf_ptr;                      /* current position in utf text       */
 745
 746         if (!u) {
 747                 printf("NULL");
 748                 fflush(stdout);
 749                 return;
 750         }
 751
 752         endpos = utf_end(u);
 753         utf_ptr = u->text;
 754
 755         while (utf_ptr < endpos) {
 756                 /* read next unicode character */
 757                 u2 c = utf_nextu2(&utf_ptr);
 758                 if (c == '/') c = '.';
 759                 if (c >= 32 && c <= 127) printf("%c", c);
 760                 else printf("?");
 761         }
 762
 763         fflush(stdout);
 764 }
 765
 766
 767 /* utf_sprint ******************************************************************
 768
 769    Write utf symbol into c-string (for debugging purposes).
 770
 771 *******************************************************************************/
 772
 773 void utf_sprint(char *buffer, utf *u)
 774 {
 775         char *endpos;                       /* points behind utf string           */
 776         char *utf_ptr;                      /* current position in utf text       */
 777         u2 pos = 0;                         /* position in c-string               */
 778
 779         if (!u) {
 780                 strcpy(buffer, "NULL");
 781                 return;
 782         }
 783
 784         endpos = utf_end(u);
 785         utf_ptr = u->text;
 786
 787         while (utf_ptr < endpos)
 788                 /* copy next unicode character */
 789                 buffer[pos++] = utf_nextu2(&utf_ptr);
 790
 791         /* terminate string */
 792         buffer[pos] = '\0';
 793 }
 794
 795
 796 /* utf_sprint_classname ********************************************************
 797
 798    Write utf symbol into c-string with `/' converted to `.' (for debugging
 799    purposes).
 800
 801 *******************************************************************************/
 802
 803 void utf_sprint_classname(char *buffer, utf *u)
 804 {
 805         char *endpos;                       /* points behind utf string           */
 806         char *utf_ptr;                      /* current position in utf text       */
 807         u2 pos = 0;                         /* position in c-string               */
 808
 809         if (!u) {
 810                 strcpy(buffer, "NULL");
 811                 return;
 812         }
 813
 814         endpos = utf_end(u);
 815         utf_ptr = u->text;
 816
 817         while (utf_ptr < endpos) {
 818                 /* copy next unicode character */
 819                 u2 c = utf_nextu2(&utf_ptr);
 820                 if (c == '/') c = '.';
 821                 buffer[pos++] = c;
 822         }
 823
 824         /* terminate string */
 825         buffer[pos] = '\0';
 826 }
 827
 828
 829 /* utf_strcat ******************************************************************
 830
 831    Like libc strcat, but uses an utf8 string.
 832
 833 *******************************************************************************/
 834
 835 void utf_strcat(char *buffer, utf *u)
 836 {
 837         utf_sprint(buffer + strlen(buffer), u);
 838 }
 839
 840
 841 /* utf_strcat_classname ********************************************************
 842
 843    Like libc strcat, but uses an utf8 string.
 844
 845 *******************************************************************************/
 846
 847 void utf_strcat_classname(char *buffer, utf *u)
 848 {
 849         utf_sprint_classname(buffer + strlen(buffer), u);
 850 }
 851
 852
 853 /* utf_fprint ******************************************************************
 854
 855    Write utf symbol into file.
 856
 857 *******************************************************************************/
 858
 859 void utf_fprint(FILE *file, utf *u)
 860 {
 861         char *endpos;                       /* points behind utf string           */
 862         char *utf_ptr;                      /* current position in utf text       */
 863
 864         if (!u)
 865                 return;
 866
 867         endpos = utf_end(u);
 868         utf_ptr = u->text;
 869
 870         while (utf_ptr < endpos) {
 871                 /* read next unicode character */
 872                 u2 c = utf_nextu2(&utf_ptr);
 873
 874                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 875                 else fprintf(file, "?");
 876         }
 877 }
 878
 879
 880 /* utf_fprint_classname ********************************************************
 881
 882    Write utf symbol into file with `/' converted to `.'.
 883
 884 *******************************************************************************/
 885
 886 void utf_fprint_classname(FILE *file, utf *u)
 887 {
 888         char *endpos;                       /* points behind utf string           */
 889         char *utf_ptr;                      /* current position in utf text       */
 890
 891     if (!u)
 892                 return;
 893
 894         endpos = utf_end(u);
 895         utf_ptr = u->text;
 896
 897         while (utf_ptr < endpos) {
 898                 /* read next unicode character */
 899                 u2 c = utf_nextu2(&utf_ptr);
 900                 if (c == '/') c = '.';
 901
 902                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 903                 else fprintf(file, "?");
 904         }
 905 }
 906
 907
 908 /* is_valid_utf ****************************************************************
 909
 910    Return true if the given string is a valid UTF-8 string.
 911
 912    utf_ptr...points to first character
 913    end_pos...points after last character
 914
 915 *******************************************************************************/
 916
 917 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
 918
 919 bool is_valid_utf(char *utf_ptr, char *end_pos)
 920 {
 921         int bytes;
 922         int len,i;
 923         char c;
 924         unsigned long v;
 925
 926         if (end_pos < utf_ptr) return false;
 927         bytes = end_pos - utf_ptr;
 928         while (bytes--) {
 929                 c = *utf_ptr++;
 930
 931                 if (!c) return false;                     /* 0x00 is not allowed */
 932                 if ((c & 0x80) == 0) continue;            /* ASCII */
 933
 934                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
 935                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
 936                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
 937                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
 938                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
 939                 else return false;                        /* invalid leading byte */
 940
 941                 if (len > 2) return false;                /* Java limitation */
 942
 943                 v = (unsigned long)c & (0x3f >> len);
 944
 945                 if ((bytes -= len) < 0) return false;     /* missing bytes */
 946
 947                 for (i = len; i--; ) {
 948                         c = *utf_ptr++;
 949                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
 950                                 return false;
 951                         v = (v << 6) | (c & 0x3f);
 952                 }
 953
 954                 if (v == 0) {
 955                         if (len != 1) return false;           /* Java special */
 956
 957                 } else {
 958                         /* Sun Java seems to allow overlong UTF-8 encodings */
 959
 960                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
 961                                 if (!opt_liberalutf)
 962                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
 963                                 /* XXX change this to panic? */
 964                         }
 965                 }
 966
 967                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
 968                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
 969
 970                 /* even these seem to be allowed */
 971                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
 972         }
 973
 974         return true;
 975 }
 976
 977
 978 /* is_valid_name ***************************************************************
 979
 980    Return true if the given string may be used as a class/field/method
 981    name. (Currently this only disallows empty strings and control
 982    characters.)
 983
 984    NOTE: The string is assumed to have passed is_valid_utf!
 985
 986    utf_ptr...points to first character
 987    end_pos...points after last character
 988
 989 *******************************************************************************/
 990
 991 bool is_valid_name(char *utf_ptr, char *end_pos)
 992 {
 993         if (end_pos <= utf_ptr) return false; /* disallow empty names */
 994
 995         while (utf_ptr < end_pos) {
 996                 unsigned char c = *utf_ptr++;
 997
 998                 if (c < 0x20) return false; /* disallow control characters */
 999                 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80)  /* disallow zero */
1000                         return false;
1001         }
1002
1003         return true;
1004 }
1005
1006 bool is_valid_name_utf(utf *u)
1007 {
1008         return is_valid_name(u->text,utf_end(u));
1009 }
1010
1011
1012 /* utf_show ********************************************************************
1013
1014    Writes the utf symbols in the utfhash to stdout and displays the
1015    number of external hash chains grouped according to the chainlength
1016    (for debugging purposes).
1017
1018 *******************************************************************************/
1019
1020 void utf_show(void)
1021 {
1022
1023 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1024
1025         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1026         u4 max_chainlength = 0;      /* maximum length of the chains */
1027         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1028         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1029         u4 i;
1030
1031         printf ("UTF-HASH:\n");
1032
1033         /* show element of utf-hashtable */
1034         for (i=0; i<utf_hash.size; i++) {
1035                 utf *u = utf_hash.ptr[i];
1036                 if (u) {
1037                         printf ("SLOT %d: ", (int) i);
1038                         while (u) {
1039                                 printf ("'");
1040                                 utf_display (u);
1041                                 printf ("' ");
1042                                 u = u->hashlink;
1043                         }
1044                         printf ("\n");
1045                 }
1046
1047         }
1048
1049         printf ("UTF-HASH: %d slots for %d entries\n",
1050                         (int) utf_hash.size, (int) utf_hash.entries );
1051
1052
1053         if (utf_hash.entries == 0)
1054                 return;
1055
1056         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1057
1058         for (i=0;i<CHAIN_LIMIT;i++)
1059                 chain_count[i]=0;
1060
1061         /* count numbers of hashchains according to their length */
1062         for (i=0; i<utf_hash.size; i++) {
1063
1064                 utf *u = (utf*) utf_hash.ptr[i];
1065                 u4 chain_length = 0;
1066
1067                 /* determine chainlength */
1068                 while (u) {
1069                         u = u->hashlink;
1070                         chain_length++;
1071                 }
1072
1073                 /* update sum of all chainlengths */
1074                 sum_chainlength+=chain_length;
1075
1076                 /* determine the maximum length of the chains */
1077                 if (chain_length>max_chainlength)
1078                         max_chainlength = chain_length;
1079
1080                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1081                 if (chain_length>=CHAIN_LIMIT) {
1082                         beyond_limit+=chain_length;
1083                         chain_length=CHAIN_LIMIT-1;
1084                 }
1085
1086                 /* update number of hashchains of current length */
1087                 chain_count[chain_length]++;
1088         }
1089
1090         /* display results */
1091         for (i=1;i<CHAIN_LIMIT-1;i++)
1092                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1093
1094         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1095
1096
1097         printf("max. chainlength:%5d\n",max_chainlength);
1098
1099         /* avg. chainlength = sum of chainlengths / number of chains */
1100         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1101 }
1102
1103
1104 /*
1105  * These are local overrides for various environment variables in Emacs.
1106  * Please do not remove this and leave it at the end of the file, where
1107  * Emacs will automagically detect them.
1108  * ---------------------------------------------------------------------
1109  * Local variables:
1110  * mode: c
1111  * indent-tabs-mode: t
1112  * c-basic-offset: 4
1113  * tab-width: 4
1114  * End:
1115  */