src/vm/utf8.c

   1 /* src/vm/utf.c - utf functions
   2
   3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
   4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
   5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
   6    Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  23    02111-1307, USA.
  24
  25    Contact: cacao@complang.tuwien.ac.at
  26
  27    Authors: Reinhard Grafl
  28
  29    Changes: Mark Probst
  30             Andreas Krall
  31             Christian Thalinger
  32
  33    $Id: utf8.c 2560 2005-06-06 15:20:41Z twisti $
  34
  35 */
  36
  37
  38 #include <string.h>
  39
  40 #include "mm/memory.h"
  41 #include "vm/exceptions.h"
  42 #include "vm/options.h"
  43 #include "vm/statistics.h"
  44 #include "vm/stringlocal.h"
  45 #include "vm/tables.h"
  46 #include "vm/utf8.h"
  47
  48
  49 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
  50
  51
  52 /* utf-symbols for pointer comparison of frequently used strings **************/
  53
  54 utf *utf_java_lang_Object;              /* java/lang/Object                   */
  55
  56 utf *utf_java_lang_Class;
  57 utf *utf_java_lang_ClassLoader;
  58 utf *utf_java_lang_Cloneable;
  59 utf *utf_java_lang_SecurityManager;
  60 utf *utf_java_lang_String;
  61 utf *utf_java_lang_System;
  62 utf *utf_java_lang_ThreadGroup;
  63 utf *utf_java_io_Serializable;
  64
  65 utf *utf_java_lang_Throwable;
  66 utf *utf_java_lang_VMThrowable;
  67 utf *utf_java_lang_Error;
  68 utf *utf_java_lang_Exception;
  69 utf *utf_java_lang_NoClassDefFoundError;
  70 utf *utf_java_lang_OutOfMemoryError;
  71 utf *utf_java_lang_ClassNotFoundException;
  72
  73 utf* utf_java_lang_Void;
  74 utf* utf_java_lang_Boolean;
  75 utf* utf_java_lang_Byte;
  76 utf* utf_java_lang_Character;
  77 utf* utf_java_lang_Short;
  78 utf* utf_java_lang_Integer;
  79 utf* utf_java_lang_Long;
  80 utf* utf_java_lang_Float;
  81 utf* utf_java_lang_Double;
  82
  83 utf *utf_java_util_Vector;
  84 utf *utf_java_lang_reflect_Constructor;
  85 utf *utf_java_lang_reflect_Method;
  86
  87
  88 utf *utf_InnerClasses;                  /* InnerClasses                       */
  89 utf *utf_ConstantValue;                 /* ConstantValue                      */
  90 utf *utf_Code;                          /* Code                               */
  91 utf *utf_Exceptions;                    /* Exceptions                         */
  92 utf *utf_LineNumberTable;               /* LineNumberTable                    */
  93 utf *utf_SourceFile;                    /* SourceFile                         */
  94
  95 utf *utf_init;                          /* <init>                             */
  96 utf *utf_clinit;                        /* <clinit>                           */
  97 utf *utf_clone;                         /* clone                              */
  98 utf *utf_finalize;                      /* finalize                           */
  99
 100 utf *utf_printStackTrace;
 101 utf *utf_fillInStackTrace;
 102 utf *utf_loadClass;
 103
 104 utf *utf_void__void;                    /* ()V                                */
 105 utf *utf_boolean__void;                 /* (Z)V                               */
 106 utf *utf_byte__void;                    /* (B)V                               */
 107 utf *utf_char__void;                    /* (C)V                               */
 108 utf *utf_short__void;                   /* (S)V                               */
 109 utf *utf_int__void;                     /* (I)V                               */
 110 utf *utf_long__void;                    /* (J)V                               */
 111 utf *utf_float__void;                   /* (F)V                               */
 112 utf *utf_double__void;                  /* (D)V                               */
 113 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 114 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 115 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 116 utf *utf_java_lang_String__java_lang_Class;
 117 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 118
 119 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
 120
 121 utf *array_packagename;
 122
 123
 124 /* utf_init ********************************************************************
 125
 126    Initializes the utf8 subsystem.
 127
 128 *******************************************************************************/
 129
 130 void utf8_init(void)
 131 {
 132         /* create utf-symbols for pointer comparison of frequently used strings */
 133
 134         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 135
 136         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 137         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 138         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 139         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 140         utf_java_lang_String           = utf_new_char("java/lang/String");
 141         utf_java_lang_System           = utf_new_char("java/lang/System");
 142         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 143         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 144
 145         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
 146         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
 147         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
 148         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
 149
 150         utf_java_lang_NoClassDefFoundError =
 151                 utf_new_char(string_java_lang_NoClassDefFoundError);
 152
 153         utf_java_lang_OutOfMemoryError =
 154                 utf_new_char(string_java_lang_OutOfMemoryError);
 155
 156         utf_java_lang_ClassNotFoundException =
 157                 utf_new_char(string_java_lang_ClassNotFoundException);
 158
 159         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 160         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 161         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 162         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 163         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 164         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 165         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 166         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 167         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 168
 169         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 170
 171         utf_java_lang_reflect_Constructor =
 172                 utf_new_char("java/lang/reflect/Constructor");
 173
 174         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
 175
 176         utf_InnerClasses               = utf_new_char("InnerClasses");
 177         utf_ConstantValue              = utf_new_char("ConstantValue");
 178         utf_Code                       = utf_new_char("Code");
 179         utf_Exceptions                 = utf_new_char("Exceptions");
 180         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 181         utf_SourceFile                 = utf_new_char("SourceFile");
 182
 183         utf_init                           = utf_new_char("<init>");
 184         utf_clinit                         = utf_new_char("<clinit>");
 185         utf_clone                      = utf_new_char("clone");
 186         utf_finalize                   = utf_new_char("finalize");
 187
 188         utf_printStackTrace            = utf_new_char("printStackTrace");
 189         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 190         utf_loadClass                  = utf_new_char("loadClass");
 191
 192         utf_void__void                 = utf_new_char("()V");
 193         utf_boolean__void              = utf_new_char("(Z)V");
 194         utf_byte__void                 = utf_new_char("(B)V");
 195         utf_char__void                 = utf_new_char("(C)V");
 196         utf_short__void                = utf_new_char("(S)V");
 197         utf_int__void                  = utf_new_char("(I)V");
 198         utf_long__void                 = utf_new_char("(J)V");
 199         utf_float__void                = utf_new_char("(F)V");
 200         utf_double__void               = utf_new_char("(D)V");
 201         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 202         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 203         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 204
 205         utf_java_lang_String__java_lang_Class =
 206                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 207
 208         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 209
 210         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 211
 212         array_packagename              = utf_new_char("\t<the array package>");
 213 }
 214
 215
 216 /* utf_hashkey *****************************************************************
 217
 218    The hashkey is computed from the utf-text by using up to 8
 219    characters.  For utf-symbols longer than 15 characters 3 characters
 220    are taken from the beginning and the end, 2 characters are taken
 221    from the middle.
 222
 223 *******************************************************************************/
 224
 225 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 226 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 227
 228 u4 utf_hashkey(const char *text, u4 length)
 229 {
 230         const char *start_pos = text;       /* pointer to utf text                */
 231         u4 a;
 232
 233         switch (length) {
 234         case 0: /* empty string */
 235                 return 0;
 236
 237         case 1: return fbs(0);
 238         case 2: return fbs(0) ^ nbs(3);
 239         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 240         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 241         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 242         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 243         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 244         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 245
 246         case 9:
 247                 a = fbs(0);
 248                 a ^= nbs(1);
 249                 a ^= nbs(2);
 250                 text++;
 251                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 252
 253         case 10:
 254                 a = fbs(0);
 255                 text++;
 256                 a ^= nbs(2);
 257                 a ^= nbs(3);
 258                 a ^= nbs(4);
 259                 text++;
 260                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 261
 262         case 11:
 263                 a = fbs(0);
 264                 text++;
 265                 a ^= nbs(2);
 266                 a ^= nbs(3);
 267                 a ^= nbs(4);
 268                 text++;
 269                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 270
 271         case 12:
 272                 a = fbs(0);
 273                 text += 2;
 274                 a ^= nbs(2);
 275                 a ^= nbs(3);
 276                 text++;
 277                 a ^= nbs(5);
 278                 a ^= nbs(6);
 279                 a ^= nbs(7);
 280                 text++;
 281                 return a ^ nbs(9) ^ nbs(10);
 282
 283         case 13:
 284                 a = fbs(0);
 285                 a ^= nbs(1);
 286                 text++;
 287                 a ^= nbs(3);
 288                 a ^= nbs(4);
 289                 text += 2;
 290                 a ^= nbs(7);
 291                 a ^= nbs(8);
 292                 text += 2;
 293                 return a ^ nbs(9) ^ nbs(10);
 294
 295         case 14:
 296                 a = fbs(0);
 297                 text += 2;
 298                 a ^= nbs(3);
 299                 a ^= nbs(4);
 300                 text += 2;
 301                 a ^= nbs(7);
 302                 a ^= nbs(8);
 303                 text += 2;
 304                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 305
 306         case 15:
 307                 a = fbs(0);
 308                 text += 2;
 309                 a ^= nbs(3);
 310                 a ^= nbs(4);
 311                 text += 2;
 312                 a ^= nbs(7);
 313                 a ^= nbs(8);
 314                 text += 2;
 315                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 316
 317         default:  /* 3 characters from beginning */
 318                 a = fbs(0);
 319                 text += 2;
 320                 a ^= nbs(3);
 321                 a ^= nbs(4);
 322
 323                 /* 2 characters from middle */
 324                 text = start_pos + (length / 2);
 325                 a ^= fbs(5);
 326                 text += 2;
 327                 a ^= nbs(6);
 328
 329                 /* 3 characters from end */
 330                 text = start_pos + length - 4;
 331
 332                 a ^= fbs(7);
 333                 text++;
 334
 335                 return a ^ nbs(10) ^ nbs(11);
 336     }
 337 }
 338
 339
 340 /* utf_hashkey *****************************************************************
 341
 342    Compute the hashkey of a unicode string.
 343
 344 *******************************************************************************/
 345
 346 u4 unicode_hashkey(u2 *text, u2 len)
 347 {
 348         return utf_hashkey((char *) text, len);
 349 }
 350
 351
 352 /* utf_new *********************************************************************
 353
 354    Creates a new utf-symbol, the text of the symbol is passed as a
 355    u1-array. The function searches the utf-hashtable for a utf-symbol
 356    with this text. On success the element returned, otherwise a new
 357    hashtable element is created.
 358
 359    If the number of entries in the hashtable exceeds twice the size of
 360    the hashtable slots a reorganization of the hashtable is done and
 361    the utf symbols are copied to a new hashtable with doubled size.
 362
 363 *******************************************************************************/
 364
 365 utf *utf_new_intern(const char *text, u2 length);
 366
 367 utf *utf_new(const char *text, u2 length)
 368 {
 369     utf *r;
 370
 371 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 372     tables_lock();
 373 #endif
 374
 375     r = utf_new_intern(text, length);
 376
 377 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 378     tables_unlock();
 379 #endif
 380
 381     return r;
 382 }
 383
 384
 385 utf *utf_new_intern(const char *text, u2 length)
 386 {
 387         u4 key;                             /* hashkey computed from utf-text     */
 388         u4 slot;                            /* slot in hashtable                  */
 389         utf *u;                             /* hashtable element                  */
 390         u2 i;
 391
 392 #ifdef STATISTICS
 393         if (opt_stat)
 394                 count_utf_new++;
 395 #endif
 396
 397         key  = utf_hashkey(text, length);
 398         slot = key & (utf_hash.size - 1);
 399         u    = utf_hash.ptr[slot];
 400
 401         /* search external hash chain for utf-symbol */
 402         while (u) {
 403                 if (u->blength == length) {
 404
 405                         /* compare text of hashtable elements */
 406                         for (i = 0; i < length; i++)
 407                                 if (text[i] != u->text[i]) goto nomatch;
 408
 409 #ifdef STATISTICS
 410                         if (opt_stat)
 411                                 count_utf_new_found++;
 412 #endif
 413
 414                         /* symbol found in hashtable */
 415                         return u;
 416                 }
 417         nomatch:
 418                 u = u->hashlink; /* next element in external chain */
 419         }
 420
 421 #ifdef STATISTICS
 422         if (opt_stat)
 423                 count_utf_len += sizeof(utf) + length;
 424 #endif
 425
 426         /* location in hashtable found, create new utf element */
 427         u = NEW(utf);
 428         u->blength  = length;               /* length in bytes of utfstring       */
 429         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
 430         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 431         memcpy(u->text, text, length);      /* copy utf-text                      */
 432         u->text[length] = '\0';
 433         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
 434
 435         utf_hash.entries++;                 /* update number of entries           */
 436
 437         if (utf_hash.entries > (utf_hash.size * 2)) {
 438
 439         /* reorganization of hashtable, average length of
 440            the external chains is approx. 2                */
 441
 442                 u4 i;
 443                 utf *u;
 444                 hashtable newhash; /* the new hashtable */
 445
 446                 /* create new hashtable, double the size */
 447                 init_hashtable(&newhash, utf_hash.size * 2);
 448                 newhash.entries = utf_hash.entries;
 449
 450 #ifdef STATISTICS
 451                 if (opt_stat)
 452                         count_utf_len += sizeof(utf*) * utf_hash.size;
 453 #endif
 454
 455                 /* transfer elements to new hashtable */
 456                 for (i = 0; i < utf_hash.size; i++) {
 457                         u = (utf *) utf_hash.ptr[i];
 458                         while (u) {
 459                                 utf *nextu = u->hashlink;
 460                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
 461
 462                                 u->hashlink = (utf *) newhash.ptr[slot];
 463                                 newhash.ptr[slot] = u;
 464
 465                                 /* follow link in external hash chain */
 466                                 u = nextu;
 467                         }
 468                 }
 469
 470                 /* dispose old table */
 471                 MFREE(utf_hash.ptr, void*, utf_hash.size);
 472                 utf_hash = newhash;
 473         }
 474
 475         return u;
 476 }
 477
 478
 479 /* utf_new_u2 ******************************************************************
 480
 481    Make utf symbol from u2 array, if isclassname is true '.' is
 482    replaced by '/'.
 483
 484 *******************************************************************************/
 485
 486 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 487 {
 488         char *buffer;                   /* memory buffer for  unicode characters  */
 489         char *pos;                      /* pointer to current position in buffer  */
 490         u4 left;                        /* unicode characters left                */
 491         u4 buflength;                   /* utf length in bytes of the u2 array    */
 492         utf *result;                    /* resulting utf-string                   */
 493         int i;
 494
 495         /* determine utf length in bytes and allocate memory */
 496
 497         buflength = u2_utflength(unicode_pos, unicode_length);
 498         buffer    = MNEW(char, buflength);
 499
 500         left = buflength;
 501         pos  = buffer;
 502
 503         for (i = 0; i++ < unicode_length; unicode_pos++) {
 504                 /* next unicode character */
 505                 u2 c = *unicode_pos;
 506
 507                 if ((c != 0) && (c < 0x80)) {
 508                         /* 1 character */
 509                         left--;
 510                 if ((int) left < 0) break;
 511                         /* convert classname */
 512                         if (isclassname && c == '.')
 513                                 *pos++ = '/';
 514                         else
 515                                 *pos++ = (char) c;
 516
 517                 } else if (c < 0x800) {
 518                         /* 2 characters */
 519                 unsigned char high = c >> 6;
 520                 unsigned char low  = c & 0x3F;
 521                         left = left - 2;
 522                 if ((int) left < 0) break;
 523                 *pos++ = high | 0xC0;
 524                 *pos++ = low  | 0x80;
 525
 526                 } else {
 527                 /* 3 characters */
 528                 char low  = c & 0x3f;
 529                 char mid  = (c >> 6) & 0x3F;
 530                 char high = c >> 12;
 531                         left = left - 3;
 532                 if ((int) left < 0) break;
 533                 *pos++ = high | 0xE0;
 534                 *pos++ = mid  | 0x80;
 535                 *pos++ = low  | 0x80;
 536                 }
 537         }
 538
 539         /* insert utf-string into symbol-table */
 540         result = utf_new(buffer,buflength);
 541
 542         MFREE(buffer, char, buflength);
 543
 544         return result;
 545 }
 546
 547
 548 /* utf_new_char ****************************************************************
 549
 550    Creates a new utf symbol, the text for this symbol is passed as a
 551    c-string ( = char* ).
 552
 553 *******************************************************************************/
 554
 555 utf *utf_new_char(const char *text)
 556 {
 557         return utf_new(text, strlen(text));
 558 }
 559
 560
 561 /* utf_new_char_classname ******************************************************
 562
 563    Creates a new utf symbol, the text for this symbol is passed as a
 564    c-string ( = char* ) "." characters are going to be replaced by
 565    "/". Since the above function is used often, this is a separte
 566    function, instead of an if.
 567
 568 *******************************************************************************/
 569
 570 utf *utf_new_char_classname(const char *text)
 571 {
 572         if (strchr(text, '.')) {
 573                 char *txt = strdup(text);
 574                 char *end = txt + strlen(txt);
 575                 char *c;
 576                 utf *tmpRes;
 577
 578                 for (c = txt; c < end; c++)
 579                         if (*c == '.') *c = '/';
 580
 581                 tmpRes = utf_new(txt, strlen(txt));
 582                 FREE(txt, 0);
 583
 584                 return tmpRes;
 585
 586         } else
 587                 return utf_new(text, strlen(text));
 588 }
 589
 590
 591 /* utf_nextu2 ******************************************************************
 592
 593    Read the next unicode character from the utf string and increment
 594    the utf-string pointer accordingly.
 595
 596 *******************************************************************************/
 597
 598 u2 utf_nextu2(char **utf_ptr)
 599 {
 600     /* uncompressed unicode character */
 601     u2 unicode_char = 0;
 602     /* current position in utf text */
 603     unsigned char *utf = (unsigned char *) (*utf_ptr);
 604     /* bytes representing the unicode character */
 605     unsigned char ch1, ch2, ch3;
 606     /* number of bytes used to represent the unicode character */
 607     int len = 0;
 608
 609     switch ((ch1 = utf[0]) >> 4) {
 610         default: /* 1 byte */
 611                 (*utf_ptr)++;
 612                 return (u2) ch1;
 613         case 0xC:
 614         case 0xD: /* 2 bytes */
 615                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 616                         unsigned char high = ch1 & 0x1F;
 617                         unsigned char low  = ch2 & 0x3F;
 618                         unicode_char = (high << 6) + low;
 619                         len = 2;
 620                 }
 621                 break;
 622
 623         case 0xE: /* 2 or 3 bytes */
 624                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 625                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 626                                 unsigned char low  = ch3 & 0x3f;
 627                                 unsigned char mid  = ch2 & 0x3f;
 628                                 unsigned char high = ch1 & 0x0f;
 629                                 unicode_char = (((high << 6) + mid) << 6) + low;
 630                                 len = 3;
 631                         } else
 632                                 len = 2;
 633                 }
 634                 break;
 635     }
 636
 637     /* update position in utf-text */
 638     *utf_ptr = (char *) (utf + len);
 639
 640     return unicode_char;
 641 }
 642
 643
 644 /* utf_strlen ******************************************************************
 645
 646    Determine number of unicode characters in the utf string.
 647
 648 *******************************************************************************/
 649
 650 u4 utf_strlen(utf *u)
 651 {
 652         char *endpos;                       /* points behind utf string           */
 653         char *utf_ptr;                      /* current position in utf text       */
 654         u4 len = 0;                         /* number of unicode characters       */
 655
 656         if (!u) {
 657                 *exceptionptr = new_nullpointerexception();
 658                 return 0;
 659         }
 660
 661         endpos = utf_end(u);
 662         utf_ptr = u->text;
 663
 664         while (utf_ptr < endpos) {
 665                 len++;
 666                 /* next unicode character */
 667                 utf_nextu2(&utf_ptr);
 668         }
 669
 670         if (utf_ptr != endpos)
 671                 /* string ended abruptly */
 672                 throw_cacao_exception_exit(string_java_lang_InternalError,
 673                                                                    "Illegal utf8 string");
 674
 675         return len;
 676 }
 677
 678
 679 /* u2_utflength ****************************************************************
 680
 681    Returns the utf length in bytes of a u2 array.
 682
 683 *******************************************************************************/
 684
 685 u4 u2_utflength(u2 *text, u4 u2_length)
 686 {
 687         u4 result_len = 0;                  /* utf length in bytes                */
 688         u2 ch;                              /* current unicode character          */
 689         u4 len;
 690
 691         for (len = 0; len < u2_length; len++) {
 692                 /* next unicode character */
 693                 ch = *text++;
 694
 695                 /* determine bytes required to store unicode character as utf */
 696                 if (ch && (ch < 0x80))
 697                         result_len++;
 698                 else if (ch < 0x800)
 699                         result_len += 2;
 700                 else
 701                         result_len += 3;
 702         }
 703
 704     return result_len;
 705 }
 706
 707
 708 /* utf_display *****************************************************************
 709
 710    Write utf symbol to stdout (for debugging purposes).
 711
 712 *******************************************************************************/
 713
 714 void utf_display(utf *u)
 715 {
 716         char *endpos;                       /* points behind utf string           */
 717         char *utf_ptr;                      /* current position in utf text       */
 718
 719         if (!u) {
 720                 printf("NULL");
 721                 fflush(stdout);
 722                 return;
 723         }
 724
 725         endpos = utf_end(u);
 726         utf_ptr = u->text;
 727
 728         while (utf_ptr < endpos) {
 729                 /* read next unicode character */
 730                 u2 c = utf_nextu2(&utf_ptr);
 731                 if (c >= 32 && c <= 127) printf("%c", c);
 732                 else printf("?");
 733         }
 734
 735         fflush(stdout);
 736 }
 737
 738
 739 /* utf_display_classname *******************************************************
 740
 741    Write utf symbol to stdout with `/' converted to `.' (for debugging
 742    purposes).
 743
 744 *******************************************************************************/
 745
 746 void utf_display_classname(utf *u)
 747 {
 748         char *endpos;                       /* points behind utf string           */
 749         char *utf_ptr;                      /* current position in utf text       */
 750
 751         if (!u) {
 752                 printf("NULL");
 753                 fflush(stdout);
 754                 return;
 755         }
 756
 757         endpos = utf_end(u);
 758         utf_ptr = u->text;
 759
 760         while (utf_ptr < endpos) {
 761                 /* read next unicode character */
 762                 u2 c = utf_nextu2(&utf_ptr);
 763                 if (c == '/') c = '.';
 764                 if (c >= 32 && c <= 127) printf("%c", c);
 765                 else printf("?");
 766         }
 767
 768         fflush(stdout);
 769 }
 770
 771
 772 /* utf_sprint ******************************************************************
 773
 774    Write utf symbol into c-string (for debugging purposes).
 775
 776 *******************************************************************************/
 777
 778 void utf_sprint(char *buffer, utf *u)
 779 {
 780         char *endpos;                       /* points behind utf string           */
 781         char *utf_ptr;                      /* current position in utf text       */
 782         u2 pos = 0;                         /* position in c-string               */
 783
 784         if (!u) {
 785                 strcpy(buffer, "NULL");
 786                 return;
 787         }
 788
 789         endpos = utf_end(u);
 790         utf_ptr = u->text;
 791
 792         while (utf_ptr < endpos)
 793                 /* copy next unicode character */
 794                 buffer[pos++] = utf_nextu2(&utf_ptr);
 795
 796         /* terminate string */
 797         buffer[pos] = '\0';
 798 }
 799
 800
 801 /* utf_sprint_classname ********************************************************
 802
 803    Write utf symbol into c-string with `/' converted to `.' (for debugging
 804    purposes).
 805
 806 *******************************************************************************/
 807
 808 void utf_sprint_classname(char *buffer, utf *u)
 809 {
 810         char *endpos;                       /* points behind utf string           */
 811         char *utf_ptr;                      /* current position in utf text       */
 812         u2 pos = 0;                         /* position in c-string               */
 813
 814         if (!u) {
 815                 strcpy(buffer, "NULL");
 816                 return;
 817         }
 818
 819         endpos = utf_end(u);
 820         utf_ptr = u->text;
 821
 822         while (utf_ptr < endpos) {
 823                 /* copy next unicode character */
 824                 u2 c = utf_nextu2(&utf_ptr);
 825                 if (c == '/') c = '.';
 826                 buffer[pos++] = c;
 827         }
 828
 829         /* terminate string */
 830         buffer[pos] = '\0';
 831 }
 832
 833
 834 /* utf_strcat ******************************************************************
 835
 836    Like libc strcat, but uses an utf8 string.
 837
 838 *******************************************************************************/
 839
 840 void utf_strcat(char *buffer, utf *u)
 841 {
 842         utf_sprint(buffer + strlen(buffer), u);
 843 }
 844
 845
 846 /* utf_strcat_classname ********************************************************
 847
 848    Like libc strcat, but uses an utf8 string.
 849
 850 *******************************************************************************/
 851
 852 void utf_strcat_classname(char *buffer, utf *u)
 853 {
 854         utf_sprint_classname(buffer + strlen(buffer), u);
 855 }
 856
 857
 858 /* utf_fprint ******************************************************************
 859
 860    Write utf symbol into file.
 861
 862 *******************************************************************************/
 863
 864 void utf_fprint(FILE *file, utf *u)
 865 {
 866         char *endpos;                       /* points behind utf string           */
 867         char *utf_ptr;                      /* current position in utf text       */
 868
 869         if (!u)
 870                 return;
 871
 872         endpos = utf_end(u);
 873         utf_ptr = u->text;
 874
 875         while (utf_ptr < endpos) {
 876                 /* read next unicode character */
 877                 u2 c = utf_nextu2(&utf_ptr);
 878
 879                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 880                 else fprintf(file, "?");
 881         }
 882 }
 883
 884
 885 /* utf_fprint_classname ********************************************************
 886
 887    Write utf symbol into file with `/' converted to `.'.
 888
 889 *******************************************************************************/
 890
 891 void utf_fprint_classname(FILE *file, utf *u)
 892 {
 893         char *endpos;                       /* points behind utf string           */
 894         char *utf_ptr;                      /* current position in utf text       */
 895
 896     if (!u)
 897                 return;
 898
 899         endpos = utf_end(u);
 900         utf_ptr = u->text;
 901
 902         while (utf_ptr < endpos) {
 903                 /* read next unicode character */
 904                 u2 c = utf_nextu2(&utf_ptr);
 905                 if (c == '/') c = '.';
 906
 907                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 908                 else fprintf(file, "?");
 909         }
 910 }
 911
 912
 913 /* is_valid_utf ****************************************************************
 914
 915    Return true if the given string is a valid UTF-8 string.
 916
 917    utf_ptr...points to first character
 918    end_pos...points after last character
 919
 920 *******************************************************************************/
 921
 922 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
 923
 924 bool is_valid_utf(char *utf_ptr, char *end_pos)
 925 {
 926         int bytes;
 927         int len,i;
 928         char c;
 929         unsigned long v;
 930
 931         if (end_pos < utf_ptr) return false;
 932         bytes = end_pos - utf_ptr;
 933         while (bytes--) {
 934                 c = *utf_ptr++;
 935
 936                 if (!c) return false;                     /* 0x00 is not allowed */
 937                 if ((c & 0x80) == 0) continue;            /* ASCII */
 938
 939                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
 940                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
 941                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
 942                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
 943                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
 944                 else return false;                        /* invalid leading byte */
 945
 946                 if (len > 2) return false;                /* Java limitation */
 947
 948                 v = (unsigned long)c & (0x3f >> len);
 949
 950                 if ((bytes -= len) < 0) return false;     /* missing bytes */
 951
 952                 for (i = len; i--; ) {
 953                         c = *utf_ptr++;
 954                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
 955                                 return false;
 956                         v = (v << 6) | (c & 0x3f);
 957                 }
 958
 959                 if (v == 0) {
 960                         if (len != 1) return false;           /* Java special */
 961
 962                 } else {
 963                         /* Sun Java seems to allow overlong UTF-8 encodings */
 964
 965                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
 966                                 if (!opt_liberalutf)
 967                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
 968                                 /* XXX change this to exception? */
 969                         }
 970                 }
 971
 972                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
 973                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
 974
 975                 /* even these seem to be allowed */
 976                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
 977         }
 978
 979         return true;
 980 }
 981
 982
 983 /* is_valid_name ***************************************************************
 984
 985    Return true if the given string may be used as a class/field/method
 986    name. (Currently this only disallows empty strings and control
 987    characters.)
 988
 989    NOTE: The string is assumed to have passed is_valid_utf!
 990
 991    utf_ptr...points to first character
 992    end_pos...points after last character
 993
 994 *******************************************************************************/
 995
 996 bool is_valid_name(char *utf_ptr, char *end_pos)
 997 {
 998         if (end_pos <= utf_ptr) return false; /* disallow empty names */
 999
1000         while (utf_ptr < end_pos) {
1001                 unsigned char c = *utf_ptr++;
1002
1003                 if (c < 0x20) return false; /* disallow control characters */
1004                 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80)  /* disallow zero */
1005                         return false;
1006         }
1007
1008         return true;
1009 }
1010
1011 bool is_valid_name_utf(utf *u)
1012 {
1013         return is_valid_name(u->text,utf_end(u));
1014 }
1015
1016
1017 /* utf_show ********************************************************************
1018
1019    Writes the utf symbols in the utfhash to stdout and displays the
1020    number of external hash chains grouped according to the chainlength
1021    (for debugging purposes).
1022
1023 *******************************************************************************/
1024
1025 void utf_show(void)
1026 {
1027
1028 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1029
1030         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1031         u4 max_chainlength = 0;      /* maximum length of the chains */
1032         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1033         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1034         u4 i;
1035
1036         printf ("UTF-HASH:\n");
1037
1038         /* show element of utf-hashtable */
1039         for (i=0; i<utf_hash.size; i++) {
1040                 utf *u = utf_hash.ptr[i];
1041                 if (u) {
1042                         printf ("SLOT %d: ", (int) i);
1043                         while (u) {
1044                                 printf ("'");
1045                                 utf_display (u);
1046                                 printf ("' ");
1047                                 u = u->hashlink;
1048                         }
1049                         printf ("\n");
1050                 }
1051
1052         }
1053
1054         printf ("UTF-HASH: %d slots for %d entries\n",
1055                         (int) utf_hash.size, (int) utf_hash.entries );
1056
1057
1058         if (utf_hash.entries == 0)
1059                 return;
1060
1061         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1062
1063         for (i=0;i<CHAIN_LIMIT;i++)
1064                 chain_count[i]=0;
1065
1066         /* count numbers of hashchains according to their length */
1067         for (i=0; i<utf_hash.size; i++) {
1068
1069                 utf *u = (utf*) utf_hash.ptr[i];
1070                 u4 chain_length = 0;
1071
1072                 /* determine chainlength */
1073                 while (u) {
1074                         u = u->hashlink;
1075                         chain_length++;
1076                 }
1077
1078                 /* update sum of all chainlengths */
1079                 sum_chainlength+=chain_length;
1080
1081                 /* determine the maximum length of the chains */
1082                 if (chain_length>max_chainlength)
1083                         max_chainlength = chain_length;
1084
1085                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1086                 if (chain_length>=CHAIN_LIMIT) {
1087                         beyond_limit+=chain_length;
1088                         chain_length=CHAIN_LIMIT-1;
1089                 }
1090
1091                 /* update number of hashchains of current length */
1092                 chain_count[chain_length]++;
1093         }
1094
1095         /* display results */
1096         for (i=1;i<CHAIN_LIMIT-1;i++)
1097                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1098
1099         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1100
1101
1102         printf("max. chainlength:%5d\n",max_chainlength);
1103
1104         /* avg. chainlength = sum of chainlengths / number of chains */
1105         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1106 }
1107
1108
1109 /*
1110  * These are local overrides for various environment variables in Emacs.
1111  * Please do not remove this and leave it at the end of the file, where
1112  * Emacs will automagically detect them.
1113  * ---------------------------------------------------------------------
1114  * Local variables:
1115  * mode: c
1116  * indent-tabs-mode: t
1117  * c-basic-offset: 4
1118  * tab-width: 4
1119  * End:
1120  */