src/vm/utf8.c

   1 /* src/vm/utf8.c - utf functions
   2
   3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
   4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
   5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
   6    Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  23    02111-1307, USA.
  24
  25    Contact: cacao@complang.tuwien.ac.at
  26
  27    Authors: Reinhard Grafl
  28
  29    Changes: Mark Probst
  30             Andreas Krall
  31             Christian Thalinger
  32
  33    $Id: utf8.c 2506 2005-05-23 08:32:38Z twisti $
  34
  35 */
  36
  37
  38 #include <string.h>
  39
  40 #include "mm/memory.h"
  41 #include "vm/exceptions.h"
  42 #include "vm/options.h"
  43 #include "vm/statistics.h"
  44 #include "vm/stringlocal.h"
  45 #include "vm/tables.h"
  46 #include "vm/utf8.h"
  47
  48
  49 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
  50
  51
  52 /* utf-symbols for pointer comparison of frequently used strings **************/
  53
  54 utf *utf_java_lang_Object;              /* java/lang/Object                   */
  55
  56 utf *utf_java_lang_Class;               /* java/lang/Class                    */
  57 utf *utf_java_lang_ClassLoader;         /* java/lang/ClassLoader              */
  58 utf *utf_java_lang_Cloneable;           /* java/lang/Cloneable                */
  59 utf *utf_java_lang_SecurityManager;     /* java/lang/SecurityManager          */
  60 utf *utf_java_lang_String;              /* java/lang/String                   */
  61 utf *utf_java_lang_System;              /* java/lang/System                   */
  62 utf *utf_java_lang_ThreadGroup;         /* java/lang/ThreadGroup              */
  63 utf *utf_java_io_Serializable;          /* java/io/Serializable               */
  64
  65 utf *utf_java_lang_Throwable;           /* text of string_java_lang_Throwable */
  66 utf *utf_java_lang_VMThrowable;         /* text of string_java_lang_VMThrowable */
  67 utf *utf_java_lang_Error;               /* text of string_java_lang_Error     */
  68 utf *utf_java_lang_Exception;           /* text of string_java_lang_Exception */
  69 utf *utf_java_lang_NoClassDefFoundError; /* text of string_java_lang_NoClassDefFoundError */
  70 utf *utf_java_lang_OutOfMemoryError;    /* text of string_java_lang_OutOfMemoryError */
  71 utf *utf_java_lang_ClassNotFoundException; /* text of string_java_lang_ClassNotFoundException */
  72
  73 utf* utf_java_lang_Void;                /* java/lang/Void                     */
  74 utf* utf_java_lang_Boolean;             /* java/lang/Boolean                  */
  75 utf* utf_java_lang_Byte;                /* java/lang/Byte                     */
  76 utf* utf_java_lang_Character;           /* java/lang/Character                */
  77 utf* utf_java_lang_Short;               /* java/lang/Short                    */
  78 utf* utf_java_lang_Integer;             /* java/lang/Integer                  */
  79 utf* utf_java_lang_Long;                /* java/lang/Long                     */
  80 utf* utf_java_lang_Float;               /* java/lang/Float                    */
  81 utf* utf_java_lang_Double;              /* java/lang/Double                   */
  82
  83 utf *utf_java_util_Vector;              /* java/util/Vector                   */
  84 utf *utf_java_lang_reflect_Constructor; /* java/lang/reflect/Constructor      */
  85 utf *utf_java_lang_reflect_Method;      /* java/lang/reflect/Method           */
  86
  87
  88 utf *utf_InnerClasses;                  /* InnerClasses                       */
  89 utf *utf_ConstantValue;                 /* ConstantValue                      */
  90 utf *utf_Code;                          /* Code                               */
  91 utf *utf_Exceptions;                    /* Exceptions                         */
  92 utf *utf_LineNumberTable;               /* LineNumberTable                    */
  93 utf *utf_SourceFile;                    /* SourceFile                         */
  94
  95 utf *utf_init;                          /* <init>                             */
  96 utf *utf_clinit;                        /* <clinit>                           */
  97 utf *utf_finalize;                      /* finalize                           */
  98
  99 utf *utf_printStackTrace;               /* printStackTrace                    */
 100 utf *utf_fillInStackTrace;              /* fillInStackTrace                   */
 101 utf *utf_loadClass;                     /* loadClass                          */
 102
 103 utf *utf_void__void;                    /* ()V                                */
 104 utf *utf_boolean__void;                 /* (Z)V                               */
 105 utf *utf_byte__void;                    /* (B)V                               */
 106 utf *utf_char__void;                    /* (C)V                               */
 107 utf *utf_short__void;                   /* (S)V                               */
 108 utf *utf_int__void;                     /* (I)V                               */
 109 utf *utf_long__void;                    /* (J)V                               */
 110 utf *utf_float__void;                   /* (F)V                               */
 111 utf *utf_double__void;                  /* (D)V                               */
 112 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 113 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 114 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 115 utf *utf_java_lang_String__java_lang_Class; /* (Ljava/lang/String;)Ljava/lang/Class; */
 116 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 117
 118 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
 119
 120 utf *array_packagename;                 /* "\t<the array package>"            */
 121
 122
 123 /* utf_init ********************************************************************
 124
 125    Initializes the utf8 subsystem.
 126
 127 *******************************************************************************/
 128
 129 void utf8_init(void)
 130 {
 131         /* create utf-symbols for pointer comparison of frequently used strings */
 132
 133         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 134
 135         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 136         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 137         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 138         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 139         utf_java_lang_String           = utf_new_char("java/lang/String");
 140         utf_java_lang_System           = utf_new_char("java/lang/System");
 141         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 142         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 143
 144         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
 145         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
 146         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
 147         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
 148
 149         utf_java_lang_NoClassDefFoundError =
 150                 utf_new_char(string_java_lang_NoClassDefFoundError);
 151
 152         utf_java_lang_OutOfMemoryError =
 153                 utf_new_char(string_java_lang_OutOfMemoryError);
 154
 155         utf_java_lang_ClassNotFoundException =
 156                 utf_new_char(string_java_lang_ClassNotFoundException);
 157
 158         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 159         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 160         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 161         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 162         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 163         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 164         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 165         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 166         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 167
 168         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 169
 170         utf_java_lang_reflect_Constructor =
 171                 utf_new_char("java/lang/reflect/Constructor");
 172
 173         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
 174
 175         utf_InnerClasses               = utf_new_char("InnerClasses");
 176         utf_ConstantValue              = utf_new_char("ConstantValue");
 177         utf_Code                       = utf_new_char("Code");
 178         utf_Exceptions                 = utf_new_char("Exceptions");
 179         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 180         utf_SourceFile                 = utf_new_char("SourceFile");
 181
 182         utf_init                           = utf_new_char("<init>");
 183         utf_clinit                         = utf_new_char("<clinit>");
 184         utf_finalize                   = utf_new_char("finalize");
 185
 186         utf_printStackTrace            = utf_new_char("printStackTrace");
 187         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 188         utf_loadClass                  = utf_new_char("loadClass");
 189
 190         utf_void__void                 = utf_new_char("()V");
 191         utf_boolean__void              = utf_new_char("(Z)V");
 192         utf_byte__void                 = utf_new_char("(B)V");
 193         utf_char__void                 = utf_new_char("(C)V");
 194         utf_short__void                = utf_new_char("(S)V");
 195         utf_int__void                  = utf_new_char("(I)V");
 196         utf_long__void                 = utf_new_char("(J)V");
 197         utf_float__void                = utf_new_char("(F)V");
 198         utf_double__void               = utf_new_char("(D)V");
 199         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 200         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 201         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 202
 203         utf_java_lang_String__java_lang_Class =
 204                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 205
 206         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 207
 208         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 209
 210         array_packagename              = utf_new_char("\t<the array package>");
 211 }
 212
 213
 214 /* utf_hashkey *****************************************************************
 215
 216    The hashkey is computed from the utf-text by using up to 8
 217    characters.  For utf-symbols longer than 15 characters 3 characters
 218    are taken from the beginning and the end, 2 characters are taken
 219    from the middle.
 220
 221 *******************************************************************************/
 222
 223 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 224 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 225
 226 u4 utf_hashkey(const char *text, u4 length)
 227 {
 228         const char *start_pos = text;       /* pointer to utf text                */
 229         u4 a;
 230
 231         switch (length) {
 232         case 0: /* empty string */
 233                 return 0;
 234
 235         case 1: return fbs(0);
 236         case 2: return fbs(0) ^ nbs(3);
 237         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 238         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 239         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 240         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 241         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 242         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 243
 244         case 9:
 245                 a = fbs(0);
 246                 a ^= nbs(1);
 247                 a ^= nbs(2);
 248                 text++;
 249                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 250
 251         case 10:
 252                 a = fbs(0);
 253                 text++;
 254                 a ^= nbs(2);
 255                 a ^= nbs(3);
 256                 a ^= nbs(4);
 257                 text++;
 258                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 259
 260         case 11:
 261                 a = fbs(0);
 262                 text++;
 263                 a ^= nbs(2);
 264                 a ^= nbs(3);
 265                 a ^= nbs(4);
 266                 text++;
 267                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 268
 269         case 12:
 270                 a = fbs(0);
 271                 text += 2;
 272                 a ^= nbs(2);
 273                 a ^= nbs(3);
 274                 text++;
 275                 a ^= nbs(5);
 276                 a ^= nbs(6);
 277                 a ^= nbs(7);
 278                 text++;
 279                 return a ^ nbs(9) ^ nbs(10);
 280
 281         case 13:
 282                 a = fbs(0);
 283                 a ^= nbs(1);
 284                 text++;
 285                 a ^= nbs(3);
 286                 a ^= nbs(4);
 287                 text += 2;
 288                 a ^= nbs(7);
 289                 a ^= nbs(8);
 290                 text += 2;
 291                 return a ^ nbs(9) ^ nbs(10);
 292
 293         case 14:
 294                 a = fbs(0);
 295                 text += 2;
 296                 a ^= nbs(3);
 297                 a ^= nbs(4);
 298                 text += 2;
 299                 a ^= nbs(7);
 300                 a ^= nbs(8);
 301                 text += 2;
 302                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 303
 304         case 15:
 305                 a = fbs(0);
 306                 text += 2;
 307                 a ^= nbs(3);
 308                 a ^= nbs(4);
 309                 text += 2;
 310                 a ^= nbs(7);
 311                 a ^= nbs(8);
 312                 text += 2;
 313                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 314
 315         default:  /* 3 characters from beginning */
 316                 a = fbs(0);
 317                 text += 2;
 318                 a ^= nbs(3);
 319                 a ^= nbs(4);
 320
 321                 /* 2 characters from middle */
 322                 text = start_pos + (length / 2);
 323                 a ^= fbs(5);
 324                 text += 2;
 325                 a ^= nbs(6);
 326
 327                 /* 3 characters from end */
 328                 text = start_pos + length - 4;
 329
 330                 a ^= fbs(7);
 331                 text++;
 332
 333                 return a ^ nbs(10) ^ nbs(11);
 334     }
 335 }
 336
 337
 338 /* utf_hashkey *****************************************************************
 339
 340    Compute the hashkey of a unicode string.
 341
 342 *******************************************************************************/
 343
 344 u4 unicode_hashkey(u2 *text, u2 len)
 345 {
 346         return utf_hashkey((char *) text, len);
 347 }
 348
 349
 350 /* utf_new *********************************************************************
 351
 352    Creates a new utf-symbol, the text of the symbol is passed as a
 353    u1-array. The function searches the utf-hashtable for a utf-symbol
 354    with this text. On success the element returned, otherwise a new
 355    hashtable element is created.
 356
 357    If the number of entries in the hashtable exceeds twice the size of
 358    the hashtable slots a reorganization of the hashtable is done and
 359    the utf symbols are copied to a new hashtable with doubled size.
 360
 361 *******************************************************************************/
 362
 363 utf *utf_new_intern(const char *text, u2 length);
 364
 365 utf *utf_new(const char *text, u2 length)
 366 {
 367     utf *r;
 368
 369 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 370     tables_lock();
 371 #endif
 372
 373     r = utf_new_intern(text, length);
 374
 375 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 376     tables_unlock();
 377 #endif
 378
 379     return r;
 380 }
 381
 382
 383 utf *utf_new_intern(const char *text, u2 length)
 384 {
 385         u4 key;                             /* hashkey computed from utf-text     */
 386         u4 slot;                            /* slot in hashtable                  */
 387         utf *u;                             /* hashtable element                  */
 388         u2 i;
 389
 390 #ifdef STATISTICS
 391         if (opt_stat)
 392                 count_utf_new++;
 393 #endif
 394
 395         key  = utf_hashkey(text, length);
 396         slot = key & (utf_hash.size - 1);
 397         u    = utf_hash.ptr[slot];
 398
 399         /* search external hash chain for utf-symbol */
 400         while (u) {
 401                 if (u->blength == length) {
 402
 403                         /* compare text of hashtable elements */
 404                         for (i = 0; i < length; i++)
 405                                 if (text[i] != u->text[i]) goto nomatch;
 406
 407 #ifdef STATISTICS
 408                         if (opt_stat)
 409                                 count_utf_new_found++;
 410 #endif
 411
 412                         /* symbol found in hashtable */
 413                         return u;
 414                 }
 415         nomatch:
 416                 u = u->hashlink; /* next element in external chain */
 417         }
 418
 419 #ifdef STATISTICS
 420         if (opt_stat)
 421                 count_utf_len += sizeof(utf) + length;
 422 #endif
 423
 424         /* location in hashtable found, create new utf element */
 425         u = NEW(utf);
 426         u->blength  = length;               /* length in bytes of utfstring       */
 427         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
 428         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 429         memcpy(u->text, text, length);      /* copy utf-text                      */
 430         u->text[length] = '\0';
 431         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
 432
 433         utf_hash.entries++;                 /* update number of entries           */
 434
 435         if (utf_hash.entries > (utf_hash.size * 2)) {
 436
 437         /* reorganization of hashtable, average length of
 438            the external chains is approx. 2                */
 439
 440                 u4 i;
 441                 utf *u;
 442                 hashtable newhash; /* the new hashtable */
 443
 444                 /* create new hashtable, double the size */
 445                 init_hashtable(&newhash, utf_hash.size * 2);
 446                 newhash.entries = utf_hash.entries;
 447
 448 #ifdef STATISTICS
 449                 if (opt_stat)
 450                         count_utf_len += sizeof(utf*) * utf_hash.size;
 451 #endif
 452
 453                 /* transfer elements to new hashtable */
 454                 for (i = 0; i < utf_hash.size; i++) {
 455                         u = (utf *) utf_hash.ptr[i];
 456                         while (u) {
 457                                 utf *nextu = u->hashlink;
 458                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
 459
 460                                 u->hashlink = (utf *) newhash.ptr[slot];
 461                                 newhash.ptr[slot] = u;
 462
 463                                 /* follow link in external hash chain */
 464                                 u = nextu;
 465                         }
 466                 }
 467
 468                 /* dispose old table */
 469                 MFREE(utf_hash.ptr, void*, utf_hash.size);
 470                 utf_hash = newhash;
 471         }
 472
 473         return u;
 474 }
 475
 476
 477 /* utf_new_u2 ******************************************************************
 478
 479    Make utf symbol from u2 array, if isclassname is true '.' is
 480    replaced by '/'.
 481
 482 *******************************************************************************/
 483
 484 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 485 {
 486         char *buffer;                   /* memory buffer for  unicode characters  */
 487         char *pos;                      /* pointer to current position in buffer  */
 488         u4 left;                        /* unicode characters left                */
 489         u4 buflength;                   /* utf length in bytes of the u2 array    */
 490         utf *result;                    /* resulting utf-string                   */
 491         int i;
 492
 493         /* determine utf length in bytes and allocate memory */
 494
 495         buflength = u2_utflength(unicode_pos, unicode_length);
 496         buffer    = MNEW(char, buflength);
 497
 498         left = buflength;
 499         pos  = buffer;
 500
 501         for (i = 0; i++ < unicode_length; unicode_pos++) {
 502                 /* next unicode character */
 503                 u2 c = *unicode_pos;
 504
 505                 if ((c != 0) && (c < 0x80)) {
 506                         /* 1 character */
 507                         left--;
 508                 if ((int) left < 0) break;
 509                         /* convert classname */
 510                         if (isclassname && c == '.')
 511                                 *pos++ = '/';
 512                         else
 513                                 *pos++ = (char) c;
 514
 515                 } else if (c < 0x800) {
 516                         /* 2 characters */
 517                 unsigned char high = c >> 6;
 518                 unsigned char low  = c & 0x3F;
 519                         left = left - 2;
 520                 if ((int) left < 0) break;
 521                 *pos++ = high | 0xC0;
 522                 *pos++ = low  | 0x80;
 523
 524                 } else {
 525                 /* 3 characters */
 526                 char low  = c & 0x3f;
 527                 char mid  = (c >> 6) & 0x3F;
 528                 char high = c >> 12;
 529                         left = left - 3;
 530                 if ((int) left < 0) break;
 531                 *pos++ = high | 0xE0;
 532                 *pos++ = mid  | 0x80;
 533                 *pos++ = low  | 0x80;
 534                 }
 535         }
 536
 537         /* insert utf-string into symbol-table */
 538         result = utf_new(buffer,buflength);
 539
 540         MFREE(buffer, char, buflength);
 541
 542         return result;
 543 }
 544
 545
 546 /* utf_new_char ****************************************************************
 547
 548    Creates a new utf symbol, the text for this symbol is passed as a
 549    c-string ( = char* ).
 550
 551 *******************************************************************************/
 552
 553 utf *utf_new_char(const char *text)
 554 {
 555         return utf_new(text, strlen(text));
 556 }
 557
 558
 559 /* utf_new_char_classname ******************************************************
 560
 561    Creates a new utf symbol, the text for this symbol is passed as a
 562    c-string ( = char* ) "." characters are going to be replaced by
 563    "/". Since the above function is used often, this is a separte
 564    function, instead of an if.
 565
 566 *******************************************************************************/
 567
 568 utf *utf_new_char_classname(const char *text)
 569 {
 570         if (strchr(text, '.')) {
 571                 char *txt = strdup(text);
 572                 char *end = txt + strlen(txt);
 573                 char *c;
 574                 utf *tmpRes;
 575
 576                 for (c = txt; c < end; c++)
 577                         if (*c == '.') *c = '/';
 578
 579                 tmpRes = utf_new(txt, strlen(txt));
 580                 FREE(txt, 0);
 581
 582                 return tmpRes;
 583
 584         } else
 585                 return utf_new(text, strlen(text));
 586 }
 587
 588
 589 /* utf_nextu2 ******************************************************************
 590
 591    Read the next unicode character from the utf string and increment
 592    the utf-string pointer accordingly.
 593
 594 *******************************************************************************/
 595
 596 u2 utf_nextu2(char **utf_ptr)
 597 {
 598     /* uncompressed unicode character */
 599     u2 unicode_char = 0;
 600     /* current position in utf text */
 601     unsigned char *utf = (unsigned char *) (*utf_ptr);
 602     /* bytes representing the unicode character */
 603     unsigned char ch1, ch2, ch3;
 604     /* number of bytes used to represent the unicode character */
 605     int len = 0;
 606
 607     switch ((ch1 = utf[0]) >> 4) {
 608         default: /* 1 byte */
 609                 (*utf_ptr)++;
 610                 return (u2) ch1;
 611         case 0xC:
 612         case 0xD: /* 2 bytes */
 613                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 614                         unsigned char high = ch1 & 0x1F;
 615                         unsigned char low  = ch2 & 0x3F;
 616                         unicode_char = (high << 6) + low;
 617                         len = 2;
 618                 }
 619                 break;
 620
 621         case 0xE: /* 2 or 3 bytes */
 622                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 623                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 624                                 unsigned char low  = ch3 & 0x3f;
 625                                 unsigned char mid  = ch2 & 0x3f;
 626                                 unsigned char high = ch1 & 0x0f;
 627                                 unicode_char = (((high << 6) + mid) << 6) + low;
 628                                 len = 3;
 629                         } else
 630                                 len = 2;
 631                 }
 632                 break;
 633     }
 634
 635     /* update position in utf-text */
 636     *utf_ptr = (char *) (utf + len);
 637
 638     return unicode_char;
 639 }
 640
 641
 642 /* utf_strlen ******************************************************************
 643
 644    Determine number of unicode characters in the utf string.
 645
 646 *******************************************************************************/
 647
 648 u4 utf_strlen(utf *u)
 649 {
 650         char *endpos;                       /* points behind utf string           */
 651         char *utf_ptr;                      /* current position in utf text       */
 652         u4 len = 0;                         /* number of unicode characters       */
 653
 654         if (!u) {
 655                 *exceptionptr = new_nullpointerexception();
 656                 return 0;
 657         }
 658
 659         endpos = utf_end(u);
 660         utf_ptr = u->text;
 661
 662         while (utf_ptr < endpos) {
 663                 len++;
 664                 /* next unicode character */
 665                 utf_nextu2(&utf_ptr);
 666         }
 667
 668         if (utf_ptr != endpos)
 669                 /* string ended abruptly */
 670                 throw_cacao_exception_exit(string_java_lang_InternalError,
 671                                                                    "Illegal utf8 string");
 672
 673         return len;
 674 }
 675
 676
 677 /* u2_utflength ****************************************************************
 678
 679    Returns the utf length in bytes of a u2 array.
 680
 681 *******************************************************************************/
 682
 683 u4 u2_utflength(u2 *text, u4 u2_length)
 684 {
 685         u4 result_len = 0;                  /* utf length in bytes                */
 686         u2 ch;                              /* current unicode character          */
 687         u4 len;
 688
 689         for (len = 0; len < u2_length; len++) {
 690                 /* next unicode character */
 691                 ch = *text++;
 692
 693                 /* determine bytes required to store unicode character as utf */
 694                 if (ch && (ch < 0x80))
 695                         result_len++;
 696                 else if (ch < 0x800)
 697                         result_len += 2;
 698                 else
 699                         result_len += 3;
 700         }
 701
 702     return result_len;
 703 }
 704
 705
 706 /* utf_display *****************************************************************
 707
 708    Write utf symbol to stdout (for debugging purposes).
 709
 710 *******************************************************************************/
 711
 712 void utf_display(utf *u)
 713 {
 714         char *endpos;                       /* points behind utf string           */
 715         char *utf_ptr;                      /* current position in utf text       */
 716
 717         if (!u) {
 718                 printf("NULL");
 719                 fflush(stdout);
 720                 return;
 721         }
 722
 723         endpos = utf_end(u);
 724         utf_ptr = u->text;
 725
 726         while (utf_ptr < endpos) {
 727                 /* read next unicode character */
 728                 u2 c = utf_nextu2(&utf_ptr);
 729                 if (c >= 32 && c <= 127) printf("%c", c);
 730                 else printf("?");
 731         }
 732
 733         fflush(stdout);
 734 }
 735
 736
 737 /* utf_display_classname *******************************************************
 738
 739    Write utf symbol to stdout with `/' converted to `.' (for debugging
 740    purposes).
 741
 742 *******************************************************************************/
 743
 744 void utf_display_classname(utf *u)
 745 {
 746         char *endpos;                       /* points behind utf string           */
 747         char *utf_ptr;                      /* current position in utf text       */
 748
 749         if (!u) {
 750                 printf("NULL");
 751                 fflush(stdout);
 752                 return;
 753         }
 754
 755         endpos = utf_end(u);
 756         utf_ptr = u->text;
 757
 758         while (utf_ptr < endpos) {
 759                 /* read next unicode character */
 760                 u2 c = utf_nextu2(&utf_ptr);
 761                 if (c == '/') c = '.';
 762                 if (c >= 32 && c <= 127) printf("%c", c);
 763                 else printf("?");
 764         }
 765
 766         fflush(stdout);
 767 }
 768
 769
 770 /* utf_sprint ******************************************************************
 771
 772    Write utf symbol into c-string (for debugging purposes).
 773
 774 *******************************************************************************/
 775
 776 void utf_sprint(char *buffer, utf *u)
 777 {
 778         char *endpos;                       /* points behind utf string           */
 779         char *utf_ptr;                      /* current position in utf text       */
 780         u2 pos = 0;                         /* position in c-string               */
 781
 782         if (!u) {
 783                 strcpy(buffer, "NULL");
 784                 return;
 785         }
 786
 787         endpos = utf_end(u);
 788         utf_ptr = u->text;
 789
 790         while (utf_ptr < endpos)
 791                 /* copy next unicode character */
 792                 buffer[pos++] = utf_nextu2(&utf_ptr);
 793
 794         /* terminate string */
 795         buffer[pos] = '\0';
 796 }
 797
 798
 799 /* utf_sprint_classname ********************************************************
 800
 801    Write utf symbol into c-string with `/' converted to `.' (for debugging
 802    purposes).
 803
 804 *******************************************************************************/
 805
 806 void utf_sprint_classname(char *buffer, utf *u)
 807 {
 808         char *endpos;                       /* points behind utf string           */
 809         char *utf_ptr;                      /* current position in utf text       */
 810         u2 pos = 0;                         /* position in c-string               */
 811
 812         if (!u) {
 813                 strcpy(buffer, "NULL");
 814                 return;
 815         }
 816
 817         endpos = utf_end(u);
 818         utf_ptr = u->text;
 819
 820         while (utf_ptr < endpos) {
 821                 /* copy next unicode character */
 822                 u2 c = utf_nextu2(&utf_ptr);
 823                 if (c == '/') c = '.';
 824                 buffer[pos++] = c;
 825         }
 826
 827         /* terminate string */
 828         buffer[pos] = '\0';
 829 }
 830
 831
 832 /* utf_strcat ******************************************************************
 833
 834    Like libc strcat, but uses an utf8 string.
 835
 836 *******************************************************************************/
 837
 838 void utf_strcat(char *buffer, utf *u)
 839 {
 840         utf_sprint(buffer + strlen(buffer), u);
 841 }
 842
 843
 844 /* utf_strcat_classname ********************************************************
 845
 846    Like libc strcat, but uses an utf8 string.
 847
 848 *******************************************************************************/
 849
 850 void utf_strcat_classname(char *buffer, utf *u)
 851 {
 852         utf_sprint_classname(buffer + strlen(buffer), u);
 853 }
 854
 855
 856 /* utf_fprint ******************************************************************
 857
 858    Write utf symbol into file.
 859
 860 *******************************************************************************/
 861
 862 void utf_fprint(FILE *file, utf *u)
 863 {
 864         char *endpos;                       /* points behind utf string           */
 865         char *utf_ptr;                      /* current position in utf text       */
 866
 867         if (!u)
 868                 return;
 869
 870         endpos = utf_end(u);
 871         utf_ptr = u->text;
 872
 873         while (utf_ptr < endpos) {
 874                 /* read next unicode character */
 875                 u2 c = utf_nextu2(&utf_ptr);
 876
 877                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 878                 else fprintf(file, "?");
 879         }
 880 }
 881
 882
 883 /* utf_fprint_classname ********************************************************
 884
 885    Write utf symbol into file with `/' converted to `.'.
 886
 887 *******************************************************************************/
 888
 889 void utf_fprint_classname(FILE *file, utf *u)
 890 {
 891         char *endpos;                       /* points behind utf string           */
 892         char *utf_ptr;                      /* current position in utf text       */
 893
 894     if (!u)
 895                 return;
 896
 897         endpos = utf_end(u);
 898         utf_ptr = u->text;
 899
 900         while (utf_ptr < endpos) {
 901                 /* read next unicode character */
 902                 u2 c = utf_nextu2(&utf_ptr);
 903                 if (c == '/') c = '.';
 904
 905                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 906                 else fprintf(file, "?");
 907         }
 908 }
 909
 910
 911 /* is_valid_utf ****************************************************************
 912
 913    Return true if the given string is a valid UTF-8 string.
 914
 915    utf_ptr...points to first character
 916    end_pos...points after last character
 917
 918 *******************************************************************************/
 919
 920 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
 921
 922 bool is_valid_utf(char *utf_ptr, char *end_pos)
 923 {
 924         int bytes;
 925         int len,i;
 926         char c;
 927         unsigned long v;
 928
 929         if (end_pos < utf_ptr) return false;
 930         bytes = end_pos - utf_ptr;
 931         while (bytes--) {
 932                 c = *utf_ptr++;
 933
 934                 if (!c) return false;                     /* 0x00 is not allowed */
 935                 if ((c & 0x80) == 0) continue;            /* ASCII */
 936
 937                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
 938                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
 939                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
 940                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
 941                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
 942                 else return false;                        /* invalid leading byte */
 943
 944                 if (len > 2) return false;                /* Java limitation */
 945
 946                 v = (unsigned long)c & (0x3f >> len);
 947
 948                 if ((bytes -= len) < 0) return false;     /* missing bytes */
 949
 950                 for (i = len; i--; ) {
 951                         c = *utf_ptr++;
 952                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
 953                                 return false;
 954                         v = (v << 6) | (c & 0x3f);
 955                 }
 956
 957                 if (v == 0) {
 958                         if (len != 1) return false;           /* Java special */
 959
 960                 } else {
 961                         /* Sun Java seems to allow overlong UTF-8 encodings */
 962
 963                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
 964                                 if (!opt_liberalutf)
 965                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
 966                                 /* XXX change this to exception? */
 967                         }
 968                 }
 969
 970                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
 971                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
 972
 973                 /* even these seem to be allowed */
 974                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
 975         }
 976
 977         return true;
 978 }
 979
 980
 981 /* is_valid_name ***************************************************************
 982
 983    Return true if the given string may be used as a class/field/method
 984    name. (Currently this only disallows empty strings and control
 985    characters.)
 986
 987    NOTE: The string is assumed to have passed is_valid_utf!
 988
 989    utf_ptr...points to first character
 990    end_pos...points after last character
 991
 992 *******************************************************************************/
 993
 994 bool is_valid_name(char *utf_ptr, char *end_pos)
 995 {
 996         if (end_pos <= utf_ptr) return false; /* disallow empty names */
 997
 998         while (utf_ptr < end_pos) {
 999                 unsigned char c = *utf_ptr++;
1000
1001                 if (c < 0x20) return false; /* disallow control characters */
1002                 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80)  /* disallow zero */
1003                         return false;
1004         }
1005
1006         return true;
1007 }
1008
1009 bool is_valid_name_utf(utf *u)
1010 {
1011         return is_valid_name(u->text,utf_end(u));
1012 }
1013
1014
1015 /* utf_show ********************************************************************
1016
1017    Writes the utf symbols in the utfhash to stdout and displays the
1018    number of external hash chains grouped according to the chainlength
1019    (for debugging purposes).
1020
1021 *******************************************************************************/
1022
1023 void utf_show(void)
1024 {
1025
1026 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1027
1028         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1029         u4 max_chainlength = 0;      /* maximum length of the chains */
1030         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1031         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1032         u4 i;
1033
1034         printf ("UTF-HASH:\n");
1035
1036         /* show element of utf-hashtable */
1037         for (i=0; i<utf_hash.size; i++) {
1038                 utf *u = utf_hash.ptr[i];
1039                 if (u) {
1040                         printf ("SLOT %d: ", (int) i);
1041                         while (u) {
1042                                 printf ("'");
1043                                 utf_display (u);
1044                                 printf ("' ");
1045                                 u = u->hashlink;
1046                         }
1047                         printf ("\n");
1048                 }
1049
1050         }
1051
1052         printf ("UTF-HASH: %d slots for %d entries\n",
1053                         (int) utf_hash.size, (int) utf_hash.entries );
1054
1055
1056         if (utf_hash.entries == 0)
1057                 return;
1058
1059         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1060
1061         for (i=0;i<CHAIN_LIMIT;i++)
1062                 chain_count[i]=0;
1063
1064         /* count numbers of hashchains according to their length */
1065         for (i=0; i<utf_hash.size; i++) {
1066
1067                 utf *u = (utf*) utf_hash.ptr[i];
1068                 u4 chain_length = 0;
1069
1070                 /* determine chainlength */
1071                 while (u) {
1072                         u = u->hashlink;
1073                         chain_length++;
1074                 }
1075
1076                 /* update sum of all chainlengths */
1077                 sum_chainlength+=chain_length;
1078
1079                 /* determine the maximum length of the chains */
1080                 if (chain_length>max_chainlength)
1081                         max_chainlength = chain_length;
1082
1083                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1084                 if (chain_length>=CHAIN_LIMIT) {
1085                         beyond_limit+=chain_length;
1086                         chain_length=CHAIN_LIMIT-1;
1087                 }
1088
1089                 /* update number of hashchains of current length */
1090                 chain_count[chain_length]++;
1091         }
1092
1093         /* display results */
1094         for (i=1;i<CHAIN_LIMIT-1;i++)
1095                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1096
1097         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1098
1099
1100         printf("max. chainlength:%5d\n",max_chainlength);
1101
1102         /* avg. chainlength = sum of chainlengths / number of chains */
1103         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1104 }
1105
1106
1107 /*
1108  * These are local overrides for various environment variables in Emacs.
1109  * Please do not remove this and leave it at the end of the file, where
1110  * Emacs will automagically detect them.
1111  * ---------------------------------------------------------------------
1112  * Local variables:
1113  * mode: c
1114  * indent-tabs-mode: t
1115  * c-basic-offset: 4
1116  * tab-width: 4
1117  * End:
1118  */