src/vm/utf8.c

   1 /* src/vm/utf8.c - utf8 string functions
   2
   3    Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
   4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
   5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
   6    J. Wenninger, Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  23    02110-1301, USA.
  24
  25    Contact: cacao@cacaojvm.org
  26
  27    Authors: Reinhard Grafl
  28
  29    Changes: Mark Probst
  30             Andreas Krall
  31             Christian Thalinger
  32                         Edwin Steiner
  33
  34    $Id: utf8.c 5920 2006-11-05 21:23:09Z twisti $
  35
  36 */
  37
  38
  39 #include "config.h"
  40
  41 #include <string.h>
  42 #include <assert.h>
  43
  44 #include "vm/types.h"
  45
  46 #include "mm/memory.h"
  47
  48 #if defined(ENABLE_THREADS)
  49 # include "threads/native/lock.h"
  50 #else
  51 # include "threads/none/lock.h"
  52 #endif
  53
  54 #include "vm/builtin.h"
  55 #include "vm/exceptions.h"
  56 #include "vm/hashtable.h"
  57 #include "vm/options.h"
  58 #include "vm/statistics.h"
  59 #include "vm/stringlocal.h"
  60 #include "vm/utf8.h"
  61
  62
  63 /* global variables ***********************************************************/
  64
  65 /* hashsize must be power of 2 */
  66
  67 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
  68
  69 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
  70
  71
  72 /* utf-symbols for pointer comparison of frequently used strings **************/
  73
  74 utf *utf_java_lang_Object;
  75
  76 utf *utf_java_lang_Class;
  77 utf *utf_java_lang_ClassLoader;
  78 utf *utf_java_lang_Cloneable;
  79 utf *utf_java_lang_SecurityManager;
  80 utf *utf_java_lang_String;
  81 utf *utf_java_lang_System;
  82 utf *utf_java_lang_ThreadGroup;
  83 utf *utf_java_io_Serializable;
  84
  85 utf *utf_java_lang_Throwable;
  86 utf *utf_java_lang_VMThrowable;
  87 utf *utf_java_lang_Error;
  88 utf *utf_java_lang_AbstractMethodError;
  89 utf *utf_java_lang_LinkageError;
  90 utf *utf_java_lang_NoClassDefFoundError;
  91 utf *utf_java_lang_NoSuchMethodError;
  92 utf *utf_java_lang_OutOfMemoryError;
  93
  94 utf *utf_java_lang_Exception;
  95 utf *utf_java_lang_ClassCastException;
  96 utf *utf_java_lang_ClassNotFoundException;
  97 utf *utf_java_lang_IllegalArgumentException;
  98 utf *utf_java_lang_IllegalMonitorStateException;
  99
 100 utf *utf_java_lang_NullPointerException;
 101
 102 utf* utf_java_lang_Void;
 103 utf* utf_java_lang_Boolean;
 104 utf* utf_java_lang_Byte;
 105 utf* utf_java_lang_Character;
 106 utf* utf_java_lang_Short;
 107 utf* utf_java_lang_Integer;
 108 utf* utf_java_lang_Long;
 109 utf* utf_java_lang_Float;
 110 utf* utf_java_lang_Double;
 111
 112 utf *utf_java_lang_StackTraceElement;
 113 utf *utf_java_lang_reflect_Constructor;
 114 utf *utf_java_lang_reflect_Field;
 115 utf *utf_java_lang_reflect_Method;
 116 utf *utf_java_util_Vector;
 117
 118 utf *utf_InnerClasses;                  /* InnerClasses                       */
 119 utf *utf_ConstantValue;                 /* ConstantValue                      */
 120 utf *utf_Code;                          /* Code                               */
 121 utf *utf_Exceptions;                    /* Exceptions                         */
 122 utf *utf_LineNumberTable;               /* LineNumberTable                    */
 123 utf *utf_SourceFile;                    /* SourceFile                         */
 124 utf *utf_Signature;                     /* Signature                          */
 125
 126 utf *utf_init;                          /* <init>                             */
 127 utf *utf_clinit;                        /* <clinit>                           */
 128 utf *utf_clone;                         /* clone                              */
 129 utf *utf_finalize;                      /* finalize                           */
 130 utf *utf_run;                           /* run                                */
 131
 132 utf *utf_add;                           /* add                                */
 133 utf *utf_remove;                        /* remove                             */
 134 utf *utf_removeThread;                  /* removeThread                       */
 135 utf *utf_put;                           /* put                                */
 136 utf *utf_get;                           /* get                                */
 137 utf *utf_value;                         /* value                              */
 138
 139 utf *utf_fillInStackTrace;              /* fillInStackTrace                   */
 140 utf *utf_getSystemClassLoader;          /* getSystemClassLoader               */
 141 utf *utf_loadClass;                     /* loadClass                          */
 142 utf *utf_printStackTrace;               /* printStackTrace                    */
 143
 144 utf *utf_Z;                             /* Z                                  */
 145 utf *utf_B;                             /* B                                  */
 146 utf *utf_C;                             /* C                                  */
 147 utf *utf_S;                             /* S                                  */
 148 utf *utf_I;                             /* I                                  */
 149 utf *utf_J;                             /* J                                  */
 150 utf *utf_F;                             /* F                                  */
 151 utf *utf_D;                             /* D                                  */
 152
 153 utf *utf_void__void;                    /* ()V                                */
 154 utf *utf_boolean__void;                 /* (Z)V                               */
 155 utf *utf_byte__void;                    /* (B)V                               */
 156 utf *utf_char__void;                    /* (C)V                               */
 157 utf *utf_short__void;                   /* (S)V                               */
 158 utf *utf_int__void;                     /* (I)V                               */
 159 utf *utf_long__void;                    /* (J)V                               */
 160 utf *utf_float__void;                   /* (F)V                               */
 161 utf *utf_double__void;                  /* (D)V                               */
 162
 163 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
 164 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 165 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 166 utf *utf_java_lang_Object__java_lang_Object; /* (Ljava/lang/Object;)Ljava/lang/Object; */
 167 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 168 utf *utf_java_lang_String__java_lang_Class; /* (Ljava/lang/String;)Ljava/lang/Class; */
 169 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
 170 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 171
 172 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
 173 utf *utf_null;                          /* null                               */
 174 utf *array_packagename;                 /* "\t<the array package>"            */
 175
 176
 177 /* utf_init ********************************************************************
 178
 179    Initializes the utf8 subsystem.
 180
 181 *******************************************************************************/
 182
 183 bool utf8_init(void)
 184 {
 185         /* create utf8 hashtable */
 186
 187         hashtable_utf = NEW(hashtable);
 188
 189         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
 190
 191 #if defined(ENABLE_STATISTICS)
 192         if (opt_stat)
 193                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
 194 #endif
 195
 196         /* create utf-symbols for pointer comparison of frequently used strings */
 197
 198         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 199
 200         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 201         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 202         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 203         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 204         utf_java_lang_String           = utf_new_char("java/lang/String");
 205         utf_java_lang_System           = utf_new_char("java/lang/System");
 206         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 207         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 208
 209         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
 210         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
 211         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
 212
 213         utf_java_lang_AbstractMethodError =
 214                 utf_new_char(string_java_lang_AbstractMethodError);
 215
 216         utf_java_lang_LinkageError =
 217                 utf_new_char(string_java_lang_LinkageError);
 218
 219         utf_java_lang_NoClassDefFoundError =
 220                 utf_new_char(string_java_lang_NoClassDefFoundError);
 221
 222         utf_java_lang_NoSuchMethodError =
 223                 utf_new_char(string_java_lang_NoSuchMethodError);
 224
 225         utf_java_lang_OutOfMemoryError =
 226                 utf_new_char(string_java_lang_OutOfMemoryError);
 227
 228         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
 229
 230         utf_java_lang_ClassCastException =
 231                 utf_new_char(string_java_lang_ClassCastException);
 232
 233         utf_java_lang_ClassNotFoundException =
 234                 utf_new_char(string_java_lang_ClassNotFoundException);
 235
 236         utf_java_lang_IllegalArgumentException =
 237                 utf_new_char(string_java_lang_IllegalArgumentException);
 238
 239         utf_java_lang_IllegalMonitorStateException =
 240                 utf_new_char(string_java_lang_IllegalMonitorStateException);
 241
 242         utf_java_lang_NullPointerException =
 243                 utf_new_char(string_java_lang_NullPointerException);
 244
 245         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 246         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 247         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 248         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 249         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 250         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 251         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 252         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 253         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 254
 255         utf_java_lang_StackTraceElement =
 256                 utf_new_char("java/lang/StackTraceElement");
 257
 258         utf_java_lang_reflect_Constructor =
 259                 utf_new_char("java/lang/reflect/Constructor");
 260
 261         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
 262         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
 263         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 264
 265         utf_InnerClasses               = utf_new_char("InnerClasses");
 266         utf_ConstantValue              = utf_new_char("ConstantValue");
 267         utf_Code                       = utf_new_char("Code");
 268         utf_Exceptions                 = utf_new_char("Exceptions");
 269         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 270         utf_SourceFile                 = utf_new_char("SourceFile");
 271         utf_Signature                  = utf_new_char("Signature");
 272
 273         utf_init                           = utf_new_char("<init>");
 274         utf_clinit                         = utf_new_char("<clinit>");
 275         utf_clone                      = utf_new_char("clone");
 276         utf_finalize                   = utf_new_char("finalize");
 277         utf_run                        = utf_new_char("run");
 278
 279         utf_add                        = utf_new_char("add");
 280         utf_remove                     = utf_new_char("remove");
 281         utf_removeThread               = utf_new_char("removeThread");
 282         utf_put                        = utf_new_char("put");
 283         utf_get                        = utf_new_char("get");
 284         utf_value                      = utf_new_char("value");
 285
 286         utf_printStackTrace            = utf_new_char("printStackTrace");
 287         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 288         utf_loadClass                  = utf_new_char("loadClass");
 289         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
 290
 291         utf_Z                          = utf_new_char("Z");
 292         utf_B                          = utf_new_char("B");
 293         utf_C                          = utf_new_char("C");
 294         utf_S                          = utf_new_char("S");
 295         utf_I                          = utf_new_char("I");
 296         utf_J                          = utf_new_char("J");
 297         utf_F                          = utf_new_char("F");
 298         utf_D                          = utf_new_char("D");
 299
 300         utf_void__void                 = utf_new_char("()V");
 301         utf_boolean__void              = utf_new_char("(Z)V");
 302         utf_byte__void                 = utf_new_char("(B)V");
 303         utf_char__void                 = utf_new_char("(C)V");
 304         utf_short__void                = utf_new_char("(S)V");
 305         utf_int__void                  = utf_new_char("(I)V");
 306         utf_long__void                 = utf_new_char("(J)V");
 307         utf_float__void                = utf_new_char("(F)V");
 308         utf_double__void               = utf_new_char("(D)V");
 309         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 310         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 311
 312         utf_void__java_lang_ClassLoader =
 313                 utf_new_char("()Ljava/lang/ClassLoader;");
 314
 315         utf_java_lang_Object__java_lang_Object =
 316                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
 317
 318         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 319
 320         utf_java_lang_String__java_lang_Class =
 321                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 322
 323         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
 324         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 325
 326         utf_null                       = utf_new_char("null");
 327         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 328         array_packagename              = utf_new_char("\t<the array package>");
 329
 330         /* everything's ok */
 331
 332         return true;
 333 }
 334
 335
 336 /* utf_hashkey *****************************************************************
 337
 338    The hashkey is computed from the utf-text by using up to 8
 339    characters.  For utf-symbols longer than 15 characters 3 characters
 340    are taken from the beginning and the end, 2 characters are taken
 341    from the middle.
 342
 343 *******************************************************************************/
 344
 345 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 346 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 347
 348 u4 utf_hashkey(const char *text, u4 length)
 349 {
 350         const char *start_pos = text;       /* pointer to utf text                */
 351         u4 a;
 352
 353         switch (length) {
 354         case 0: /* empty string */
 355                 return 0;
 356
 357         case 1: return fbs(0);
 358         case 2: return fbs(0) ^ nbs(3);
 359         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 360         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 361         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 362         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 363         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 364         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 365
 366         case 9:
 367                 a = fbs(0);
 368                 a ^= nbs(1);
 369                 a ^= nbs(2);
 370                 text++;
 371                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 372
 373         case 10:
 374                 a = fbs(0);
 375                 text++;
 376                 a ^= nbs(2);
 377                 a ^= nbs(3);
 378                 a ^= nbs(4);
 379                 text++;
 380                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 381
 382         case 11:
 383                 a = fbs(0);
 384                 text++;
 385                 a ^= nbs(2);
 386                 a ^= nbs(3);
 387                 a ^= nbs(4);
 388                 text++;
 389                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 390
 391         case 12:
 392                 a = fbs(0);
 393                 text += 2;
 394                 a ^= nbs(2);
 395                 a ^= nbs(3);
 396                 text++;
 397                 a ^= nbs(5);
 398                 a ^= nbs(6);
 399                 a ^= nbs(7);
 400                 text++;
 401                 return a ^ nbs(9) ^ nbs(10);
 402
 403         case 13:
 404                 a = fbs(0);
 405                 a ^= nbs(1);
 406                 text++;
 407                 a ^= nbs(3);
 408                 a ^= nbs(4);
 409                 text += 2;
 410                 a ^= nbs(7);
 411                 a ^= nbs(8);
 412                 text += 2;
 413                 return a ^ nbs(9) ^ nbs(10);
 414
 415         case 14:
 416                 a = fbs(0);
 417                 text += 2;
 418                 a ^= nbs(3);
 419                 a ^= nbs(4);
 420                 text += 2;
 421                 a ^= nbs(7);
 422                 a ^= nbs(8);
 423                 text += 2;
 424                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 425
 426         case 15:
 427                 a = fbs(0);
 428                 text += 2;
 429                 a ^= nbs(3);
 430                 a ^= nbs(4);
 431                 text += 2;
 432                 a ^= nbs(7);
 433                 a ^= nbs(8);
 434                 text += 2;
 435                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 436
 437         default:  /* 3 characters from beginning */
 438                 a = fbs(0);
 439                 text += 2;
 440                 a ^= nbs(3);
 441                 a ^= nbs(4);
 442
 443                 /* 2 characters from middle */
 444                 text = start_pos + (length / 2);
 445                 a ^= fbs(5);
 446                 text += 2;
 447                 a ^= nbs(6);
 448
 449                 /* 3 characters from end */
 450                 text = start_pos + length - 4;
 451
 452                 a ^= fbs(7);
 453                 text++;
 454
 455                 return a ^ nbs(10) ^ nbs(11);
 456     }
 457 }
 458
 459 /* utf_full_hashkey ************************************************************
 460
 461    This function computes a hash value using all bytes in the string.
 462
 463    The algorithm is the "One-at-a-time" algorithm as published
 464    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
 465
 466 *******************************************************************************/
 467
 468 u4 utf_full_hashkey(const char *text, u4 length)
 469 {
 470         register const unsigned char *p = (const unsigned char *) text;
 471         register u4 hash;
 472         register u4 i;
 473
 474         hash = 0;
 475         for (i=length; i--;)
 476         {
 477             hash += *p++;
 478             hash += (hash << 10);
 479             hash ^= (hash >> 6);
 480         }
 481         hash += (hash << 3);
 482         hash ^= (hash >> 11);
 483         hash += (hash << 15);
 484
 485         return hash;
 486 }
 487
 488 /* unicode_hashkey *************************************************************
 489
 490    Compute the hashkey of a unicode string.
 491
 492 *******************************************************************************/
 493
 494 u4 unicode_hashkey(u2 *text, u2 len)
 495 {
 496         return utf_hashkey((char *) text, len);
 497 }
 498
 499
 500 /* utf_new *********************************************************************
 501
 502    Creates a new utf-symbol, the text of the symbol is passed as a
 503    u1-array. The function searches the utf-hashtable for a utf-symbol
 504    with this text. On success the element returned, otherwise a new
 505    hashtable element is created.
 506
 507    If the number of entries in the hashtable exceeds twice the size of
 508    the hashtable slots a reorganization of the hashtable is done and
 509    the utf symbols are copied to a new hashtable with doubled size.
 510
 511 *******************************************************************************/
 512
 513 utf *utf_new(const char *text, u2 length)
 514 {
 515         u4 key;                             /* hashkey computed from utf-text     */
 516         u4 slot;                            /* slot in hashtable                  */
 517         utf *u;                             /* hashtable element                  */
 518         u2 i;
 519
 520         LOCK_MONITOR_ENTER(hashtable_utf->header);
 521
 522 #if defined(ENABLE_STATISTICS)
 523         if (opt_stat)
 524                 count_utf_new++;
 525 #endif
 526
 527         key  = utf_hashkey(text, length);
 528         slot = key & (hashtable_utf->size - 1);
 529         u    = hashtable_utf->ptr[slot];
 530
 531         /* search external hash chain for utf-symbol */
 532
 533         while (u) {
 534                 if (u->blength == length) {
 535                         /* compare text of hashtable elements */
 536
 537                         for (i = 0; i < length; i++)
 538                                 if (text[i] != u->text[i])
 539                                         goto nomatch;
 540
 541 #if defined(ENABLE_STATISTICS)
 542                         if (opt_stat)
 543                                 count_utf_new_found++;
 544 #endif
 545
 546                         /* symbol found in hashtable */
 547
 548                         LOCK_MONITOR_EXIT(hashtable_utf->header);
 549
 550                         return u;
 551                 }
 552
 553         nomatch:
 554                 u = u->hashlink; /* next element in external chain */
 555         }
 556
 557 #if defined(ENABLE_STATISTICS)
 558         if (opt_stat)
 559                 count_utf_len += sizeof(utf) + length + 1;
 560 #endif
 561
 562         /* location in hashtable found, create new utf element */
 563         u = NEW(utf);
 564         u->blength  = length;               /* length in bytes of utfstring       */
 565         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
 566         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 567
 568         memcpy(u->text, text, length);      /* copy utf-text                      */
 569         u->text[length] = '\0';
 570
 571         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
 572         hashtable_utf->entries++;           /* update number of entries           */
 573
 574         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
 575
 576         /* reorganization of hashtable, average length of the external
 577            chains is approx. 2 */
 578
 579                 hashtable *newhash;                              /* the new hashtable */
 580                 u4         i;
 581                 utf       *u;
 582                 utf       *nextu;
 583                 u4         slot;
 584
 585                 /* create new hashtable, double the size */
 586
 587                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
 588
 589 #if defined(ENABLE_STATISTICS)
 590                 if (opt_stat)
 591                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
 592 #endif
 593
 594                 /* transfer elements to new hashtable */
 595
 596                 for (i = 0; i < hashtable_utf->size; i++) {
 597                         u = hashtable_utf->ptr[i];
 598
 599                         while (u) {
 600                                 nextu = u->hashlink;
 601                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
 602
 603                                 u->hashlink = (utf *) newhash->ptr[slot];
 604                                 newhash->ptr[slot] = u;
 605
 606                                 /* follow link in external hash chain */
 607
 608                                 u = nextu;
 609                         }
 610                 }
 611
 612                 /* dispose old table */
 613
 614                 hashtable_free(hashtable_utf);
 615
 616                 hashtable_utf = newhash;
 617         }
 618
 619         LOCK_MONITOR_EXIT(hashtable_utf->header);
 620
 621         return u;
 622 }
 623
 624
 625 /* utf_new_u2 ******************************************************************
 626
 627    Make utf symbol from u2 array, if isclassname is true '.' is
 628    replaced by '/'.
 629
 630 *******************************************************************************/
 631
 632 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 633 {
 634         char *buffer;                   /* memory buffer for  unicode characters  */
 635         char *pos;                      /* pointer to current position in buffer  */
 636         u4 left;                        /* unicode characters left                */
 637         u4 buflength;                   /* utf length in bytes of the u2 array    */
 638         utf *result;                    /* resulting utf-string                   */
 639         int i;
 640
 641         /* determine utf length in bytes and allocate memory */
 642
 643         buflength = u2_utflength(unicode_pos, unicode_length);
 644         buffer    = MNEW(char, buflength);
 645
 646         left = buflength;
 647         pos  = buffer;
 648
 649         for (i = 0; i++ < unicode_length; unicode_pos++) {
 650                 /* next unicode character */
 651                 u2 c = *unicode_pos;
 652
 653                 if ((c != 0) && (c < 0x80)) {
 654                         /* 1 character */
 655                         left--;
 656                 if ((int) left < 0) break;
 657                         /* convert classname */
 658                         if (isclassname && c == '.')
 659                                 *pos++ = '/';
 660                         else
 661                                 *pos++ = (char) c;
 662
 663                 } else if (c < 0x800) {
 664                         /* 2 characters */
 665                 unsigned char high = c >> 6;
 666                 unsigned char low  = c & 0x3F;
 667                         left = left - 2;
 668                 if ((int) left < 0) break;
 669                 *pos++ = high | 0xC0;
 670                 *pos++ = low  | 0x80;
 671
 672                 } else {
 673                 /* 3 characters */
 674                 char low  = c & 0x3f;
 675                 char mid  = (c >> 6) & 0x3F;
 676                 char high = c >> 12;
 677                         left = left - 3;
 678                 if ((int) left < 0) break;
 679                 *pos++ = high | 0xE0;
 680                 *pos++ = mid  | 0x80;
 681                 *pos++ = low  | 0x80;
 682                 }
 683         }
 684
 685         /* insert utf-string into symbol-table */
 686         result = utf_new(buffer,buflength);
 687
 688         MFREE(buffer, char, buflength);
 689
 690         return result;
 691 }
 692
 693
 694 /* utf_new_char ****************************************************************
 695
 696    Creates a new utf symbol, the text for this symbol is passed as a
 697    c-string ( = char* ).
 698
 699 *******************************************************************************/
 700
 701 utf *utf_new_char(const char *text)
 702 {
 703         return utf_new(text, strlen(text));
 704 }
 705
 706
 707 /* utf_new_char_classname ******************************************************
 708
 709    Creates a new utf symbol, the text for this symbol is passed as a
 710    c-string ( = char* ) "." characters are going to be replaced by
 711    "/". Since the above function is used often, this is a separte
 712    function, instead of an if.
 713
 714 *******************************************************************************/
 715
 716 utf *utf_new_char_classname(const char *text)
 717 {
 718         if (strchr(text, '.')) {
 719                 char *txt = strdup(text);
 720                 char *end = txt + strlen(txt);
 721                 char *c;
 722                 utf *tmpRes;
 723
 724                 for (c = txt; c < end; c++)
 725                         if (*c == '.') *c = '/';
 726
 727                 tmpRes = utf_new(txt, strlen(txt));
 728                 FREE(txt, 0);
 729
 730                 return tmpRes;
 731
 732         } else
 733                 return utf_new(text, strlen(text));
 734 }
 735
 736
 737 /* utf_nextu2 ******************************************************************
 738
 739    Read the next unicode character from the utf string and increment
 740    the utf-string pointer accordingly.
 741
 742    CAUTION: This function is unsafe for input that was not checked
 743             by is_valid_utf!
 744
 745 *******************************************************************************/
 746
 747 u2 utf_nextu2(char **utf_ptr)
 748 {
 749     /* uncompressed unicode character */
 750     u2 unicode_char = 0;
 751     /* current position in utf text */
 752     unsigned char *utf = (unsigned char *) (*utf_ptr);
 753     /* bytes representing the unicode character */
 754     unsigned char ch1, ch2, ch3;
 755     /* number of bytes used to represent the unicode character */
 756     int len = 0;
 757
 758     switch ((ch1 = utf[0]) >> 4) {
 759         default: /* 1 byte */
 760                 (*utf_ptr)++;
 761                 return (u2) ch1;
 762         case 0xC:
 763         case 0xD: /* 2 bytes */
 764                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 765                         unsigned char high = ch1 & 0x1F;
 766                         unsigned char low  = ch2 & 0x3F;
 767                         unicode_char = (high << 6) + low;
 768                         len = 2;
 769                 }
 770                 break;
 771
 772         case 0xE: /* 2 or 3 bytes */
 773                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 774                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 775                                 unsigned char low  = ch3 & 0x3f;
 776                                 unsigned char mid  = ch2 & 0x3f;
 777                                 unsigned char high = ch1 & 0x0f;
 778                                 unicode_char = (((high << 6) + mid) << 6) + low;
 779                                 len = 3;
 780                         } else
 781                                 len = 2;
 782                 }
 783                 break;
 784     }
 785
 786     /* update position in utf-text */
 787     *utf_ptr = (char *) (utf + len);
 788
 789     return unicode_char;
 790 }
 791
 792
 793 /* utf_bytes *******************************************************************
 794
 795    Determine number of bytes (aka. octets) in the utf string.
 796
 797    IN:
 798       u............utf string
 799
 800    OUT:
 801       The number of octets of this utf string.
 802           There is _no_ terminating zero included in this count.
 803
 804 *******************************************************************************/
 805
 806 u4 utf_bytes(utf *u)
 807 {
 808         return u->blength;
 809 }
 810
 811 /* utf_get_number_of_u2s_for_buffer ********************************************
 812
 813    Determine number of UTF-16 u2s in the given UTF-8 buffer
 814
 815    CAUTION: This function is unsafe for input that was not checked
 816             by is_valid_utf!
 817
 818    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
 819    to an array of u2s (UTF-16) and want to know how many of them you will get.
 820    All other uses of this function are probably wrong.
 821
 822    IN:
 823       buffer........points to first char in buffer
 824           blength.......number of _bytes_ in the buffer
 825
 826    OUT:
 827       the number of u2s needed to hold this string in UTF-16 encoding.
 828           There is _no_ terminating zero included in this count.
 829
 830    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
 831    exception.
 832
 833 *******************************************************************************/
 834
 835 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
 836 {
 837         const char *endpos;                 /* points behind utf string           */
 838         const char *utf_ptr;                /* current position in utf text       */
 839         u4 len = 0;                         /* number of unicode characters       */
 840
 841         utf_ptr = buffer;
 842         endpos = utf_ptr + blength;
 843
 844         while (utf_ptr < endpos) {
 845                 len++;
 846                 /* next unicode character */
 847                 utf_nextu2((char **)&utf_ptr);
 848         }
 849
 850         assert(utf_ptr == endpos);
 851
 852         return len;
 853 }
 854
 855
 856 /* utf_get_number_of_u2s *******************************************************
 857
 858    Determine number of UTF-16 u2s in the utf string.
 859
 860    CAUTION: This function is unsafe for input that was not checked
 861             by is_valid_utf!
 862
 863    CAUTION: Use this function *only* when you want to convert a utf string
 864    to an array of u2s and want to know how many of them you will get.
 865    All other uses of this function are probably wrong.
 866
 867    IN:
 868       u............utf string
 869
 870    OUT:
 871       the number of u2s needed to hold this string in UTF-16 encoding.
 872           There is _no_ terminating zero included in this count.
 873           XXX 0 if a NullPointerException has been thrown (see below)
 874
 875 *******************************************************************************/
 876
 877 u4 utf_get_number_of_u2s(utf *u)
 878 {
 879         char *endpos;                       /* points behind utf string           */
 880         char *utf_ptr;                      /* current position in utf text       */
 881         u4 len = 0;                         /* number of unicode characters       */
 882
 883         /* XXX this is probably not checked by most callers! Review this after */
 884         /* the invalid uses of this function have been eliminated */
 885         if (!u) {
 886                 exceptions_throw_nullpointerexception();
 887                 return 0;
 888         }
 889
 890         endpos = UTF_END(u);
 891         utf_ptr = u->text;
 892
 893         while (utf_ptr < endpos) {
 894                 len++;
 895                 /* next unicode character */
 896                 utf_nextu2(&utf_ptr);
 897         }
 898
 899         if (utf_ptr != endpos)
 900                 /* string ended abruptly */
 901                 throw_cacao_exception_exit(string_java_lang_InternalError,
 902                                                                    "Illegal utf8 string");
 903
 904         return len;
 905 }
 906
 907
 908 /* utf8_safe_number_of_u2s *****************************************************
 909
 910    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
 911    (For invalid UTF-8 the U+fffd replacement character will be counted.)
 912
 913    This function is safe even for invalid UTF-8 strings.
 914
 915    IN:
 916       text..........zero-terminated(!) UTF-8 string (may be invalid)
 917                         must NOT be NULL
 918           nbytes........strlen(text). (This is needed to completely emulate
 919                         the RI).
 920
 921    OUT:
 922       the number of u2s needed to hold this string in UTF-16 encoding.
 923           There is _no_ terminating zero included in this count.
 924
 925 *******************************************************************************/
 926
 927 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
 928         register const unsigned char *t;
 929         register s4 byte;
 930         register s4 len;
 931         register const unsigned char *tlimit;
 932         s4 byte1;
 933         s4 byte2;
 934         s4 byte3;
 935         s4 value;
 936         s4 skip;
 937
 938         assert(text);
 939         assert(nbytes >= 0);
 940
 941         len = 0;
 942         t = (const unsigned char *) text;
 943         tlimit = t + nbytes;
 944
 945         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
 946
 947         while (1) {
 948                 byte = *t++;
 949
 950                 if (byte & 0x80) {
 951                         /* highest bit set, non-ASCII character */
 952
 953                         if ((byte & 0xe0) == 0xc0) {
 954                                 /* 2-byte: should be 110..... 10...... ? */
 955
 956                                 if ((*t++ & 0xc0) == 0x80)
 957                                         ; /* valid 2-byte */
 958                                 else
 959                                         t--; /* invalid */
 960                         }
 961                         else if ((byte & 0xf0) == 0xe0) {
 962                                 /* 3-byte: should be 1110.... 10...... 10...... */
 963                                 /*                            ^t                */
 964
 965                                 if (t + 2 > tlimit)
 966                                         return len + 1; /* invalid, stop here */
 967
 968                                 if ((*t++ & 0xc0) == 0x80) {
 969                                         if ((*t++ & 0xc0) == 0x80)
 970                                                 ; /* valid 3-byte */
 971                                         else
 972                                                 t--; /* invalid */
 973                                 }
 974                                 else
 975                                         t--; /* invalid */
 976                         }
 977                         else if ((byte & 0xf8) == 0xf0) {
 978                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
 979                                 /*                            ^t                         */
 980
 981                                 if (t + 3 > tlimit)
 982                                         return len + 1; /* invalid, stop here */
 983
 984                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
 985                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
 986                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
 987                                                         /* valid 4-byte UTF-8? */
 988                                                         value = ((byte  & 0x07) << 18)
 989                                                                   | ((byte1 & 0x3f) << 12)
 990                                                                   | ((byte2 & 0x3f) <<  6)
 991                                                                   | ((byte3 & 0x3f)      );
 992
 993                                                         if (value > 0x10FFFF)
 994                                                                 ; /* invalid */
 995                                                         else if (value > 0xFFFF)
 996                                                                 len += 1; /* we need surrogates */
 997                                                         else
 998                                                                 ; /* 16bit suffice */
 999                                                 }
1000                                                 else
1001                                                         t--; /* invalid */
1002                                         }
1003                                         else
1004                                                 t--; /* invalid */
1005                                 }
1006                                 else
1007                                         t--; /* invalid */
1008                         }
1009                         else if ((byte & 0xfc) == 0xf8) {
1010                                 /* invalid 5-byte */
1011                                 if (t + 4 > tlimit)
1012                                         return len + 1; /* invalid, stop here */
1013
1014                                 skip = 4;
1015                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1016                                         t++;
1017                         }
1018                         else if ((byte & 0xfe) == 0xfc) {
1019                                 /* invalid 6-byte */
1020                                 if (t + 5 > tlimit)
1021                                         return len + 1; /* invalid, stop here */
1022
1023                                 skip = 5;
1024                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1025                                         t++;
1026                         }
1027                         else
1028                                 ; /* invalid */
1029                 }
1030                 else {
1031                         /* NUL */
1032
1033                         if (byte == 0)
1034                                 break;
1035
1036                         /* ASCII character, common case */
1037                 }
1038
1039                 len++;
1040         }
1041
1042         return len;
1043 }
1044
1045
1046 /* utf8_safe_convert_to_u2s ****************************************************
1047
1048    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1049    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1050    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1051
1052    This function is safe even for invalid UTF-8 strings.
1053
1054    IN:
1055       text..........zero-terminated(!) UTF-8 string (may be invalid)
1056                         must NOT be NULL
1057           nbytes........strlen(text). (This is needed to completely emulate
1058                                         the RI).
1059           buffer........a preallocated array of u2s to receive the decoded
1060                         string. Use utf8_safe_number_of_u2s to get the
1061                                         required number of u2s for allocating this.
1062
1063 *******************************************************************************/
1064
1065 #define UNICODE_REPLACEMENT  0xfffd
1066
1067 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1068         register const unsigned char *t;
1069         register s4 byte;
1070         register const unsigned char *tlimit;
1071         s4 byte1;
1072         s4 byte2;
1073         s4 byte3;
1074         s4 value;
1075         s4 skip;
1076
1077         assert(text);
1078         assert(nbytes >= 0);
1079
1080         t = (const unsigned char *) text;
1081         tlimit = t + nbytes;
1082
1083         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1084
1085         while (1) {
1086                 byte = *t++;
1087
1088                 if (byte & 0x80) {
1089                         /* highest bit set, non-ASCII character */
1090
1091                         if ((byte & 0xe0) == 0xc0) {
1092                                 /* 2-byte: should be 110..... 10...... */
1093
1094                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1095                                         /* valid 2-byte UTF-8 */
1096                                         *buffer++ = ((byte  & 0x1f) << 6)
1097                                                           | ((byte1 & 0x3f)     );
1098                                 }
1099                                 else {
1100                                         *buffer++ = UNICODE_REPLACEMENT;
1101                                         t--;
1102                                 }
1103                         }
1104                         else if ((byte & 0xf0) == 0xe0) {
1105                                 /* 3-byte: should be 1110.... 10...... 10...... */
1106
1107                                 if (t + 2 > tlimit) {
1108                                         *buffer++ = UNICODE_REPLACEMENT;
1109                                         return;
1110                                 }
1111
1112                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1113                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1114                                                 /* valid 3-byte UTF-8 */
1115                                                 *buffer++ = ((byte  & 0x0f) << 12)
1116                                                                   | ((byte1 & 0x3f) <<  6)
1117                                                                   | ((byte2 & 0x3f)      );
1118                                         }
1119                                         else {
1120                                                 *buffer++ = UNICODE_REPLACEMENT;
1121                                                 t--;
1122                                         }
1123                                 }
1124                                 else {
1125                                         *buffer++ = UNICODE_REPLACEMENT;
1126                                         t--;
1127                                 }
1128                         }
1129                         else if ((byte & 0xf8) == 0xf0) {
1130                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1131
1132                                 if (t + 3 > tlimit) {
1133                                         *buffer++ = UNICODE_REPLACEMENT;
1134                                         return;
1135                                 }
1136
1137                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1138                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1139                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1140                                                         /* valid 4-byte UTF-8? */
1141                                                         value = ((byte  & 0x07) << 18)
1142                                                                   | ((byte1 & 0x3f) << 12)
1143                                                                   | ((byte2 & 0x3f) <<  6)
1144                                                                   | ((byte3 & 0x3f)      );
1145
1146                                                         if (value > 0x10FFFF) {
1147                                                                 *buffer++ = UNICODE_REPLACEMENT;
1148                                                         }
1149                                                         else if (value > 0xFFFF) {
1150                                                                 /* we need surrogates */
1151                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1152                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1153                                                         }
1154                                                         else
1155                                                                 *buffer++ = value; /* 16bit suffice */
1156                                                 }
1157                                                 else {
1158                                                         *buffer++ = UNICODE_REPLACEMENT;
1159                                                         t--;
1160                                                 }
1161                                         }
1162                                         else {
1163                                                 *buffer++ = UNICODE_REPLACEMENT;
1164                                                 t--;
1165                                         }
1166                                 }
1167                                 else {
1168                                         *buffer++ = UNICODE_REPLACEMENT;
1169                                         t--;
1170                                 }
1171                         }
1172                         else if ((byte & 0xfc) == 0xf8) {
1173                                 if (t + 4 > tlimit) {
1174                                         *buffer++ = UNICODE_REPLACEMENT;
1175                                         return;
1176                                 }
1177
1178                                 skip = 4;
1179                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1180                                         t++;
1181                                 *buffer++ = UNICODE_REPLACEMENT;
1182                         }
1183                         else if ((byte & 0xfe) == 0xfc) {
1184                                 if (t + 5 > tlimit) {
1185                                         *buffer++ = UNICODE_REPLACEMENT;
1186                                         return;
1187                                 }
1188
1189                                 skip = 5;
1190                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1191                                         t++;
1192                                 *buffer++ = UNICODE_REPLACEMENT;
1193                         }
1194                         else
1195                                 *buffer++ = UNICODE_REPLACEMENT;
1196                 }
1197                 else {
1198                         /* NUL */
1199
1200                         if (byte == 0)
1201                                 break;
1202
1203                         /* ASCII character, common case */
1204
1205                         *buffer++ = byte;
1206                 }
1207         }
1208 }
1209
1210
1211 /* u2_utflength ****************************************************************
1212
1213    Returns the utf length in bytes of a u2 array.
1214
1215 *******************************************************************************/
1216
1217 u4 u2_utflength(u2 *text, u4 u2_length)
1218 {
1219         u4 result_len = 0;                  /* utf length in bytes                */
1220         u2 ch;                              /* current unicode character          */
1221         u4 len;
1222
1223         for (len = 0; len < u2_length; len++) {
1224                 /* next unicode character */
1225                 ch = *text++;
1226
1227                 /* determine bytes required to store unicode character as utf */
1228                 if (ch && (ch < 0x80))
1229                         result_len++;
1230                 else if (ch < 0x800)
1231                         result_len += 2;
1232                 else
1233                         result_len += 3;
1234         }
1235
1236     return result_len;
1237 }
1238
1239
1240 /* utf_copy ********************************************************************
1241
1242    Copy the given utf string byte-for-byte to a buffer.
1243
1244    IN:
1245       buffer.......the buffer
1246           u............the utf string
1247
1248 *******************************************************************************/
1249
1250 void utf_copy(char *buffer, utf *u)
1251 {
1252         /* our utf strings are zero-terminated (done by utf_new) */
1253         MCOPY(buffer, u->text, char, u->blength + 1);
1254 }
1255
1256
1257 /* utf_cat *********************************************************************
1258
1259    Append the given utf string byte-for-byte to a buffer.
1260
1261    IN:
1262       buffer.......the buffer
1263           u............the utf string
1264
1265 *******************************************************************************/
1266
1267 void utf_cat(char *buffer, utf *u)
1268 {
1269         /* our utf strings are zero-terminated (done by utf_new) */
1270         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1271 }
1272
1273
1274 /* utf_copy_classname **********************************************************
1275
1276    Copy the given utf classname byte-for-byte to a buffer.
1277    '/' is replaced by '.'
1278
1279    IN:
1280       buffer.......the buffer
1281           u............the utf string
1282
1283 *******************************************************************************/
1284
1285 void utf_copy_classname(char *buffer, utf *u)
1286 {
1287         char *bufptr;
1288         char *srcptr;
1289         char *endptr;
1290         char ch;
1291
1292         bufptr = buffer;
1293         srcptr = u->text;
1294         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1295
1296         while (srcptr != endptr) {
1297                 ch = *srcptr++;
1298                 if (ch == '/')
1299                         ch = '.';
1300                 *bufptr++ = ch;
1301         }
1302 }
1303
1304
1305 /* utf_cat *********************************************************************
1306
1307    Append the given utf classname byte-for-byte to a buffer.
1308    '/' is replaced by '.'
1309
1310    IN:
1311       buffer.......the buffer
1312           u............the utf string
1313
1314 *******************************************************************************/
1315
1316 void utf_cat_classname(char *buffer, utf *u)
1317 {
1318         utf_copy_classname(buffer + strlen(buffer), u);
1319 }
1320
1321 /* utf_display_printable_ascii *************************************************
1322
1323    Write utf symbol to stdout (for debugging purposes).
1324    Non-printable and non-ASCII characters are printed as '?'.
1325
1326 *******************************************************************************/
1327
1328 void utf_display_printable_ascii(utf *u)
1329 {
1330         char *endpos;                       /* points behind utf string           */
1331         char *utf_ptr;                      /* current position in utf text       */
1332
1333         if (u == NULL) {
1334                 printf("NULL");
1335                 fflush(stdout);
1336                 return;
1337         }
1338
1339         endpos = UTF_END(u);
1340         utf_ptr = u->text;
1341
1342         while (utf_ptr < endpos) {
1343                 /* read next unicode character */
1344
1345                 u2 c = utf_nextu2(&utf_ptr);
1346
1347                 if ((c >= 32) && (c <= 127))
1348                         printf("%c", c);
1349                 else
1350                         printf("?");
1351         }
1352
1353         fflush(stdout);
1354 }
1355
1356
1357 /* utf_display_printable_ascii_classname ***************************************
1358
1359    Write utf symbol to stdout with `/' converted to `.' (for debugging
1360    purposes).
1361    Non-printable and non-ASCII characters are printed as '?'.
1362
1363 *******************************************************************************/
1364
1365 void utf_display_printable_ascii_classname(utf *u)
1366 {
1367         char *endpos;                       /* points behind utf string           */
1368         char *utf_ptr;                      /* current position in utf text       */
1369
1370         if (u == NULL) {
1371                 printf("NULL");
1372                 fflush(stdout);
1373                 return;
1374         }
1375
1376         endpos = UTF_END(u);
1377         utf_ptr = u->text;
1378
1379         while (utf_ptr < endpos) {
1380                 /* read next unicode character */
1381
1382                 u2 c = utf_nextu2(&utf_ptr);
1383
1384                 if (c == '/')
1385                         c = '.';
1386
1387                 if ((c >= 32) && (c <= 127))
1388                         printf("%c", c);
1389                 else
1390                         printf("?");
1391         }
1392
1393         fflush(stdout);
1394 }
1395
1396
1397 /* utf_sprint_convert_to_latin1 ************************************************
1398
1399    Write utf symbol into c-string (for debugging purposes).
1400    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1401    invalid results.
1402
1403 *******************************************************************************/
1404
1405 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1406 {
1407         char *endpos;                       /* points behind utf string           */
1408         char *utf_ptr;                      /* current position in utf text       */
1409         u2 pos = 0;                         /* position in c-string               */
1410
1411         if (!u) {
1412                 strcpy(buffer, "NULL");
1413                 return;
1414         }
1415
1416         endpos = UTF_END(u);
1417         utf_ptr = u->text;
1418
1419         while (utf_ptr < endpos)
1420                 /* copy next unicode character */
1421                 buffer[pos++] = utf_nextu2(&utf_ptr);
1422
1423         /* terminate string */
1424         buffer[pos] = '\0';
1425 }
1426
1427
1428 /* utf_sprint_convert_to_latin1_classname **************************************
1429
1430    Write utf symbol into c-string with `/' converted to `.' (for debugging
1431    purposes).
1432    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1433    invalid results.
1434
1435 *******************************************************************************/
1436
1437 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1438 {
1439         char *endpos;                       /* points behind utf string           */
1440         char *utf_ptr;                      /* current position in utf text       */
1441         u2 pos = 0;                         /* position in c-string               */
1442
1443         if (!u) {
1444                 strcpy(buffer, "NULL");
1445                 return;
1446         }
1447
1448         endpos = UTF_END(u);
1449         utf_ptr = u->text;
1450
1451         while (utf_ptr < endpos) {
1452                 /* copy next unicode character */
1453                 u2 c = utf_nextu2(&utf_ptr);
1454                 if (c == '/') c = '.';
1455                 buffer[pos++] = c;
1456         }
1457
1458         /* terminate string */
1459         buffer[pos] = '\0';
1460 }
1461
1462
1463 /* utf_strcat_convert_to_latin1 ************************************************
1464
1465    Like libc strcat, but uses an utf8 string.
1466    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1467    invalid results.
1468
1469 *******************************************************************************/
1470
1471 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1472 {
1473         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1474 }
1475
1476
1477 /* utf_strcat_convert_to_latin1_classname **************************************
1478
1479    Like libc strcat, but uses an utf8 string.
1480    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1481    invalid results.
1482
1483 *******************************************************************************/
1484
1485 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1486 {
1487         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1488 }
1489
1490
1491 /* utf_fprint_printable_ascii **************************************************
1492
1493    Write utf symbol into file.
1494    Non-printable and non-ASCII characters are printed as '?'.
1495
1496 *******************************************************************************/
1497
1498 void utf_fprint_printable_ascii(FILE *file, utf *u)
1499 {
1500         char *endpos;                       /* points behind utf string           */
1501         char *utf_ptr;                      /* current position in utf text       */
1502
1503         if (!u)
1504                 return;
1505
1506         endpos = UTF_END(u);
1507         utf_ptr = u->text;
1508
1509         while (utf_ptr < endpos) {
1510                 /* read next unicode character */
1511                 u2 c = utf_nextu2(&utf_ptr);
1512
1513                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1514                 else fprintf(file, "?");
1515         }
1516 }
1517
1518
1519 /* utf_fprint_printable_ascii_classname ****************************************
1520
1521    Write utf symbol into file with `/' converted to `.'.
1522    Non-printable and non-ASCII characters are printed as '?'.
1523
1524 *******************************************************************************/
1525
1526 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1527 {
1528         char *endpos;                       /* points behind utf string           */
1529         char *utf_ptr;                      /* current position in utf text       */
1530
1531     if (!u)
1532                 return;
1533
1534         endpos = UTF_END(u);
1535         utf_ptr = u->text;
1536
1537         while (utf_ptr < endpos) {
1538                 /* read next unicode character */
1539                 u2 c = utf_nextu2(&utf_ptr);
1540                 if (c == '/') c = '.';
1541
1542                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1543                 else fprintf(file, "?");
1544         }
1545 }
1546
1547
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string.

   utf_ptr...points to first character
   end_pos...points after last character

   Rules that are enforced:
      - end_pos must not lie before utf_ptr
      - a 0x00 byte is not allowed
      - only 1-, 2- and 3-byte sequences are accepted (Java limitation)
      - every sequence must be complete and its trailing bytes must
        have the form 10xx xxxx
      - the value zero may only be encoded as a 2-byte sequence
        (Java special)

   Deliberately _not_ rejected, because Sun Java seems to allow them:
   overlong encodings of non-zero values, surrogates (0xd800..0xdfff)
   and the codepoints 0xfffe and 0xffff.

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int           remaining;            /* bytes not yet examined             */
	int           ntrail;               /* trailing bytes of current sequence */
	int           k;
	char          c;
	unsigned long codepoint;

	if (end_pos < utf_ptr)
		return false;

	remaining = end_pos - utf_ptr;

	while (remaining > 0) {
		c = *utf_ptr++;
		remaining--;

		if (c == 0)
			return false;                     /* 0x00 is not allowed */

		if (!(c & 0x80))
			continue;                         /* ASCII */

		if ((c & 0xe0) == 0xc0)
			ntrail = 1;                       /* 110x xxxx */
		else if ((c & 0xf0) == 0xe0)
			ntrail = 2;                       /* 1110 xxxx */
		else
			/* either not a leading byte at all, or the start of a */
			/* 4-, 5- or 6-byte sequence, which Java does not use  */
			return false;

		if (ntrail > remaining)
			return false;                     /* missing bytes */

		remaining -= ntrail;

		/* payload bits of the leading byte */
		codepoint = (unsigned long) c & (0x3f >> ntrail);

		for (k = 0; k < ntrail; k++) {
			c = *utf_ptr++;

			if ((c & 0xc0) != 0x80)           /* 10xx xxxx */
				return false;

			codepoint = (codepoint << 6) | (c & 0x3f);
		}

		/* zero must use the 2-byte form; other overlong encodings, */
		/* surrogates and 0xfffe/0xffff are let through on purpose  */

		if ((codepoint == 0) && (ntrail != 1))
			return false;                     /* Java special */
	}

	return true;
}
1613
1614
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters below 0x20 and the two-byte encoding of zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                       /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;                   /* disallow control characters */

		/* 0xc0 0x80 encodes zero.  Only inspect the second byte while it
		   is still inside the string, so that a string ending in 0xc0
		   never causes a read at end_pos. */

		if (c == 0xc0 && utf_ptr < end_pos
			&& (unsigned char) *utf_ptr == 0x80)
			return false;                   /* disallow zero */
	}

	return true;
}
1642
1643 bool is_valid_name_utf(utf *u)
1644 {
1645         return is_valid_name(u->text, UTF_END(u));
1646 }
1647
1648
1649 /* utf_show ********************************************************************
1650
1651    Writes the utf symbols in the utfhash to stdout and displays the
1652    number of external hash chains grouped according to the chainlength
1653    (for debugging purposes).
1654
1655 *******************************************************************************/
1656
1657 #if !defined(NDEBUG)
1658 void utf_show(void)
1659 {
1660
1661 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1662
1663         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1664         u4 max_chainlength = 0;      /* maximum length of the chains */
1665         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1666         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1667         u4 i;
1668
1669         printf("UTF-HASH:\n");
1670
1671         /* show element of utf-hashtable */
1672
1673         for (i = 0; i < hashtable_utf->size; i++) {
1674                 utf *u = hashtable_utf->ptr[i];
1675
1676                 if (u) {
1677                         printf("SLOT %d: ", (int) i);
1678
1679                         while (u) {
1680                                 printf("'");
1681                                 utf_display_printable_ascii(u);
1682                                 printf("' ");
1683                                 u = u->hashlink;
1684                         }
1685                         printf("\n");
1686                 }
1687         }
1688
1689         printf("UTF-HASH: %d slots for %d entries\n",
1690                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1691
1692         if (hashtable_utf->entries == 0)
1693                 return;
1694
1695         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1696
1697         for (i=0;i<CHAIN_LIMIT;i++)
1698                 chain_count[i]=0;
1699
1700         /* count numbers of hashchains according to their length */
1701         for (i=0; i<hashtable_utf->size; i++) {
1702
1703                 utf *u = (utf*) hashtable_utf->ptr[i];
1704                 u4 chain_length = 0;
1705
1706                 /* determine chainlength */
1707                 while (u) {
1708                         u = u->hashlink;
1709                         chain_length++;
1710                 }
1711
1712                 /* update sum of all chainlengths */
1713                 sum_chainlength+=chain_length;
1714
1715                 /* determine the maximum length of the chains */
1716                 if (chain_length>max_chainlength)
1717                         max_chainlength = chain_length;
1718
1719                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1720                 if (chain_length>=CHAIN_LIMIT) {
1721                         beyond_limit+=chain_length;
1722                         chain_length=CHAIN_LIMIT-1;
1723                 }
1724
1725                 /* update number of hashchains of current length */
1726                 chain_count[chain_length]++;
1727         }
1728
1729         /* display results */
1730         for (i=1;i<CHAIN_LIMIT-1;i++)
1731                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1732
1733         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1734
1735
1736         printf("max. chainlength:%5d\n",max_chainlength);
1737
1738         /* avg. chainlength = sum of chainlengths / number of chains */
1739         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1740 }
1741 #endif /* !defined(NDEBUG) */
1742
1743
1744 /*
1745  * These are local overrides for various environment variables in Emacs.
1746  * Please do not remove this and leave it at the end of the file, where
1747  * Emacs will automagically detect them.
1748  * ---------------------------------------------------------------------
1749  * Local variables:
1750  * mode: c
1751  * indent-tabs-mode: t
1752  * c-basic-offset: 4
1753  * tab-width: 4
1754  * End:
1755  * vim:noexpandtab:sw=4:ts=4:
1756  */