src/vm/utf8.c

   1 /* src/vm/utf8.c - utf8 string functions
   2
   3    Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
   4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
   5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
   6    J. Wenninger, Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  23    02110-1301, USA.
  24
  25    Contact: cacao@cacaojvm.org
  26
  27    Authors: Reinhard Grafl
  28             Mark Probst
  29             Andreas Krall
  30             Christian Thalinger
  31             Edwin Steiner
  32
  33    $Id: utf8.c 6244 2006-12-27 15:15:31Z twisti $
  34
  35 */
  36
  37
  38 #include "config.h"
  39
  40 #include <string.h>
  41 #include <assert.h>
  42
  43 #include "vm/types.h"
  44
  45 #include "mm/memory.h"
  46
  47 #if defined(ENABLE_THREADS)
  48 # include "threads/native/lock.h"
  49 #else
  50 # include "threads/none/lock.h"
  51 #endif
  52
  53 #include "vm/builtin.h"
  54 #include "vm/exceptions.h"
  55 #include "vm/hashtable.h"
  56 #include "vm/options.h"
  57 #include "vm/statistics.h"
  58 #include "vm/stringlocal.h"
  59 #include "vm/utf8.h"
  60
  61
  62 /* global variables ***********************************************************/
  63
  64 /* hashsize must be power of 2 */
  65
  66 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
  67
  68 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
  69
  70
  71 /* utf-symbols for pointer comparison of frequently used strings **************/
  72
  73 utf *utf_java_lang_Object;
  74
  75 utf *utf_java_lang_Class;
  76 utf *utf_java_lang_ClassLoader;
  77 utf *utf_java_lang_Cloneable;
  78 utf *utf_java_lang_SecurityManager;
  79 utf *utf_java_lang_String;
  80 utf *utf_java_lang_System;
  81 utf *utf_java_lang_ThreadGroup;
  82 utf *utf_java_io_Serializable;
  83
  84 utf *utf_java_lang_Throwable;
  85 utf *utf_java_lang_Error;
  86 utf *utf_java_lang_LinkageError;
  87 utf *utf_java_lang_NoClassDefFoundError;
  88 utf *utf_java_lang_OutOfMemoryError;
  89 utf *utf_java_lang_VirtualMachineError;
  90
  91 #if defined(ENABLE_JAVASE)
  92 utf *utf_java_lang_AbstractMethodError;
  93 utf *utf_java_lang_NoSuchMethodError;
  94 #endif
  95
  96 #if defined(WITH_CLASSPATH_GNU)
  97 utf *utf_java_lang_VMThrowable;
  98 #endif
  99
 100 utf *utf_java_lang_Exception;
 101 utf *utf_java_lang_ClassCastException;
 102 utf *utf_java_lang_ClassNotFoundException;
 103 utf *utf_java_lang_IllegalArgumentException;
 104 utf *utf_java_lang_IllegalMonitorStateException;
 105
 106 utf *utf_java_lang_NullPointerException;
 107
 108 #if defined(ENABLE_JAVASE)
 109 utf* utf_java_lang_Void;
 110 #endif
 111
 112 utf* utf_java_lang_Boolean;
 113 utf* utf_java_lang_Byte;
 114 utf* utf_java_lang_Character;
 115 utf* utf_java_lang_Short;
 116 utf* utf_java_lang_Integer;
 117 utf* utf_java_lang_Long;
 118 utf* utf_java_lang_Float;
 119 utf* utf_java_lang_Double;
 120
 121 #if defined(ENABLE_JAVASE)
 122 utf *utf_java_lang_StackTraceElement;
 123 utf *utf_java_lang_reflect_Constructor;
 124 utf *utf_java_lang_reflect_Field;
 125 utf *utf_java_lang_reflect_Method;
 126 utf *utf_java_util_Vector;
 127 #endif
 128
 129 utf *utf_InnerClasses;                  /* InnerClasses                       */
 130 utf *utf_ConstantValue;                 /* ConstantValue                      */
 131 utf *utf_Code;                          /* Code                               */
 132 utf *utf_Exceptions;                    /* Exceptions                         */
 133 utf *utf_LineNumberTable;               /* LineNumberTable                    */
 134 utf *utf_SourceFile;                    /* SourceFile                         */
 135
 136 #if defined(ENABLE_JAVASE)
 137 utf *utf_EnclosingMethod;
 138 utf *utf_Signature;
 139 utf *utf_RuntimeVisibleAnnotations;
 140 utf *utf_StackMapTable;
 141 #endif
 142
 143 utf *utf_init;                          /* <init>                             */
 144 utf *utf_clinit;                        /* <clinit>                           */
 145 utf *utf_clone;                         /* clone                              */
 146 utf *utf_finalize;                      /* finalize                           */
 147 utf *utf_run;                           /* run                                */
 148
 149 utf *utf_add;
 150 utf *utf_remove;
 151 utf *utf_addThread;
 152 utf *utf_removeThread;
 153 utf *utf_put;
 154 utf *utf_get;
 155 utf *utf_value;
 156
 157 utf *utf_fillInStackTrace;
 158 utf *utf_getSystemClassLoader;
 159 utf *utf_loadClass;
 160 utf *utf_printStackTrace;
 161
 162 utf *utf_Z;                             /* Z                                  */
 163 utf *utf_B;                             /* B                                  */
 164 utf *utf_C;                             /* C                                  */
 165 utf *utf_S;                             /* S                                  */
 166 utf *utf_I;                             /* I                                  */
 167 utf *utf_J;                             /* J                                  */
 168 utf *utf_F;                             /* F                                  */
 169 utf *utf_D;                             /* D                                  */
 170
 171 utf *utf_void__void;                    /* ()V                                */
 172 utf *utf_boolean__void;                 /* (Z)V                               */
 173 utf *utf_byte__void;                    /* (B)V                               */
 174 utf *utf_char__void;                    /* (C)V                               */
 175 utf *utf_short__void;                   /* (S)V                               */
 176 utf *utf_int__void;                     /* (I)V                               */
 177 utf *utf_long__void;                    /* (J)V                               */
 178 utf *utf_float__void;                   /* (F)V                               */
 179 utf *utf_double__void;                  /* (D)V                               */
 180
 181 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
 182 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 183 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 184 utf *utf_java_lang_Object__java_lang_Object;
 185 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 186 utf *utf_java_lang_String__java_lang_Class;
 187 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
 188 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 189
 190 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
 191 utf *utf_null;
 192 utf *array_packagename;
 193
 194
 195 /* utf_init ********************************************************************
 196
 197    Initializes the utf8 subsystem.
 198
 199 *******************************************************************************/
 200
 201 bool utf8_init(void)
 202 {
 203         /* create utf8 hashtable */
 204
 205         hashtable_utf = NEW(hashtable);
 206
 207         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
 208
 209 #if defined(ENABLE_STATISTICS)
 210         if (opt_stat)
 211                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
 212 #endif
 213
 214         /* create utf-symbols for pointer comparison of frequently used strings */
 215
 216         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 217
 218         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 219         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 220         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 221         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 222         utf_java_lang_String           = utf_new_char("java/lang/String");
 223         utf_java_lang_System           = utf_new_char("java/lang/System");
 224         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 225         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 226
 227         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
 228         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
 229
 230         utf_java_lang_LinkageError =
 231                 utf_new_char(string_java_lang_LinkageError);
 232
 233         utf_java_lang_NoClassDefFoundError =
 234                 utf_new_char(string_java_lang_NoClassDefFoundError);
 235
 236         utf_java_lang_OutOfMemoryError =
 237                 utf_new_char(string_java_lang_OutOfMemoryError);
 238
 239         utf_java_lang_VirtualMachineError =
 240                 utf_new_char(string_java_lang_VirtualMachineError);
 241
 242 #if defined(ENABLE_JAVASE)
 243         utf_java_lang_AbstractMethodError =
 244                 utf_new_char(string_java_lang_AbstractMethodError);
 245
 246         utf_java_lang_NoSuchMethodError =
 247                 utf_new_char(string_java_lang_NoSuchMethodError);
 248 #endif
 249
 250 #if defined(WITH_CLASSPATH_GNU)
 251         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
 252 #endif
 253
 254         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
 255
 256         utf_java_lang_ClassCastException =
 257                 utf_new_char(string_java_lang_ClassCastException);
 258
 259         utf_java_lang_ClassNotFoundException =
 260                 utf_new_char(string_java_lang_ClassNotFoundException);
 261
 262         utf_java_lang_IllegalArgumentException =
 263                 utf_new_char(string_java_lang_IllegalArgumentException);
 264
 265         utf_java_lang_IllegalMonitorStateException =
 266                 utf_new_char(string_java_lang_IllegalMonitorStateException);
 267
 268         utf_java_lang_NullPointerException =
 269                 utf_new_char(string_java_lang_NullPointerException);
 270
 271 #if defined(ENABLE_JAVASE)
 272         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 273 #endif
 274
 275         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 276         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 277         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 278         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 279         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 280         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 281         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 282         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 283
 284 #if defined(ENABLE_JAVASE)
 285         utf_java_lang_StackTraceElement =
 286                 utf_new_char("java/lang/StackTraceElement");
 287
 288         utf_java_lang_reflect_Constructor =
 289                 utf_new_char("java/lang/reflect/Constructor");
 290
 291         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
 292         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
 293         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 294 #endif
 295
 296         utf_InnerClasses               = utf_new_char("InnerClasses");
 297         utf_ConstantValue              = utf_new_char("ConstantValue");
 298         utf_Code                       = utf_new_char("Code");
 299         utf_Exceptions                 = utf_new_char("Exceptions");
 300         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 301         utf_SourceFile                 = utf_new_char("SourceFile");
 302
 303 #if defined(ENABLE_JAVASE)
 304         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
 305         utf_Signature                  = utf_new_char("Signature");
 306         utf_RuntimeVisibleAnnotations  = utf_new_char("RuntimeVisibleAnnotations");
 307         utf_StackMapTable              = utf_new_char("StackMapTable");
 308 #endif
 309
 310         utf_init                           = utf_new_char("<init>");
 311         utf_clinit                         = utf_new_char("<clinit>");
 312         utf_clone                      = utf_new_char("clone");
 313         utf_finalize                   = utf_new_char("finalize");
 314         utf_run                        = utf_new_char("run");
 315
 316         utf_add                        = utf_new_char("add");
 317         utf_remove                     = utf_new_char("remove");
 318         utf_addThread                  = utf_new_char("addThread");
 319         utf_removeThread               = utf_new_char("removeThread");
 320         utf_put                        = utf_new_char("put");
 321         utf_get                        = utf_new_char("get");
 322         utf_value                      = utf_new_char("value");
 323
 324         utf_printStackTrace            = utf_new_char("printStackTrace");
 325         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 326         utf_loadClass                  = utf_new_char("loadClass");
 327         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
 328
 329         utf_Z                          = utf_new_char("Z");
 330         utf_B                          = utf_new_char("B");
 331         utf_C                          = utf_new_char("C");
 332         utf_S                          = utf_new_char("S");
 333         utf_I                          = utf_new_char("I");
 334         utf_J                          = utf_new_char("J");
 335         utf_F                          = utf_new_char("F");
 336         utf_D                          = utf_new_char("D");
 337
 338         utf_void__void                 = utf_new_char("()V");
 339         utf_boolean__void              = utf_new_char("(Z)V");
 340         utf_byte__void                 = utf_new_char("(B)V");
 341         utf_char__void                 = utf_new_char("(C)V");
 342         utf_short__void                = utf_new_char("(S)V");
 343         utf_int__void                  = utf_new_char("(I)V");
 344         utf_long__void                 = utf_new_char("(J)V");
 345         utf_float__void                = utf_new_char("(F)V");
 346         utf_double__void               = utf_new_char("(D)V");
 347         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 348         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 349
 350         utf_void__java_lang_ClassLoader =
 351                 utf_new_char("()Ljava/lang/ClassLoader;");
 352
 353         utf_java_lang_Object__java_lang_Object =
 354                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
 355
 356         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 357
 358         utf_java_lang_String__java_lang_Class =
 359                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 360
 361         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
 362         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 363
 364         utf_null                       = utf_new_char("null");
 365         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 366         array_packagename              = utf_new_char("\t<the array package>");
 367
 368         /* everything's ok */
 369
 370         return true;
 371 }
 372
 373
 374 /* utf_hashkey *****************************************************************
 375
 376    The hashkey is computed from the utf-text by using up to 8
 377    characters.  For utf-symbols longer than 15 characters 3 characters
 378    are taken from the beginning and the end, 2 characters are taken
 379    from the middle.
 380
 381 *******************************************************************************/
 382
 383 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 384 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 385
 386 u4 utf_hashkey(const char *text, u4 length)
 387 {
 388         const char *start_pos = text;       /* pointer to utf text                */
 389         u4 a;
 390
 391         switch (length) {
 392         case 0: /* empty string */
 393                 return 0;
 394
 395         case 1: return fbs(0);
 396         case 2: return fbs(0) ^ nbs(3);
 397         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 398         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 399         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 400         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 401         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 402         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 403
 404         case 9:
 405                 a = fbs(0);
 406                 a ^= nbs(1);
 407                 a ^= nbs(2);
 408                 text++;
 409                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 410
 411         case 10:
 412                 a = fbs(0);
 413                 text++;
 414                 a ^= nbs(2);
 415                 a ^= nbs(3);
 416                 a ^= nbs(4);
 417                 text++;
 418                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 419
 420         case 11:
 421                 a = fbs(0);
 422                 text++;
 423                 a ^= nbs(2);
 424                 a ^= nbs(3);
 425                 a ^= nbs(4);
 426                 text++;
 427                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 428
 429         case 12:
 430                 a = fbs(0);
 431                 text += 2;
 432                 a ^= nbs(2);
 433                 a ^= nbs(3);
 434                 text++;
 435                 a ^= nbs(5);
 436                 a ^= nbs(6);
 437                 a ^= nbs(7);
 438                 text++;
 439                 return a ^ nbs(9) ^ nbs(10);
 440
 441         case 13:
 442                 a = fbs(0);
 443                 a ^= nbs(1);
 444                 text++;
 445                 a ^= nbs(3);
 446                 a ^= nbs(4);
 447                 text += 2;
 448                 a ^= nbs(7);
 449                 a ^= nbs(8);
 450                 text += 2;
 451                 return a ^ nbs(9) ^ nbs(10);
 452
 453         case 14:
 454                 a = fbs(0);
 455                 text += 2;
 456                 a ^= nbs(3);
 457                 a ^= nbs(4);
 458                 text += 2;
 459                 a ^= nbs(7);
 460                 a ^= nbs(8);
 461                 text += 2;
 462                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 463
 464         case 15:
 465                 a = fbs(0);
 466                 text += 2;
 467                 a ^= nbs(3);
 468                 a ^= nbs(4);
 469                 text += 2;
 470                 a ^= nbs(7);
 471                 a ^= nbs(8);
 472                 text += 2;
 473                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 474
 475         default:  /* 3 characters from beginning */
 476                 a = fbs(0);
 477                 text += 2;
 478                 a ^= nbs(3);
 479                 a ^= nbs(4);
 480
 481                 /* 2 characters from middle */
 482                 text = start_pos + (length / 2);
 483                 a ^= fbs(5);
 484                 text += 2;
 485                 a ^= nbs(6);
 486
 487                 /* 3 characters from end */
 488                 text = start_pos + length - 4;
 489
 490                 a ^= fbs(7);
 491                 text++;
 492
 493                 return a ^ nbs(10) ^ nbs(11);
 494     }
 495 }
 496
 497 /* utf_full_hashkey ************************************************************
 498
 499    This function computes a hash value using all bytes in the string.
 500
 501    The algorithm is the "One-at-a-time" algorithm as published
 502    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
 503
 504 *******************************************************************************/
 505
 506 u4 utf_full_hashkey(const char *text, u4 length)
 507 {
 508         register const unsigned char *p = (const unsigned char *) text;
 509         register u4 hash;
 510         register u4 i;
 511
 512         hash = 0;
 513         for (i=length; i--;)
 514         {
 515             hash += *p++;
 516             hash += (hash << 10);
 517             hash ^= (hash >> 6);
 518         }
 519         hash += (hash << 3);
 520         hash ^= (hash >> 11);
 521         hash += (hash << 15);
 522
 523         return hash;
 524 }
 525
 526 /* unicode_hashkey *************************************************************
 527
 528    Compute the hashkey of a unicode string.
 529
 530 *******************************************************************************/
 531
 532 u4 unicode_hashkey(u2 *text, u2 len)
 533 {
 534         return utf_hashkey((char *) text, len);
 535 }
 536
 537
 538 /* utf_new *********************************************************************
 539
 540    Creates a new utf-symbol, the text of the symbol is passed as a
 541    u1-array. The function searches the utf-hashtable for a utf-symbol
 542    with this text. On success the element returned, otherwise a new
 543    hashtable element is created.
 544
 545    If the number of entries in the hashtable exceeds twice the size of
 546    the hashtable slots a reorganization of the hashtable is done and
 547    the utf symbols are copied to a new hashtable with doubled size.
 548
 549 *******************************************************************************/
 550
 551 utf *utf_new(const char *text, u2 length)
 552 {
 553         u4 key;                             /* hashkey computed from utf-text     */
 554         u4 slot;                            /* slot in hashtable                  */
 555         utf *u;                             /* hashtable element                  */
 556         u2 i;
 557
 558         LOCK_MONITOR_ENTER(hashtable_utf->header);
 559
 560 #if defined(ENABLE_STATISTICS)
 561         if (opt_stat)
 562                 count_utf_new++;
 563 #endif
 564
 565         key  = utf_hashkey(text, length);
 566         slot = key & (hashtable_utf->size - 1);
 567         u    = hashtable_utf->ptr[slot];
 568
 569         /* search external hash chain for utf-symbol */
 570
 571         while (u) {
 572                 if (u->blength == length) {
 573                         /* compare text of hashtable elements */
 574
 575                         for (i = 0; i < length; i++)
 576                                 if (text[i] != u->text[i])
 577                                         goto nomatch;
 578
 579 #if defined(ENABLE_STATISTICS)
 580                         if (opt_stat)
 581                                 count_utf_new_found++;
 582 #endif
 583
 584                         /* symbol found in hashtable */
 585
 586                         LOCK_MONITOR_EXIT(hashtable_utf->header);
 587
 588                         return u;
 589                 }
 590
 591         nomatch:
 592                 u = u->hashlink; /* next element in external chain */
 593         }
 594
 595 #if defined(ENABLE_STATISTICS)
 596         if (opt_stat)
 597                 count_utf_len += sizeof(utf) + length + 1;
 598 #endif
 599
 600         /* location in hashtable found, create new utf element */
 601         u = NEW(utf);
 602         u->blength  = length;               /* length in bytes of utfstring       */
 603         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
 604         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 605
 606         memcpy(u->text, text, length);      /* copy utf-text                      */
 607         u->text[length] = '\0';
 608
 609         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
 610         hashtable_utf->entries++;           /* update number of entries           */
 611
 612         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
 613
 614         /* reorganization of hashtable, average length of the external
 615            chains is approx. 2 */
 616
 617                 hashtable *newhash;                              /* the new hashtable */
 618                 u4         i;
 619                 utf       *u;
 620                 utf       *nextu;
 621                 u4         slot;
 622
 623                 /* create new hashtable, double the size */
 624
 625                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
 626
 627 #if defined(ENABLE_STATISTICS)
 628                 if (opt_stat)
 629                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
 630 #endif
 631
 632                 /* transfer elements to new hashtable */
 633
 634                 for (i = 0; i < hashtable_utf->size; i++) {
 635                         u = hashtable_utf->ptr[i];
 636
 637                         while (u) {
 638                                 nextu = u->hashlink;
 639                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
 640
 641                                 u->hashlink = (utf *) newhash->ptr[slot];
 642                                 newhash->ptr[slot] = u;
 643
 644                                 /* follow link in external hash chain */
 645
 646                                 u = nextu;
 647                         }
 648                 }
 649
 650                 /* dispose old table */
 651
 652                 hashtable_free(hashtable_utf);
 653
 654                 hashtable_utf = newhash;
 655         }
 656
 657         LOCK_MONITOR_EXIT(hashtable_utf->header);
 658
 659         return u;
 660 }
 661
 662
 663 /* utf_new_u2 ******************************************************************
 664
 665    Make utf symbol from u2 array, if isclassname is true '.' is
 666    replaced by '/'.
 667
 668 *******************************************************************************/
 669
 670 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 671 {
 672         char *buffer;                   /* memory buffer for  unicode characters  */
 673         char *pos;                      /* pointer to current position in buffer  */
 674         u4 left;                        /* unicode characters left                */
 675         u4 buflength;                   /* utf length in bytes of the u2 array    */
 676         utf *result;                    /* resulting utf-string                   */
 677         int i;
 678
 679         /* determine utf length in bytes and allocate memory */
 680
 681         buflength = u2_utflength(unicode_pos, unicode_length);
 682         buffer    = MNEW(char, buflength);
 683
 684         left = buflength;
 685         pos  = buffer;
 686
 687         for (i = 0; i++ < unicode_length; unicode_pos++) {
 688                 /* next unicode character */
 689                 u2 c = *unicode_pos;
 690
 691                 if ((c != 0) && (c < 0x80)) {
 692                         /* 1 character */
 693                         left--;
 694                 if ((int) left < 0) break;
 695                         /* convert classname */
 696                         if (isclassname && c == '.')
 697                                 *pos++ = '/';
 698                         else
 699                                 *pos++ = (char) c;
 700
 701                 } else if (c < 0x800) {
 702                         /* 2 characters */
 703                 unsigned char high = c >> 6;
 704                 unsigned char low  = c & 0x3F;
 705                         left = left - 2;
 706                 if ((int) left < 0) break;
 707                 *pos++ = high | 0xC0;
 708                 *pos++ = low  | 0x80;
 709
 710                 } else {
 711                 /* 3 characters */
 712                 char low  = c & 0x3f;
 713                 char mid  = (c >> 6) & 0x3F;
 714                 char high = c >> 12;
 715                         left = left - 3;
 716                 if ((int) left < 0) break;
 717                 *pos++ = high | 0xE0;
 718                 *pos++ = mid  | 0x80;
 719                 *pos++ = low  | 0x80;
 720                 }
 721         }
 722
 723         /* insert utf-string into symbol-table */
 724         result = utf_new(buffer,buflength);
 725
 726         MFREE(buffer, char, buflength);
 727
 728         return result;
 729 }
 730
 731
 732 /* utf_new_char ****************************************************************
 733
 734    Creates a new utf symbol, the text for this symbol is passed as a
 735    c-string ( = char* ).
 736
 737 *******************************************************************************/
 738
 739 utf *utf_new_char(const char *text)
 740 {
 741         return utf_new(text, strlen(text));
 742 }
 743
 744
 745 /* utf_new_char_classname ******************************************************
 746
 747    Creates a new utf symbol, the text for this symbol is passed as a
 748    c-string ( = char* ) "." characters are going to be replaced by
 749    "/". Since the above function is used often, this is a separte
 750    function, instead of an if.
 751
 752 *******************************************************************************/
 753
 754 utf *utf_new_char_classname(const char *text)
 755 {
 756         if (strchr(text, '.')) {
 757                 char *txt = strdup(text);
 758                 char *end = txt + strlen(txt);
 759                 char *c;
 760                 utf *tmpRes;
 761
 762                 for (c = txt; c < end; c++)
 763                         if (*c == '.') *c = '/';
 764
 765                 tmpRes = utf_new(txt, strlen(txt));
 766                 FREE(txt, 0);
 767
 768                 return tmpRes;
 769
 770         } else
 771                 return utf_new(text, strlen(text));
 772 }
 773
 774
 775 /* utf_nextu2 ******************************************************************
 776
 777    Read the next unicode character from the utf string and increment
 778    the utf-string pointer accordingly.
 779
 780    CAUTION: This function is unsafe for input that was not checked
 781             by is_valid_utf!
 782
 783 *******************************************************************************/
 784
 785 u2 utf_nextu2(char **utf_ptr)
 786 {
 787     /* uncompressed unicode character */
 788     u2 unicode_char = 0;
 789     /* current position in utf text */
 790     unsigned char *utf = (unsigned char *) (*utf_ptr);
 791     /* bytes representing the unicode character */
 792     unsigned char ch1, ch2, ch3;
 793     /* number of bytes used to represent the unicode character */
 794     int len = 0;
 795
 796     switch ((ch1 = utf[0]) >> 4) {
 797         default: /* 1 byte */
 798                 (*utf_ptr)++;
 799                 return (u2) ch1;
 800         case 0xC:
 801         case 0xD: /* 2 bytes */
 802                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 803                         unsigned char high = ch1 & 0x1F;
 804                         unsigned char low  = ch2 & 0x3F;
 805                         unicode_char = (high << 6) + low;
 806                         len = 2;
 807                 }
 808                 break;
 809
 810         case 0xE: /* 2 or 3 bytes */
 811                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 812                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 813                                 unsigned char low  = ch3 & 0x3f;
 814                                 unsigned char mid  = ch2 & 0x3f;
 815                                 unsigned char high = ch1 & 0x0f;
 816                                 unicode_char = (((high << 6) + mid) << 6) + low;
 817                                 len = 3;
 818                         } else
 819                                 len = 2;
 820                 }
 821                 break;
 822     }
 823
 824     /* update position in utf-text */
 825     *utf_ptr = (char *) (utf + len);
 826
 827     return unicode_char;
 828 }
 829
 830
 831 /* utf_bytes *******************************************************************
 832
 833    Determine number of bytes (aka. octets) in the utf string.
 834
 835    IN:
 836       u............utf string
 837
 838    OUT:
 839       The number of octets of this utf string.
 840           There is _no_ terminating zero included in this count.
 841
 842 *******************************************************************************/
 843
 844 u4 utf_bytes(utf *u)
 845 {
 846         return u->blength;
 847 }
 848
 849 /* utf_get_number_of_u2s_for_buffer ********************************************
 850
 851    Determine number of UTF-16 u2s in the given UTF-8 buffer
 852
 853    CAUTION: This function is unsafe for input that was not checked
 854             by is_valid_utf!
 855
 856    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
 857    to an array of u2s (UTF-16) and want to know how many of them you will get.
 858    All other uses of this function are probably wrong.
 859
 860    IN:
 861       buffer........points to first char in buffer
 862           blength.......number of _bytes_ in the buffer
 863
 864    OUT:
 865       the number of u2s needed to hold this string in UTF-16 encoding.
 866           There is _no_ terminating zero included in this count.
 867
 868    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
 869    exception.
 870
 871 *******************************************************************************/
 872
 873 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
 874 {
 875         const char *endpos;                 /* points behind utf string           */
 876         const char *utf_ptr;                /* current position in utf text       */
 877         u4 len = 0;                         /* number of unicode characters       */
 878
 879         utf_ptr = buffer;
 880         endpos = utf_ptr + blength;
 881
 882         while (utf_ptr < endpos) {
 883                 len++;
 884                 /* next unicode character */
 885                 utf_nextu2((char **)&utf_ptr);
 886         }
 887
 888         assert(utf_ptr == endpos);
 889
 890         return len;
 891 }
 892
 893
 894 /* utf_get_number_of_u2s *******************************************************
 895
 896    Determine number of UTF-16 u2s in the utf string.
 897
 898    CAUTION: This function is unsafe for input that was not checked
 899             by is_valid_utf!
 900
 901    CAUTION: Use this function *only* when you want to convert a utf string
 902    to an array of u2s and want to know how many of them you will get.
 903    All other uses of this function are probably wrong.
 904
 905    IN:
 906       u............utf string
 907
 908    OUT:
 909       the number of u2s needed to hold this string in UTF-16 encoding.
 910           There is _no_ terminating zero included in this count.
 911           XXX 0 if a NullPointerException has been thrown (see below)
 912
 913 *******************************************************************************/
 914
 915 u4 utf_get_number_of_u2s(utf *u)
 916 {
 917         char *endpos;                       /* points behind utf string           */
 918         char *utf_ptr;                      /* current position in utf text       */
 919         u4 len = 0;                         /* number of unicode characters       */
 920
 921         /* XXX this is probably not checked by most callers! Review this after */
 922         /* the invalid uses of this function have been eliminated */
 923         if (!u) {
 924                 exceptions_throw_nullpointerexception();
 925                 return 0;
 926         }
 927
 928         endpos = UTF_END(u);
 929         utf_ptr = u->text;
 930
 931         while (utf_ptr < endpos) {
 932                 len++;
 933                 /* next unicode character */
 934                 utf_nextu2(&utf_ptr);
 935         }
 936
 937         if (utf_ptr != endpos)
 938                 /* string ended abruptly */
 939                 throw_cacao_exception_exit(string_java_lang_InternalError,
 940                                                                    "Illegal utf8 string");
 941
 942         return len;
 943 }
 944
 945
 946 /* utf8_safe_number_of_u2s *****************************************************
 947
 948    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
 949    (For invalid UTF-8 the U+fffd replacement character will be counted.)
 950
 951    This function is safe even for invalid UTF-8 strings.
 952
 953    IN:
 954       text..........zero-terminated(!) UTF-8 string (may be invalid)
 955                         must NOT be NULL
 956           nbytes........strlen(text). (This is needed to completely emulate
 957                         the RI).
 958
 959    OUT:
 960       the number of u2s needed to hold this string in UTF-16 encoding.
 961           There is _no_ terminating zero included in this count.
 962
 963 *******************************************************************************/
 964
 965 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
 966         register const unsigned char *t;
 967         register s4 byte;
 968         register s4 len;
 969         register const unsigned char *tlimit;
 970         s4 byte1;
 971         s4 byte2;
 972         s4 byte3;
 973         s4 value;
 974         s4 skip;
 975
 976         assert(text);
 977         assert(nbytes >= 0);
 978
 979         len = 0;
 980         t = (const unsigned char *) text;
 981         tlimit = t + nbytes;
 982
 983         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
 984
 985         while (1) {
 986                 byte = *t++;
 987
 988                 if (byte & 0x80) {
 989                         /* highest bit set, non-ASCII character */
 990
 991                         if ((byte & 0xe0) == 0xc0) {
 992                                 /* 2-byte: should be 110..... 10...... ? */
 993
 994                                 if ((*t++ & 0xc0) == 0x80)
 995                                         ; /* valid 2-byte */
 996                                 else
 997                                         t--; /* invalid */
 998                         }
 999                         else if ((byte & 0xf0) == 0xe0) {
1000                                 /* 3-byte: should be 1110.... 10...... 10...... */
1001                                 /*                            ^t                */
1002
1003                                 if (t + 2 > tlimit)
1004                                         return len + 1; /* invalid, stop here */
1005
1006                                 if ((*t++ & 0xc0) == 0x80) {
1007                                         if ((*t++ & 0xc0) == 0x80)
1008                                                 ; /* valid 3-byte */
1009                                         else
1010                                                 t--; /* invalid */
1011                                 }
1012                                 else
1013                                         t--; /* invalid */
1014                         }
1015                         else if ((byte & 0xf8) == 0xf0) {
1016                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1017                                 /*                            ^t                         */
1018
1019                                 if (t + 3 > tlimit)
1020                                         return len + 1; /* invalid, stop here */
1021
1022                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1023                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1024                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1025                                                         /* valid 4-byte UTF-8? */
1026                                                         value = ((byte  & 0x07) << 18)
1027                                                                   | ((byte1 & 0x3f) << 12)
1028                                                                   | ((byte2 & 0x3f) <<  6)
1029                                                                   | ((byte3 & 0x3f)      );
1030
1031                                                         if (value > 0x10FFFF)
1032                                                                 ; /* invalid */
1033                                                         else if (value > 0xFFFF)
1034                                                                 len += 1; /* we need surrogates */
1035                                                         else
1036                                                                 ; /* 16bit suffice */
1037                                                 }
1038                                                 else
1039                                                         t--; /* invalid */
1040                                         }
1041                                         else
1042                                                 t--; /* invalid */
1043                                 }
1044                                 else
1045                                         t--; /* invalid */
1046                         }
1047                         else if ((byte & 0xfc) == 0xf8) {
1048                                 /* invalid 5-byte */
1049                                 if (t + 4 > tlimit)
1050                                         return len + 1; /* invalid, stop here */
1051
1052                                 skip = 4;
1053                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1054                                         t++;
1055                         }
1056                         else if ((byte & 0xfe) == 0xfc) {
1057                                 /* invalid 6-byte */
1058                                 if (t + 5 > tlimit)
1059                                         return len + 1; /* invalid, stop here */
1060
1061                                 skip = 5;
1062                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1063                                         t++;
1064                         }
1065                         else
1066                                 ; /* invalid */
1067                 }
1068                 else {
1069                         /* NUL */
1070
1071                         if (byte == 0)
1072                                 break;
1073
1074                         /* ASCII character, common case */
1075                 }
1076
1077                 len++;
1078         }
1079
1080         return len;
1081 }
1082
1083
1084 /* utf8_safe_convert_to_u2s ****************************************************
1085
1086    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1087    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1088    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1089
1090    This function is safe even for invalid UTF-8 strings.
1091
1092    IN:
1093       text..........zero-terminated(!) UTF-8 string (may be invalid)
1094                         must NOT be NULL
1095           nbytes........strlen(text). (This is needed to completely emulate
1096                                         the RI).
1097           buffer........a preallocated array of u2s to receive the decoded
1098                         string. Use utf8_safe_number_of_u2s to get the
1099                                         required number of u2s for allocating this.
1100
1101 *******************************************************************************/
1102
1103 #define UNICODE_REPLACEMENT  0xfffd
1104
1105 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1106         register const unsigned char *t;
1107         register s4 byte;
1108         register const unsigned char *tlimit;
1109         s4 byte1;
1110         s4 byte2;
1111         s4 byte3;
1112         s4 value;
1113         s4 skip;
1114
1115         assert(text);
1116         assert(nbytes >= 0);
1117
1118         t = (const unsigned char *) text;
1119         tlimit = t + nbytes;
1120
1121         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1122
1123         while (1) {
1124                 byte = *t++;
1125
1126                 if (byte & 0x80) {
1127                         /* highest bit set, non-ASCII character */
1128
1129                         if ((byte & 0xe0) == 0xc0) {
1130                                 /* 2-byte: should be 110..... 10...... */
1131
1132                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1133                                         /* valid 2-byte UTF-8 */
1134                                         *buffer++ = ((byte  & 0x1f) << 6)
1135                                                           | ((byte1 & 0x3f)     );
1136                                 }
1137                                 else {
1138                                         *buffer++ = UNICODE_REPLACEMENT;
1139                                         t--;
1140                                 }
1141                         }
1142                         else if ((byte & 0xf0) == 0xe0) {
1143                                 /* 3-byte: should be 1110.... 10...... 10...... */
1144
1145                                 if (t + 2 > tlimit) {
1146                                         *buffer++ = UNICODE_REPLACEMENT;
1147                                         return;
1148                                 }
1149
1150                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1151                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1152                                                 /* valid 3-byte UTF-8 */
1153                                                 *buffer++ = ((byte  & 0x0f) << 12)
1154                                                                   | ((byte1 & 0x3f) <<  6)
1155                                                                   | ((byte2 & 0x3f)      );
1156                                         }
1157                                         else {
1158                                                 *buffer++ = UNICODE_REPLACEMENT;
1159                                                 t--;
1160                                         }
1161                                 }
1162                                 else {
1163                                         *buffer++ = UNICODE_REPLACEMENT;
1164                                         t--;
1165                                 }
1166                         }
1167                         else if ((byte & 0xf8) == 0xf0) {
1168                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1169
1170                                 if (t + 3 > tlimit) {
1171                                         *buffer++ = UNICODE_REPLACEMENT;
1172                                         return;
1173                                 }
1174
1175                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1176                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1177                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1178                                                         /* valid 4-byte UTF-8? */
1179                                                         value = ((byte  & 0x07) << 18)
1180                                                                   | ((byte1 & 0x3f) << 12)
1181                                                                   | ((byte2 & 0x3f) <<  6)
1182                                                                   | ((byte3 & 0x3f)      );
1183
1184                                                         if (value > 0x10FFFF) {
1185                                                                 *buffer++ = UNICODE_REPLACEMENT;
1186                                                         }
1187                                                         else if (value > 0xFFFF) {
1188                                                                 /* we need surrogates */
1189                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1190                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1191                                                         }
1192                                                         else
1193                                                                 *buffer++ = value; /* 16bit suffice */
1194                                                 }
1195                                                 else {
1196                                                         *buffer++ = UNICODE_REPLACEMENT;
1197                                                         t--;
1198                                                 }
1199                                         }
1200                                         else {
1201                                                 *buffer++ = UNICODE_REPLACEMENT;
1202                                                 t--;
1203                                         }
1204                                 }
1205                                 else {
1206                                         *buffer++ = UNICODE_REPLACEMENT;
1207                                         t--;
1208                                 }
1209                         }
1210                         else if ((byte & 0xfc) == 0xf8) {
1211                                 if (t + 4 > tlimit) {
1212                                         *buffer++ = UNICODE_REPLACEMENT;
1213                                         return;
1214                                 }
1215
1216                                 skip = 4;
1217                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1218                                         t++;
1219                                 *buffer++ = UNICODE_REPLACEMENT;
1220                         }
1221                         else if ((byte & 0xfe) == 0xfc) {
1222                                 if (t + 5 > tlimit) {
1223                                         *buffer++ = UNICODE_REPLACEMENT;
1224                                         return;
1225                                 }
1226
1227                                 skip = 5;
1228                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1229                                         t++;
1230                                 *buffer++ = UNICODE_REPLACEMENT;
1231                         }
1232                         else
1233                                 *buffer++ = UNICODE_REPLACEMENT;
1234                 }
1235                 else {
1236                         /* NUL */
1237
1238                         if (byte == 0)
1239                                 break;
1240
1241                         /* ASCII character, common case */
1242
1243                         *buffer++ = byte;
1244                 }
1245         }
1246 }
1247
1248
1249 /* u2_utflength ****************************************************************
1250
1251    Returns the utf length in bytes of a u2 array.
1252
1253 *******************************************************************************/
1254
1255 u4 u2_utflength(u2 *text, u4 u2_length)
1256 {
1257         u4 result_len = 0;                  /* utf length in bytes                */
1258         u2 ch;                              /* current unicode character          */
1259         u4 len;
1260
1261         for (len = 0; len < u2_length; len++) {
1262                 /* next unicode character */
1263                 ch = *text++;
1264
1265                 /* determine bytes required to store unicode character as utf */
1266                 if (ch && (ch < 0x80))
1267                         result_len++;
1268                 else if (ch < 0x800)
1269                         result_len += 2;
1270                 else
1271                         result_len += 3;
1272         }
1273
1274     return result_len;
1275 }
1276
1277
1278 /* utf_copy ********************************************************************
1279
1280    Copy the given utf string byte-for-byte to a buffer.
1281
1282    IN:
1283       buffer.......the buffer
1284           u............the utf string
1285
1286 *******************************************************************************/
1287
1288 void utf_copy(char *buffer, utf *u)
1289 {
1290         /* our utf strings are zero-terminated (done by utf_new) */
1291         MCOPY(buffer, u->text, char, u->blength + 1);
1292 }
1293
1294
1295 /* utf_cat *********************************************************************
1296
1297    Append the given utf string byte-for-byte to a buffer.
1298
1299    IN:
1300       buffer.......the buffer
1301           u............the utf string
1302
1303 *******************************************************************************/
1304
1305 void utf_cat(char *buffer, utf *u)
1306 {
1307         /* our utf strings are zero-terminated (done by utf_new) */
1308         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1309 }
1310
1311
1312 /* utf_copy_classname **********************************************************
1313
1314    Copy the given utf classname byte-for-byte to a buffer.
1315    '/' is replaced by '.'
1316
1317    IN:
1318       buffer.......the buffer
1319           u............the utf string
1320
1321 *******************************************************************************/
1322
1323 void utf_copy_classname(char *buffer, utf *u)
1324 {
1325         char *bufptr;
1326         char *srcptr;
1327         char *endptr;
1328         char ch;
1329
1330         bufptr = buffer;
1331         srcptr = u->text;
1332         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1333
1334         while (srcptr != endptr) {
1335                 ch = *srcptr++;
1336                 if (ch == '/')
1337                         ch = '.';
1338                 *bufptr++ = ch;
1339         }
1340 }
1341
1342
1343 /* utf_cat *********************************************************************
1344
1345    Append the given utf classname byte-for-byte to a buffer.
1346    '/' is replaced by '.'
1347
1348    IN:
1349       buffer.......the buffer
1350           u............the utf string
1351
1352 *******************************************************************************/
1353
1354 void utf_cat_classname(char *buffer, utf *u)
1355 {
1356         utf_copy_classname(buffer + strlen(buffer), u);
1357 }
1358
1359 /* utf_display_printable_ascii *************************************************
1360
1361    Write utf symbol to stdout (for debugging purposes).
1362    Non-printable and non-ASCII characters are printed as '?'.
1363
1364 *******************************************************************************/
1365
1366 void utf_display_printable_ascii(utf *u)
1367 {
1368         char *endpos;                       /* points behind utf string           */
1369         char *utf_ptr;                      /* current position in utf text       */
1370
1371         if (u == NULL) {
1372                 printf("NULL");
1373                 fflush(stdout);
1374                 return;
1375         }
1376
1377         endpos = UTF_END(u);
1378         utf_ptr = u->text;
1379
1380         while (utf_ptr < endpos) {
1381                 /* read next unicode character */
1382
1383                 u2 c = utf_nextu2(&utf_ptr);
1384
1385                 if ((c >= 32) && (c <= 127))
1386                         printf("%c", c);
1387                 else
1388                         printf("?");
1389         }
1390
1391         fflush(stdout);
1392 }
1393
1394
1395 /* utf_display_printable_ascii_classname ***************************************
1396
1397    Write utf symbol to stdout with `/' converted to `.' (for debugging
1398    purposes).
1399    Non-printable and non-ASCII characters are printed as '?'.
1400
1401 *******************************************************************************/
1402
1403 void utf_display_printable_ascii_classname(utf *u)
1404 {
1405         char *endpos;                       /* points behind utf string           */
1406         char *utf_ptr;                      /* current position in utf text       */
1407
1408         if (u == NULL) {
1409                 printf("NULL");
1410                 fflush(stdout);
1411                 return;
1412         }
1413
1414         endpos = UTF_END(u);
1415         utf_ptr = u->text;
1416
1417         while (utf_ptr < endpos) {
1418                 /* read next unicode character */
1419
1420                 u2 c = utf_nextu2(&utf_ptr);
1421
1422                 if (c == '/')
1423                         c = '.';
1424
1425                 if ((c >= 32) && (c <= 127))
1426                         printf("%c", c);
1427                 else
1428                         printf("?");
1429         }
1430
1431         fflush(stdout);
1432 }
1433
1434
1435 /* utf_sprint_convert_to_latin1 ************************************************
1436
1437    Write utf symbol into c-string (for debugging purposes).
1438    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1439    invalid results.
1440
1441 *******************************************************************************/
1442
1443 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1444 {
1445         char *endpos;                       /* points behind utf string           */
1446         char *utf_ptr;                      /* current position in utf text       */
1447         u2 pos = 0;                         /* position in c-string               */
1448
1449         if (!u) {
1450                 strcpy(buffer, "NULL");
1451                 return;
1452         }
1453
1454         endpos = UTF_END(u);
1455         utf_ptr = u->text;
1456
1457         while (utf_ptr < endpos)
1458                 /* copy next unicode character */
1459                 buffer[pos++] = utf_nextu2(&utf_ptr);
1460
1461         /* terminate string */
1462         buffer[pos] = '\0';
1463 }
1464
1465
1466 /* utf_sprint_convert_to_latin1_classname **************************************
1467
1468    Write utf symbol into c-string with `/' converted to `.' (for debugging
1469    purposes).
1470    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1471    invalid results.
1472
1473 *******************************************************************************/
1474
1475 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1476 {
1477         char *endpos;                       /* points behind utf string           */
1478         char *utf_ptr;                      /* current position in utf text       */
1479         u2 pos = 0;                         /* position in c-string               */
1480
1481         if (!u) {
1482                 strcpy(buffer, "NULL");
1483                 return;
1484         }
1485
1486         endpos = UTF_END(u);
1487         utf_ptr = u->text;
1488
1489         while (utf_ptr < endpos) {
1490                 /* copy next unicode character */
1491                 u2 c = utf_nextu2(&utf_ptr);
1492                 if (c == '/') c = '.';
1493                 buffer[pos++] = c;
1494         }
1495
1496         /* terminate string */
1497         buffer[pos] = '\0';
1498 }
1499
1500
1501 /* utf_strcat_convert_to_latin1 ************************************************
1502
1503    Like libc strcat, but uses an utf8 string.
1504    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1505    invalid results.
1506
1507 *******************************************************************************/
1508
1509 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1510 {
1511         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1512 }
1513
1514
1515 /* utf_strcat_convert_to_latin1_classname **************************************
1516
1517    Like libc strcat, but uses an utf8 string.
1518    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1519    invalid results.
1520
1521 *******************************************************************************/
1522
1523 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1524 {
1525         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1526 }
1527
1528
1529 /* utf_fprint_printable_ascii **************************************************
1530
1531    Write utf symbol into file.
1532    Non-printable and non-ASCII characters are printed as '?'.
1533
1534 *******************************************************************************/
1535
1536 void utf_fprint_printable_ascii(FILE *file, utf *u)
1537 {
1538         char *endpos;                       /* points behind utf string           */
1539         char *utf_ptr;                      /* current position in utf text       */
1540
1541         if (!u)
1542                 return;
1543
1544         endpos = UTF_END(u);
1545         utf_ptr = u->text;
1546
1547         while (utf_ptr < endpos) {
1548                 /* read next unicode character */
1549                 u2 c = utf_nextu2(&utf_ptr);
1550
1551                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1552                 else fprintf(file, "?");
1553         }
1554 }
1555
1556
1557 /* utf_fprint_printable_ascii_classname ****************************************
1558
1559    Write utf symbol into file with `/' converted to `.'.
1560    Non-printable and non-ASCII characters are printed as '?'.
1561
1562 *******************************************************************************/
1563
1564 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1565 {
1566         char *endpos;                       /* points behind utf string           */
1567         char *utf_ptr;                      /* current position in utf text       */
1568
1569     if (!u)
1570                 return;
1571
1572         endpos = UTF_END(u);
1573         utf_ptr = u->text;
1574
1575         while (utf_ptr < endpos) {
1576                 /* read next unicode character */
1577                 u2 c = utf_nextu2(&utf_ptr);
1578                 if (c == '/') c = '.';
1579
1580                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1581                 else fprintf(file, "?");
1582         }
1583 }
1584
1585
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string in the
   sense accepted here:

   - a 0x00 byte is never allowed,
   - only one-, two- and three-byte sequences are accepted (longer
     forms are rejected as a Java limitation),
   - every continuation byte must have the form 10xx xxxx,
   - the value zero may only be encoded with the two-byte form.

   Overlong encodings (other than of zero), surrogates and the
   codepoints 0xfffe/0xffff are deliberately NOT rejected.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	const char    *cursor;              /* next byte to examine               */
	int            remaining;           /* bytes not yet consumed             */
	int            trailing;            /* continuation bytes of a sequence   */
	int            k;
	unsigned char  octet;
	unsigned long  value;               /* decoded value of a sequence        */

	if (end_pos < utf_ptr)
		return false;

	cursor    = utf_ptr;
	remaining = end_pos - utf_ptr;

	while (remaining > 0) {
		octet = (unsigned char) *cursor++;
		remaining--;

		/* 0x00 is not allowed */

		if (octet == 0x00)
			return false;

		/* plain ASCII needs no further checks */

		if (octet < 0x80)
			continue;

		/* Classify the leading byte. Everything that is neither         */
		/* 110x xxxx nor 1110 xxxx is rejected: stray continuation       */
		/* bytes, 0xfe/0xff, and the four- to six-byte forms.            */

		if ((octet & 0xe0) == 0xc0)
			trailing = 1;
		else if ((octet & 0xf0) == 0xe0)
			trailing = 2;
		else
			return false;

		/* the sequence must fit completely into the given range */

		if (remaining < trailing)
			return false;

		remaining -= trailing;

		/* payload bits of the leading byte */

		value = octet & (0x3f >> trailing);

		for (k = 0; k < trailing; k++) {
			octet = (unsigned char) *cursor++;

			if ((octet & 0xc0) != 0x80)         /* 10xx xxxx */
				return false;

			value = (value << 6) | (octet & 0x3f);
		}

		/* Java special: zero must be written with the two-byte form */

		if (value == 0 && trailing != 1)
			return false;
	}

	return true;
}
1651
1652
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the two-byte encoding of zero.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Only bytes inside [utf_ptr, end_pos) are examined; the byte at
   end_pos itself is never read.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                       /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20)
			return false;                   /* disallow control characters */

		/* 0xc0 0x80 is the two-byte encoding of zero. The second byte  */
		/* is only inspected if it still lies inside the given range,   */
		/* so a trailing 0xc0 cannot cause a read behind the string.    */

		if (c == 0xc0 && utf_ptr < end_pos
			&& (unsigned char) *utf_ptr == 0x80)
			return false;                   /* disallow zero */
	}

	return true;
}
1680
1681 bool is_valid_name_utf(utf *u)
1682 {
1683         return is_valid_name(u->text, UTF_END(u));
1684 }
1685
1686
1687 /* utf_show ********************************************************************
1688
1689    Writes the utf symbols in the utfhash to stdout and displays the
1690    number of external hash chains grouped according to the chainlength
1691    (for debugging purposes).
1692
1693 *******************************************************************************/
1694
1695 #if !defined(NDEBUG)
1696 void utf_show(void)
1697 {
1698
1699 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1700
1701         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1702         u4 max_chainlength = 0;      /* maximum length of the chains */
1703         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1704         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1705         u4 i;
1706
1707         printf("UTF-HASH:\n");
1708
1709         /* show element of utf-hashtable */
1710
1711         for (i = 0; i < hashtable_utf->size; i++) {
1712                 utf *u = hashtable_utf->ptr[i];
1713
1714                 if (u) {
1715                         printf("SLOT %d: ", (int) i);
1716
1717                         while (u) {
1718                                 printf("'");
1719                                 utf_display_printable_ascii(u);
1720                                 printf("' ");
1721                                 u = u->hashlink;
1722                         }
1723                         printf("\n");
1724                 }
1725         }
1726
1727         printf("UTF-HASH: %d slots for %d entries\n",
1728                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1729
1730         if (hashtable_utf->entries == 0)
1731                 return;
1732
1733         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1734
1735         for (i=0;i<CHAIN_LIMIT;i++)
1736                 chain_count[i]=0;
1737
1738         /* count numbers of hashchains according to their length */
1739         for (i=0; i<hashtable_utf->size; i++) {
1740
1741                 utf *u = (utf*) hashtable_utf->ptr[i];
1742                 u4 chain_length = 0;
1743
1744                 /* determine chainlength */
1745                 while (u) {
1746                         u = u->hashlink;
1747                         chain_length++;
1748                 }
1749
1750                 /* update sum of all chainlengths */
1751                 sum_chainlength+=chain_length;
1752
1753                 /* determine the maximum length of the chains */
1754                 if (chain_length>max_chainlength)
1755                         max_chainlength = chain_length;
1756
1757                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1758                 if (chain_length>=CHAIN_LIMIT) {
1759                         beyond_limit+=chain_length;
1760                         chain_length=CHAIN_LIMIT-1;
1761                 }
1762
1763                 /* update number of hashchains of current length */
1764                 chain_count[chain_length]++;
1765         }
1766
1767         /* display results */
1768         for (i=1;i<CHAIN_LIMIT-1;i++)
1769                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1770
1771         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1772
1773
1774         printf("max. chainlength:%5d\n",max_chainlength);
1775
1776         /* avg. chainlength = sum of chainlengths / number of chains */
1777         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1778 }
1779 #endif /* !defined(NDEBUG) */
1780
1781
1782 /*
1783  * These are local overrides for various environment variables in Emacs.
1784  * Please do not remove this and leave it at the end of the file, where
1785  * Emacs will automagically detect them.
1786  * ---------------------------------------------------------------------
1787  * Local variables:
1788  * mode: c
1789  * indent-tabs-mode: t
1790  * c-basic-offset: 4
1791  * tab-width: 4
1792  * End:
1793  * vim:noexpandtab:sw=4:ts=4:
1794  */