src/vm/utf8.c

   1 /* src/vm/utf8.c - utf8 functions
   2
   3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
   4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
   5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
   6    Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  23    02111-1307, USA.
  24
  25    Contact: cacao@complang.tuwien.ac.at
  26
  27    Authors: Reinhard Grafl
  28
  29    Changes: Mark Probst
  30             Andreas Krall
  31             Christian Thalinger
  32
  33    $Id: utf8.c 2158 2005-03-30 20:06:37Z twisti $
  34
  35 */
  36
  37
  38 #include <string.h>
  39
  40 #include "mm/memory.h"
  41 #include "vm/exceptions.h"
  42 #include "vm/options.h"
  43 #include "vm/statistics.h"
  44 #include "vm/tables.h"
  45 #include "vm/utf8.h"
  46
  47
  48 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
  49
  50
  51 /* utf-symbols for pointer comparison of frequently used strings **************/
  52
  53 utf *utf_java_lang_Object;              /* java/lang/Object                   */
  54
  55 utf *utf_java_lang_Class;
  56 utf *utf_java_lang_ClassLoader;
  57 utf *utf_java_lang_Cloneable;
  58 utf *utf_java_lang_SecurityManager;
  59 utf *utf_java_lang_String;
  60 utf *utf_java_lang_System;
  61 utf *utf_java_io_Serializable;
  62
  63 utf *utf_java_lang_Throwable;
  64 utf *utf_java_lang_VMThrowable;
  65 utf *utf_java_lang_Exception;
  66 utf *utf_java_lang_Error;
  67 utf *utf_java_lang_OutOfMemoryError;
  68 utf *utf_java_lang_NoClassDefFoundError;
  69
  70 utf* utf_java_lang_Void;
  71 utf* utf_java_lang_Boolean;
  72 utf* utf_java_lang_Byte;
  73 utf* utf_java_lang_Character;
  74 utf* utf_java_lang_Short;
  75 utf* utf_java_lang_Integer;
  76 utf* utf_java_lang_Long;
  77 utf* utf_java_lang_Float;
  78 utf* utf_java_lang_Double;
  79
  80 utf *utf_java_util_Vector;
  81
  82 utf *utf_InnerClasses;                  /* InnerClasses                       */
  83 utf *utf_ConstantValue;                 /* ConstantValue                      */
  84 utf *utf_Code;                          /* Code                               */
  85 utf *utf_Exceptions;                    /* Exceptions                         */
  86 utf *utf_LineNumberTable;               /* LineNumberTable                    */
  87 utf *utf_SourceFile;                    /* SourceFile                         */
  88
  89 utf *utf_init;                          /* <init>                             */
  90 utf *utf_clinit;                        /* <clinit>                           */
  91 utf *utf_finalize;                      /* finalize                           */
  92
  93 utf *utf_printStackTrace;
  94 utf *utf_fillInStackTrace;
  95 utf *utf_loadClass;
  96
  97 utf *utf_void__void;                    /* ()V                                */
  98 utf *utf_boolean__void;                 /* (Z)V                               */
  99 utf *utf_byte__void;                    /* (B)V                               */
 100 utf *utf_char__void;                    /* (C)V                               */
 101 utf *utf_short__void;                   /* (S)V                               */
 102 utf *utf_int__void;                     /* (I)V                               */
 103 utf *utf_long__void;                    /* (J)V                               */
 104 utf *utf_float__void;                   /* (F)V                               */
 105 utf *utf_double__void;                  /* (D)V                               */
 106 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 107 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 108 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 109 utf *utf_java_lang_String__java_lang_Class;
 110 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 111
 112 utf *array_packagename;
 113
 114
 115 /* utf_init ********************************************************************
 116
 117    Initializes the utf8 subsystem.
 118
 119 *******************************************************************************/
 120
 121 void utf8_init(void)
 122 {
 123         /* create utf-symbols for pointer comparison of frequently used strings */
 124
 125         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 126
 127         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 128         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 129         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 130         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 131         utf_java_lang_String           = utf_new_char("java/lang/String");
 132         utf_java_lang_System           = utf_new_char("java/lang/System");
 133         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 134
 135         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
 136         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
 137         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
 138         utf_java_lang_Error            = utf_new_char("java/lang/Error");
 139         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
 140
 141         utf_java_lang_NoClassDefFoundError =
 142                 utf_new_char(string_java_lang_NoClassDefFoundError);
 143
 144         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 145         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 146         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 147         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 148         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 149         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 150         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 151         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 152         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 153
 154         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 155
 156         utf_InnerClasses               = utf_new_char("InnerClasses");
 157         utf_ConstantValue              = utf_new_char("ConstantValue");
 158         utf_Code                       = utf_new_char("Code");
 159         utf_Exceptions                 = utf_new_char("Exceptions");
 160         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 161         utf_SourceFile                 = utf_new_char("SourceFile");
 162
 163         utf_init                           = utf_new_char("<init>");
 164         utf_clinit                         = utf_new_char("<clinit>");
 165         utf_finalize                   = utf_new_char("finalize");
 166
 167         utf_printStackTrace            = utf_new_char("printStackTrace");
 168         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 169         utf_loadClass                  = utf_new_char("loadClass");
 170
 171         utf_void__void                 = utf_new_char("()V");
 172         utf_boolean__void              = utf_new_char("(Z)V");
 173         utf_byte__void                 = utf_new_char("(B)V");
 174         utf_char__void                 = utf_new_char("(C)V");
 175         utf_short__void                = utf_new_char("(S)V");
 176         utf_int__void                  = utf_new_char("(I)V");
 177         utf_long__void                 = utf_new_char("(J)V");
 178         utf_float__void                = utf_new_char("(F)V");
 179         utf_double__void               = utf_new_char("(D)V");
 180         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 181         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 182         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 183
 184         utf_java_lang_String__java_lang_Class =
 185                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 186
 187         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 188
 189         array_packagename              = utf_new_char("\t<the array package>");
 190 }
 191
 192
 193 /* utf_hashkey *****************************************************************
 194
 195    The hashkey is computed from the utf-text by using up to 8
 196    characters.  For utf-symbols longer than 15 characters 3 characters
 197    are taken from the beginning and the end, 2 characters are taken
 198    from the middle.
 199
 200 *******************************************************************************/
 201
 202 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 203 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 204
 205 u4 utf_hashkey(const char *text, u4 length)
 206 {
 207         const char *start_pos = text;       /* pointer to utf text                */
 208         u4 a;
 209
 210         switch (length) {
 211         case 0: /* empty string */
 212                 return 0;
 213
 214         case 1: return fbs(0);
 215         case 2: return fbs(0) ^ nbs(3);
 216         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 217         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 218         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 219         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 220         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 221         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 222
 223         case 9:
 224                 a = fbs(0);
 225                 a ^= nbs(1);
 226                 a ^= nbs(2);
 227                 text++;
 228                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 229
 230         case 10:
 231                 a = fbs(0);
 232                 text++;
 233                 a ^= nbs(2);
 234                 a ^= nbs(3);
 235                 a ^= nbs(4);
 236                 text++;
 237                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 238
 239         case 11:
 240                 a = fbs(0);
 241                 text++;
 242                 a ^= nbs(2);
 243                 a ^= nbs(3);
 244                 a ^= nbs(4);
 245                 text++;
 246                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 247
 248         case 12:
 249                 a = fbs(0);
 250                 text += 2;
 251                 a ^= nbs(2);
 252                 a ^= nbs(3);
 253                 text++;
 254                 a ^= nbs(5);
 255                 a ^= nbs(6);
 256                 a ^= nbs(7);
 257                 text++;
 258                 return a ^ nbs(9) ^ nbs(10);
 259
 260         case 13:
 261                 a = fbs(0);
 262                 a ^= nbs(1);
 263                 text++;
 264                 a ^= nbs(3);
 265                 a ^= nbs(4);
 266                 text += 2;
 267                 a ^= nbs(7);
 268                 a ^= nbs(8);
 269                 text += 2;
 270                 return a ^ nbs(9) ^ nbs(10);
 271
 272         case 14:
 273                 a = fbs(0);
 274                 text += 2;
 275                 a ^= nbs(3);
 276                 a ^= nbs(4);
 277                 text += 2;
 278                 a ^= nbs(7);
 279                 a ^= nbs(8);
 280                 text += 2;
 281                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 282
 283         case 15:
 284                 a = fbs(0);
 285                 text += 2;
 286                 a ^= nbs(3);
 287                 a ^= nbs(4);
 288                 text += 2;
 289                 a ^= nbs(7);
 290                 a ^= nbs(8);
 291                 text += 2;
 292                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 293
 294         default:  /* 3 characters from beginning */
 295                 a = fbs(0);
 296                 text += 2;
 297                 a ^= nbs(3);
 298                 a ^= nbs(4);
 299
 300                 /* 2 characters from middle */
 301                 text = start_pos + (length / 2);
 302                 a ^= fbs(5);
 303                 text += 2;
 304                 a ^= nbs(6);
 305
 306                 /* 3 characters from end */
 307                 text = start_pos + length - 4;
 308
 309                 a ^= fbs(7);
 310                 text++;
 311
 312                 return a ^ nbs(10) ^ nbs(11);
 313     }
 314 }
 315
 316
 317 /* utf_hashkey *****************************************************************
 318
 319    Compute the hashkey of a unicode string.
 320
 321 *******************************************************************************/
 322
 323 u4 unicode_hashkey(u2 *text, u2 len)
 324 {
 325         return utf_hashkey((char *) text, len);
 326 }
 327
 328
 329 /* utf_new *********************************************************************
 330
 331    Creates a new utf-symbol, the text of the symbol is passed as a
 332    u1-array. The function searches the utf-hashtable for a utf-symbol
 333    with this text. On success the element returned, otherwise a new
 334    hashtable element is created.
 335
 336    If the number of entries in the hashtable exceeds twice the size of
 337    the hashtable slots a reorganization of the hashtable is done and
 338    the utf symbols are copied to a new hashtable with doubled size.
 339
 340 *******************************************************************************/
 341
 342 utf *utf_new_intern(const char *text, u2 length);
 343
 344 utf *utf_new(const char *text, u2 length)
 345 {
 346     utf *r;
 347
 348 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 349     tables_lock();
 350 #endif
 351
 352     r = utf_new_intern(text, length);
 353
 354 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 355     tables_unlock();
 356 #endif
 357
 358     return r;
 359 }
 360
 361
 362 utf *utf_new_intern(const char *text, u2 length)
 363 {
 364         u4 key;                             /* hashkey computed from utf-text     */
 365         u4 slot;                            /* slot in hashtable                  */
 366         utf *u;                             /* hashtable element                  */
 367         u2 i;
 368
 369 #ifdef STATISTICS
 370         if (opt_stat)
 371                 count_utf_new++;
 372 #endif
 373
 374         key  = utf_hashkey(text, length);
 375         slot = key & (utf_hash.size - 1);
 376         u    = utf_hash.ptr[slot];
 377
 378         /* search external hash chain for utf-symbol */
 379         while (u) {
 380                 if (u->blength == length) {
 381
 382                         /* compare text of hashtable elements */
 383                         for (i = 0; i < length; i++)
 384                                 if (text[i] != u->text[i]) goto nomatch;
 385
 386 #ifdef STATISTICS
 387                         if (opt_stat)
 388                                 count_utf_new_found++;
 389 #endif
 390
 391                         /* symbol found in hashtable */
 392                         return u;
 393                 }
 394         nomatch:
 395                 u = u->hashlink; /* next element in external chain */
 396         }
 397
 398 #ifdef STATISTICS
 399         if (opt_stat)
 400                 count_utf_len += sizeof(utf) + length;
 401 #endif
 402
 403         /* location in hashtable found, create new utf element */
 404         u = NEW(utf);
 405         u->blength  = length;               /* length in bytes of utfstring       */
 406         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
 407         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 408         memcpy(u->text, text, length);      /* copy utf-text                      */
 409         u->text[length] = '\0';
 410         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
 411
 412         utf_hash.entries++;                 /* update number of entries           */
 413
 414         if (utf_hash.entries > (utf_hash.size * 2)) {
 415
 416         /* reorganization of hashtable, average length of
 417            the external chains is approx. 2                */
 418
 419                 u4 i;
 420                 utf *u;
 421                 hashtable newhash; /* the new hashtable */
 422
 423                 /* create new hashtable, double the size */
 424                 init_hashtable(&newhash, utf_hash.size * 2);
 425                 newhash.entries = utf_hash.entries;
 426
 427 #ifdef STATISTICS
 428                 if (opt_stat)
 429                         count_utf_len += sizeof(utf*) * utf_hash.size;
 430 #endif
 431
 432                 /* transfer elements to new hashtable */
 433                 for (i = 0; i < utf_hash.size; i++) {
 434                         u = (utf *) utf_hash.ptr[i];
 435                         while (u) {
 436                                 utf *nextu = u->hashlink;
 437                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
 438
 439                                 u->hashlink = (utf *) newhash.ptr[slot];
 440                                 newhash.ptr[slot] = u;
 441
 442                                 /* follow link in external hash chain */
 443                                 u = nextu;
 444                         }
 445                 }
 446
 447                 /* dispose old table */
 448                 MFREE(utf_hash.ptr, void*, utf_hash.size);
 449                 utf_hash = newhash;
 450         }
 451
 452         return u;
 453 }
 454
 455
 456 /* utf_new_u2 ******************************************************************
 457
 458    Make utf symbol from u2 array, if isclassname is true '.' is
 459    replaced by '/'.
 460
 461 *******************************************************************************/
 462
 463 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 464 {
 465         char *buffer;                   /* memory buffer for  unicode characters  */
 466         char *pos;                      /* pointer to current position in buffer  */
 467         u4 left;                        /* unicode characters left                */
 468         u4 buflength;                   /* utf length in bytes of the u2 array    */
 469         utf *result;                    /* resulting utf-string                   */
 470         int i;
 471
 472         /* determine utf length in bytes and allocate memory */
 473
 474         buflength = u2_utflength(unicode_pos, unicode_length);
 475         buffer    = MNEW(char, buflength);
 476
 477         left = buflength;
 478         pos  = buffer;
 479
 480         for (i = 0; i++ < unicode_length; unicode_pos++) {
 481                 /* next unicode character */
 482                 u2 c = *unicode_pos;
 483
 484                 if ((c != 0) && (c < 0x80)) {
 485                         /* 1 character */
 486                         left--;
 487                 if ((int) left < 0) break;
 488                         /* convert classname */
 489                         if (isclassname && c == '.')
 490                                 *pos++ = '/';
 491                         else
 492                                 *pos++ = (char) c;
 493
 494                 } else if (c < 0x800) {
 495                         /* 2 characters */
 496                 unsigned char high = c >> 6;
 497                 unsigned char low  = c & 0x3F;
 498                         left = left - 2;
 499                 if ((int) left < 0) break;
 500                 *pos++ = high | 0xC0;
 501                 *pos++ = low  | 0x80;
 502
 503                 } else {
 504                 /* 3 characters */
 505                 char low  = c & 0x3f;
 506                 char mid  = (c >> 6) & 0x3F;
 507                 char high = c >> 12;
 508                         left = left - 3;
 509                 if ((int) left < 0) break;
 510                 *pos++ = high | 0xE0;
 511                 *pos++ = mid  | 0x80;
 512                 *pos++ = low  | 0x80;
 513                 }
 514         }
 515
 516         /* insert utf-string into symbol-table */
 517         result = utf_new(buffer,buflength);
 518
 519         MFREE(buffer, char, buflength);
 520
 521         return result;
 522 }
 523
 524
 525 /* utf_new_char ****************************************************************
 526
 527    Creates a new utf symbol, the text for this symbol is passed as a
 528    c-string ( = char* ).
 529
 530 *******************************************************************************/
 531
 532 utf *utf_new_char(const char *text)
 533 {
 534         return utf_new(text, strlen(text));
 535 }
 536
 537
 538 /* utf_new_char_classname ******************************************************
 539
 540    Creates a new utf symbol, the text for this symbol is passed as a
 541    c-string ( = char* ) "." characters are going to be replaced by
 542    "/". Since the above function is used often, this is a separte
 543    function, instead of an if.
 544
 545 *******************************************************************************/
 546
 547 utf *utf_new_char_classname(const char *text)
 548 {
 549         if (strchr(text, '.')) {
 550                 char *txt = strdup(text);
 551                 char *end = txt + strlen(txt);
 552                 char *c;
 553                 utf *tmpRes;
 554
 555                 for (c = txt; c < end; c++)
 556                         if (*c == '.') *c = '/';
 557
 558                 tmpRes = utf_new(txt, strlen(txt));
 559                 FREE(txt, 0);
 560
 561                 return tmpRes;
 562
 563         } else
 564                 return utf_new(text, strlen(text));
 565 }
 566
 567
 568 /* utf_nextu2 ******************************************************************
 569
 570    Read the next unicode character from the utf string and increment
 571    the utf-string pointer accordingly.
 572
 573 *******************************************************************************/
 574
 575 u2 utf_nextu2(char **utf_ptr)
 576 {
 577     /* uncompressed unicode character */
 578     u2 unicode_char = 0;
 579     /* current position in utf text */
 580     unsigned char *utf = (unsigned char *) (*utf_ptr);
 581     /* bytes representing the unicode character */
 582     unsigned char ch1, ch2, ch3;
 583     /* number of bytes used to represent the unicode character */
 584     int len = 0;
 585
 586     switch ((ch1 = utf[0]) >> 4) {
 587         default: /* 1 byte */
 588                 (*utf_ptr)++;
 589                 return (u2) ch1;
 590         case 0xC:
 591         case 0xD: /* 2 bytes */
 592                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 593                         unsigned char high = ch1 & 0x1F;
 594                         unsigned char low  = ch2 & 0x3F;
 595                         unicode_char = (high << 6) + low;
 596                         len = 2;
 597                 }
 598                 break;
 599
 600         case 0xE: /* 2 or 3 bytes */
 601                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 602                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 603                                 unsigned char low  = ch3 & 0x3f;
 604                                 unsigned char mid  = ch2 & 0x3f;
 605                                 unsigned char high = ch1 & 0x0f;
 606                                 unicode_char = (((high << 6) + mid) << 6) + low;
 607                                 len = 3;
 608                         } else
 609                                 len = 2;
 610                 }
 611                 break;
 612     }
 613
 614     /* update position in utf-text */
 615     *utf_ptr = (char *) (utf + len);
 616
 617     return unicode_char;
 618 }
 619
 620
 621 /* utf_strlen ******************************************************************
 622
 623    Determine number of unicode characters in the utf string.
 624
 625 *******************************************************************************/
 626
 627 u4 utf_strlen(utf *u)
 628 {
 629         char *endpos;                       /* points behind utf string           */
 630         char *utf_ptr;                      /* current position in utf text       */
 631         u4 len = 0;                         /* number of unicode characters       */
 632
 633         if (!u) {
 634                 *exceptionptr = new_nullpointerexception();
 635                 return 0;
 636         }
 637
 638         endpos = utf_end(u);
 639         utf_ptr = u->text;
 640
 641         while (utf_ptr < endpos) {
 642                 len++;
 643                 /* next unicode character */
 644                 utf_nextu2(&utf_ptr);
 645         }
 646
 647         if (utf_ptr != endpos)
 648                 /* string ended abruptly */
 649                 throw_cacao_exception_exit(string_java_lang_InternalError,
 650                                                                    "Illegal utf8 string");
 651
 652         return len;
 653 }
 654
 655
 656 /* u2_utflength ****************************************************************
 657
 658    Returns the utf length in bytes of a u2 array.
 659
 660 *******************************************************************************/
 661
 662 u4 u2_utflength(u2 *text, u4 u2_length)
 663 {
 664         u4 result_len = 0;                  /* utf length in bytes                */
 665         u2 ch;                              /* current unicode character          */
 666         u4 len;
 667
 668         for (len = 0; len < u2_length; len++) {
 669                 /* next unicode character */
 670                 ch = *text++;
 671
 672                 /* determine bytes required to store unicode character as utf */
 673                 if (ch && (ch < 0x80))
 674                         result_len++;
 675                 else if (ch < 0x800)
 676                         result_len += 2;
 677                 else
 678                         result_len += 3;
 679         }
 680
 681     return result_len;
 682 }
 683
 684
 685 /* utf_display *****************************************************************
 686
 687    Write utf symbol to stdout (for debugging purposes).
 688
 689 *******************************************************************************/
 690
 691 void utf_display(utf *u)
 692 {
 693         char *endpos;                       /* points behind utf string           */
 694         char *utf_ptr;                      /* current position in utf text       */
 695
 696         if (!u) {
 697                 printf("NULL");
 698                 fflush(stdout);
 699                 return;
 700         }
 701
 702         endpos = utf_end(u);
 703         utf_ptr = u->text;
 704
 705         while (utf_ptr < endpos) {
 706                 /* read next unicode character */
 707                 u2 c = utf_nextu2(&utf_ptr);
 708                 if (c >= 32 && c <= 127) printf("%c", c);
 709                 else printf("?");
 710         }
 711
 712         fflush(stdout);
 713 }
 714
 715
 716 /* utf_display_classname *******************************************************
 717
 718    Write utf symbol to stdout with `/' converted to `.' (for debugging
 719    purposes).
 720
 721 *******************************************************************************/
 722
 723 void utf_display_classname(utf *u)
 724 {
 725         char *endpos;                       /* points behind utf string           */
 726         char *utf_ptr;                      /* current position in utf text       */
 727
 728         if (!u) {
 729                 printf("NULL");
 730                 fflush(stdout);
 731                 return;
 732         }
 733
 734         endpos = utf_end(u);
 735         utf_ptr = u->text;
 736
 737         while (utf_ptr < endpos) {
 738                 /* read next unicode character */
 739                 u2 c = utf_nextu2(&utf_ptr);
 740                 if (c == '/') c = '.';
 741                 if (c >= 32 && c <= 127) printf("%c", c);
 742                 else printf("?");
 743         }
 744
 745         fflush(stdout);
 746 }
 747
 748
 749 /* utf_sprint ******************************************************************
 750
 751    Write utf symbol into c-string (for debugging purposes).
 752
 753 *******************************************************************************/
 754
 755 void utf_sprint(char *buffer, utf *u)
 756 {
 757         char *endpos;                       /* points behind utf string           */
 758         char *utf_ptr;                      /* current position in utf text       */
 759         u2 pos = 0;                         /* position in c-string               */
 760
 761         if (!u) {
 762                 strcpy(buffer, "NULL");
 763                 return;
 764         }
 765
 766         endpos = utf_end(u);
 767         utf_ptr = u->text;
 768
 769         while (utf_ptr < endpos)
 770                 /* copy next unicode character */
 771                 buffer[pos++] = utf_nextu2(&utf_ptr);
 772
 773         /* terminate string */
 774         buffer[pos] = '\0';
 775 }
 776
 777
 778 /* utf_sprint_classname ********************************************************
 779
 780    Write utf symbol into c-string with `/' converted to `.' (for debugging
 781    purposes).
 782
 783 *******************************************************************************/
 784
 785 void utf_sprint_classname(char *buffer, utf *u)
 786 {
 787         char *endpos;                       /* points behind utf string           */
 788         char *utf_ptr;                      /* current position in utf text       */
 789         u2 pos = 0;                         /* position in c-string               */
 790
 791         if (!u) {
 792                 strcpy(buffer, "NULL");
 793                 return;
 794         }
 795
 796         endpos = utf_end(u);
 797         utf_ptr = u->text;
 798
 799         while (utf_ptr < endpos) {
 800                 /* copy next unicode character */
 801                 u2 c = utf_nextu2(&utf_ptr);
 802                 if (c == '/') c = '.';
 803                 buffer[pos++] = c;
 804         }
 805
 806         /* terminate string */
 807         buffer[pos] = '\0';
 808 }
 809
 810
 811 /* utf_strcat ******************************************************************
 812
 813    Like libc strcat, but uses an utf8 string.
 814
 815 *******************************************************************************/
 816
 817 void utf_strcat(char *buffer, utf *u)
 818 {
 819         utf_sprint(buffer + strlen(buffer), u);
 820 }
 821
 822
 823 /* utf_strcat_classname ********************************************************
 824
 825    Like libc strcat, but uses an utf8 string.
 826
 827 *******************************************************************************/
 828
 829 void utf_strcat_classname(char *buffer, utf *u)
 830 {
 831         utf_sprint_classname(buffer + strlen(buffer), u);
 832 }
 833
 834
 835 /* utf_fprint ******************************************************************
 836
 837    Write utf symbol into file.
 838
 839 *******************************************************************************/
 840
 841 void utf_fprint(FILE *file, utf *u)
 842 {
 843         char *endpos;                       /* points behind utf string           */
 844         char *utf_ptr;                      /* current position in utf text       */
 845
 846         if (!u)
 847                 return;
 848
 849         endpos = utf_end(u);
 850         utf_ptr = u->text;
 851
 852         while (utf_ptr < endpos) {
 853                 /* read next unicode character */
 854                 u2 c = utf_nextu2(&utf_ptr);
 855
 856                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 857                 else fprintf(file, "?");
 858         }
 859 }
 860
 861
 862 /* utf_fprint_classname ********************************************************
 863
 864    Write utf symbol into file with `/' converted to `.'.
 865
 866 *******************************************************************************/
 867
 868 void utf_fprint_classname(FILE *file, utf *u)
 869 {
 870         char *endpos;                       /* points behind utf string           */
 871         char *utf_ptr;                      /* current position in utf text       */
 872
 873     if (!u)
 874                 return;
 875
 876         endpos = utf_end(u);
 877         utf_ptr = u->text;
 878
 879         while (utf_ptr < endpos) {
 880                 /* read next unicode character */
 881                 u2 c = utf_nextu2(&utf_ptr);
 882                 if (c == '/') c = '.';
 883
 884                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 885                 else fprintf(file, "?");
 886         }
 887 }
 888
 889
 890 /* is_valid_utf ****************************************************************
 891
 892    Return true if the given string is a valid UTF-8 string.
 893
 894    utf_ptr...points to first character
 895    end_pos...points after last character
 896
 897 *******************************************************************************/
 898
 899 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
 900
 901 bool is_valid_utf(char *utf_ptr, char *end_pos)
 902 {
 903         int bytes;
 904         int len,i;
 905         char c;
 906         unsigned long v;
 907
 908         if (end_pos < utf_ptr) return false;
 909         bytes = end_pos - utf_ptr;
 910         while (bytes--) {
 911                 c = *utf_ptr++;
 912
 913                 if (!c) return false;                     /* 0x00 is not allowed */
 914                 if ((c & 0x80) == 0) continue;            /* ASCII */
 915
 916                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
 917                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
 918                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
 919                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
 920                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
 921                 else return false;                        /* invalid leading byte */
 922
 923                 if (len > 2) return false;                /* Java limitation */
 924
 925                 v = (unsigned long)c & (0x3f >> len);
 926
 927                 if ((bytes -= len) < 0) return false;     /* missing bytes */
 928
 929                 for (i = len; i--; ) {
 930                         c = *utf_ptr++;
 931                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
 932                                 return false;
 933                         v = (v << 6) | (c & 0x3f);
 934                 }
 935
 936                 if (v == 0) {
 937                         if (len != 1) return false;           /* Java special */
 938
 939                 } else {
 940                         /* Sun Java seems to allow overlong UTF-8 encodings */
 941
 942                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
 943                                 if (!opt_liberalutf)
 944                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
 945                                 /* XXX change this to panic? */
 946                         }
 947                 }
 948
 949                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
 950                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
 951
 952                 /* even these seem to be allowed */
 953                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
 954         }
 955
 956         return true;
 957 }
 958
 959
 960 /* is_valid_name ***************************************************************
 961
 962    Return true if the given string may be used as a class/field/method
 963    name. (Currently this only disallows empty strings and control
 964    characters.)
 965
 966    NOTE: The string is assumed to have passed is_valid_utf!
 967
 968    utf_ptr...points to first character
 969    end_pos...points after last character
 970
 971 *******************************************************************************/
 972
 973 bool is_valid_name(char *utf_ptr, char *end_pos)
 974 {
 975         if (end_pos <= utf_ptr) return false; /* disallow empty names */
 976
 977         while (utf_ptr < end_pos) {
 978                 unsigned char c = *utf_ptr++;
 979
 980                 if (c < 0x20) return false; /* disallow control characters */
 981                 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80)  /* disallow zero */
 982                         return false;
 983         }
 984
 985         return true;
 986 }
 987
 988 bool is_valid_name_utf(utf *u)
 989 {
 990         return is_valid_name(u->text,utf_end(u));
 991 }
 992
 993
 994 /* utf_show ********************************************************************
 995
 996    Writes the utf symbols in the utfhash to stdout and displays the
 997    number of external hash chains grouped according to the chainlength
 998    (for debugging purposes).
 999
1000 *******************************************************************************/
1001
1002 void utf_show(void)
1003 {
1004
1005 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1006
1007         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1008         u4 max_chainlength = 0;      /* maximum length of the chains */
1009         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1010         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1011         u4 i;
1012
1013         printf ("UTF-HASH:\n");
1014
1015         /* show element of utf-hashtable */
1016         for (i=0; i<utf_hash.size; i++) {
1017                 utf *u = utf_hash.ptr[i];
1018                 if (u) {
1019                         printf ("SLOT %d: ", (int) i);
1020                         while (u) {
1021                                 printf ("'");
1022                                 utf_display (u);
1023                                 printf ("' ");
1024                                 u = u->hashlink;
1025                         }
1026                         printf ("\n");
1027                 }
1028
1029         }
1030
1031         printf ("UTF-HASH: %d slots for %d entries\n",
1032                         (int) utf_hash.size, (int) utf_hash.entries );
1033
1034
1035         if (utf_hash.entries == 0)
1036                 return;
1037
1038         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1039
1040         for (i=0;i<CHAIN_LIMIT;i++)
1041                 chain_count[i]=0;
1042
1043         /* count numbers of hashchains according to their length */
1044         for (i=0; i<utf_hash.size; i++) {
1045
1046                 utf *u = (utf*) utf_hash.ptr[i];
1047                 u4 chain_length = 0;
1048
1049                 /* determine chainlength */
1050                 while (u) {
1051                         u = u->hashlink;
1052                         chain_length++;
1053                 }
1054
1055                 /* update sum of all chainlengths */
1056                 sum_chainlength+=chain_length;
1057
1058                 /* determine the maximum length of the chains */
1059                 if (chain_length>max_chainlength)
1060                         max_chainlength = chain_length;
1061
1062                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1063                 if (chain_length>=CHAIN_LIMIT) {
1064                         beyond_limit+=chain_length;
1065                         chain_length=CHAIN_LIMIT-1;
1066                 }
1067
1068                 /* update number of hashchains of current length */
1069                 chain_count[chain_length]++;
1070         }
1071
1072         /* display results */
1073         for (i=1;i<CHAIN_LIMIT-1;i++)
1074                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1075
1076         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1077
1078
1079         printf("max. chainlength:%5d\n",max_chainlength);
1080
1081         /* avg. chainlength = sum of chainlengths / number of chains */
1082         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1083 }
1084
1085
1086 /*
1087  * These are local overrides for various environment variables in Emacs.
1088  * Please do not remove this and leave it at the end of the file, where
1089  * Emacs will automagically detect them.
1090  * ---------------------------------------------------------------------
1091  * Local variables:
1092  * mode: c
1093  * indent-tabs-mode: t
1094  * c-basic-offset: 4
1095  * tab-width: 4
1096  * End:
1097  */