src/vm/utf8.c

   1 /* src/vm/utf8.c - utf functions
   2
   3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
   4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
   5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
   6    Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  23    02111-1307, USA.
  24
  25    Contact: cacao@complang.tuwien.ac.at
  26
  27    Authors: Reinhard Grafl
  28
  29    Changes: Mark Probst
  30             Andreas Krall
  31             Christian Thalinger
  32
  33    $Id: utf8.c 2148 2005-03-30 16:49:40Z twisti $
  34
  35 */
  36
  37
  38 #include <string.h>
  39
  40 #include "mm/memory.h"
  41 #include "vm/exceptions.h"
  42 #include "vm/options.h"
  43 #include "vm/statistics.h"
  44 #include "vm/tables.h"
  45 #include "vm/utf8.h"
  46
  47
  48 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
  49
  50
  51 /* utf-symbols for pointer comparison of frequently used strings **************/
  52
  53 utf *utf_java_lang_Object;              /* java/lang/Object                   */
  54
  55 utf *utf_java_lang_Class;
  56 utf *utf_java_lang_ClassLoader;
  57 utf *utf_java_lang_Cloneable;
  58 utf *utf_java_lang_SecurityManager;
  59 utf *utf_java_lang_String;
  60 utf *utf_java_lang_System;
  61 utf *utf_java_io_Serializable;
  62
  63 utf *utf_java_lang_Throwable;
  64 utf *utf_java_lang_VMThrowable;
  65 utf *utf_java_lang_Exception;
  66 utf *utf_java_lang_Error;
  67 utf *utf_java_lang_OutOfMemoryError;
  68
  69 utf* utf_java_lang_Void;
  70 utf* utf_java_lang_Boolean;
  71 utf* utf_java_lang_Byte;
  72 utf* utf_java_lang_Character;
  73 utf* utf_java_lang_Short;
  74 utf* utf_java_lang_Integer;
  75 utf* utf_java_lang_Long;
  76 utf* utf_java_lang_Float;
  77 utf* utf_java_lang_Double;
  78
  79 utf *utf_java_util_Vector;
  80
  81 utf *utf_InnerClasses;                  /* InnerClasses                       */
  82 utf *utf_ConstantValue;                 /* ConstantValue                      */
  83 utf *utf_Code;                          /* Code                               */
  84 utf *utf_Exceptions;                    /* Exceptions                         */
  85 utf *utf_LineNumberTable;               /* LineNumberTable                    */
  86 utf *utf_SourceFile;                    /* SourceFile                         */
  87
  88 utf *utf_init;                          /* <init>                             */
  89 utf *utf_clinit;                        /* <clinit>                           */
  90 utf *utf_finalize;                      /* finalize                           */
  91
  92 utf *utf_printStackTrace;
  93 utf *utf_fillInStackTrace;
  94 utf *utf_loadClass;
  95
  96 utf *utf_void__void;                    /* ()V                                */
  97 utf *utf_boolean__void;                 /* (Z)V                               */
  98 utf *utf_byte__void;                    /* (B)V                               */
  99 utf *utf_char__void;                    /* (C)V                               */
 100 utf *utf_short__void;                   /* (S)V                               */
 101 utf *utf_int__void;                     /* (I)V                               */
 102 utf *utf_long__void;                    /* (J)V                               */
 103 utf *utf_float__void;                   /* (F)V                               */
 104 utf *utf_double__void;                  /* (D)V                               */
 105 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 106 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 107 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 108 utf *utf_java_lang_String__java_lang_Class;
 109 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 110
 111 utf *array_packagename;
 112
 113
 114 /* utf_init ********************************************************************
 115
 116    Initializes the utf8 subsystem.
 117
 118 *******************************************************************************/
 119
 120 void utf8_init(void)
 121 {
 122         /* create utf-symbols for pointer comparison of frequently used strings */
 123
 124         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 125
 126         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 127         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 128         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 129         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 130         utf_java_lang_String           = utf_new_char("java/lang/String");
 131         utf_java_lang_System           = utf_new_char("java/lang/System");
 132         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 133
 134         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
 135         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
 136         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
 137         utf_java_lang_Error            = utf_new_char("java/lang/Error");
 138         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
 139
 140         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 141         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 142         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 143         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 144         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 145         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 146         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 147         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 148         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 149
 150         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 151
 152         utf_InnerClasses               = utf_new_char("InnerClasses");
 153         utf_ConstantValue              = utf_new_char("ConstantValue");
 154         utf_Code                       = utf_new_char("Code");
 155         utf_Exceptions                 = utf_new_char("Exceptions");
 156         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 157         utf_SourceFile                 = utf_new_char("SourceFile");
 158
 159         utf_init                           = utf_new_char("<init>");
 160         utf_clinit                         = utf_new_char("<clinit>");
 161         utf_finalize                   = utf_new_char("finalize");
 162
 163         utf_printStackTrace            = utf_new_char("printStackTrace");
 164         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 165         utf_loadClass                  = utf_new_char("loadClass");
 166
 167         utf_void__void                 = utf_new_char("()V");
 168         utf_boolean__void              = utf_new_char("(Z)V");
 169         utf_byte__void                 = utf_new_char("(B)V");
 170         utf_char__void                 = utf_new_char("(C)V");
 171         utf_short__void                = utf_new_char("(S)V");
 172         utf_int__void                  = utf_new_char("(I)V");
 173         utf_long__void                 = utf_new_char("(J)V");
 174         utf_float__void                = utf_new_char("(F)V");
 175         utf_double__void               = utf_new_char("(D)V");
 176         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 177         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 178         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 179
 180         utf_java_lang_String__java_lang_Class =
 181                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 182
 183         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 184
 185         array_packagename              = utf_new_char("\t<the array package>");
 186 }
 187
 188
 189 /* utf_hashkey *****************************************************************
 190
 191    The hashkey is computed from the utf-text by using up to 8
 192    characters.  For utf-symbols longer than 15 characters 3 characters
 193    are taken from the beginning and the end, 2 characters are taken
 194    from the middle.
 195
 196 *******************************************************************************/
 197
 198 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 199 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 200
 201 u4 utf_hashkey(const char *text, u4 length)
 202 {
 203         const char *start_pos = text;       /* pointer to utf text                */
 204         u4 a;
 205
 206         switch (length) {
 207         case 0: /* empty string */
 208                 return 0;
 209
 210         case 1: return fbs(0);
 211         case 2: return fbs(0) ^ nbs(3);
 212         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 213         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 214         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 215         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 216         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 217         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 218
 219         case 9:
 220                 a = fbs(0);
 221                 a ^= nbs(1);
 222                 a ^= nbs(2);
 223                 text++;
 224                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 225
 226         case 10:
 227                 a = fbs(0);
 228                 text++;
 229                 a ^= nbs(2);
 230                 a ^= nbs(3);
 231                 a ^= nbs(4);
 232                 text++;
 233                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 234
 235         case 11:
 236                 a = fbs(0);
 237                 text++;
 238                 a ^= nbs(2);
 239                 a ^= nbs(3);
 240                 a ^= nbs(4);
 241                 text++;
 242                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 243
 244         case 12:
 245                 a = fbs(0);
 246                 text += 2;
 247                 a ^= nbs(2);
 248                 a ^= nbs(3);
 249                 text++;
 250                 a ^= nbs(5);
 251                 a ^= nbs(6);
 252                 a ^= nbs(7);
 253                 text++;
 254                 return a ^ nbs(9) ^ nbs(10);
 255
 256         case 13:
 257                 a = fbs(0);
 258                 a ^= nbs(1);
 259                 text++;
 260                 a ^= nbs(3);
 261                 a ^= nbs(4);
 262                 text += 2;
 263                 a ^= nbs(7);
 264                 a ^= nbs(8);
 265                 text += 2;
 266                 return a ^ nbs(9) ^ nbs(10);
 267
 268         case 14:
 269                 a = fbs(0);
 270                 text += 2;
 271                 a ^= nbs(3);
 272                 a ^= nbs(4);
 273                 text += 2;
 274                 a ^= nbs(7);
 275                 a ^= nbs(8);
 276                 text += 2;
 277                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 278
 279         case 15:
 280                 a = fbs(0);
 281                 text += 2;
 282                 a ^= nbs(3);
 283                 a ^= nbs(4);
 284                 text += 2;
 285                 a ^= nbs(7);
 286                 a ^= nbs(8);
 287                 text += 2;
 288                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 289
 290         default:  /* 3 characters from beginning */
 291                 a = fbs(0);
 292                 text += 2;
 293                 a ^= nbs(3);
 294                 a ^= nbs(4);
 295
 296                 /* 2 characters from middle */
 297                 text = start_pos + (length / 2);
 298                 a ^= fbs(5);
 299                 text += 2;
 300                 a ^= nbs(6);
 301
 302                 /* 3 characters from end */
 303                 text = start_pos + length - 4;
 304
 305                 a ^= fbs(7);
 306                 text++;
 307
 308                 return a ^ nbs(10) ^ nbs(11);
 309     }
 310 }
 311
 312
 313 /* utf_hashkey *****************************************************************
 314
 315    Compute the hashkey of a unicode string.
 316
 317 *******************************************************************************/
 318
 319 u4 unicode_hashkey(u2 *text, u2 len)
 320 {
 321         return utf_hashkey((char *) text, len);
 322 }
 323
 324
 325 /* utf_new *********************************************************************
 326
 327    Creates a new utf-symbol, the text of the symbol is passed as a
 328    u1-array. The function searches the utf-hashtable for a utf-symbol
 329    with this text. On success the element returned, otherwise a new
 330    hashtable element is created.
 331
 332    If the number of entries in the hashtable exceeds twice the size of
 333    the hashtable slots a reorganization of the hashtable is done and
 334    the utf symbols are copied to a new hashtable with doubled size.
 335
 336 *******************************************************************************/
 337
 338 utf *utf_new_intern(const char *text, u2 length);
 339
 340 utf *utf_new(const char *text, u2 length)
 341 {
 342     utf *r;
 343
 344 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 345     tables_lock();
 346 #endif
 347
 348     r = utf_new_intern(text, length);
 349
 350 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 351     tables_unlock();
 352 #endif
 353
 354     return r;
 355 }
 356
 357
 358 utf *utf_new_intern(const char *text, u2 length)
 359 {
 360         u4 key;                             /* hashkey computed from utf-text     */
 361         u4 slot;                            /* slot in hashtable                  */
 362         utf *u;                             /* hashtable element                  */
 363         u2 i;
 364
 365 #ifdef STATISTICS
 366         if (opt_stat)
 367                 count_utf_new++;
 368 #endif
 369
 370         key  = utf_hashkey(text, length);
 371         slot = key & (utf_hash.size - 1);
 372         u    = utf_hash.ptr[slot];
 373
 374         /* search external hash chain for utf-symbol */
 375         while (u) {
 376                 if (u->blength == length) {
 377
 378                         /* compare text of hashtable elements */
 379                         for (i = 0; i < length; i++)
 380                                 if (text[i] != u->text[i]) goto nomatch;
 381
 382 #ifdef STATISTICS
 383                         if (opt_stat)
 384                                 count_utf_new_found++;
 385 #endif
 386
 387                         /* symbol found in hashtable */
 388                         return u;
 389                 }
 390         nomatch:
 391                 u = u->hashlink; /* next element in external chain */
 392         }
 393
 394 #ifdef STATISTICS
 395         if (opt_stat)
 396                 count_utf_len += sizeof(utf) + length;
 397 #endif
 398
 399         /* location in hashtable found, create new utf element */
 400         u = NEW(utf);
 401         u->blength  = length;               /* length in bytes of utfstring       */
 402         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
 403         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 404         memcpy(u->text, text, length);      /* copy utf-text                      */
 405         u->text[length] = '\0';
 406         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
 407
 408         utf_hash.entries++;                 /* update number of entries           */
 409
 410         if (utf_hash.entries > (utf_hash.size * 2)) {
 411
 412         /* reorganization of hashtable, average length of
 413            the external chains is approx. 2                */
 414
 415                 u4 i;
 416                 utf *u;
 417                 hashtable newhash; /* the new hashtable */
 418
 419                 /* create new hashtable, double the size */
 420                 init_hashtable(&newhash, utf_hash.size * 2);
 421                 newhash.entries = utf_hash.entries;
 422
 423 #ifdef STATISTICS
 424                 if (opt_stat)
 425                         count_utf_len += sizeof(utf*) * utf_hash.size;
 426 #endif
 427
 428                 /* transfer elements to new hashtable */
 429                 for (i = 0; i < utf_hash.size; i++) {
 430                         u = (utf *) utf_hash.ptr[i];
 431                         while (u) {
 432                                 utf *nextu = u->hashlink;
 433                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
 434
 435                                 u->hashlink = (utf *) newhash.ptr[slot];
 436                                 newhash.ptr[slot] = u;
 437
 438                                 /* follow link in external hash chain */
 439                                 u = nextu;
 440                         }
 441                 }
 442
 443                 /* dispose old table */
 444                 MFREE(utf_hash.ptr, void*, utf_hash.size);
 445                 utf_hash = newhash;
 446         }
 447
 448         return u;
 449 }
 450
 451
 452 /* utf_new_u2 ******************************************************************
 453
 454    Make utf symbol from u2 array, if isclassname is true '.' is
 455    replaced by '/'.
 456
 457 *******************************************************************************/
 458
 459 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 460 {
 461         char *buffer;                   /* memory buffer for  unicode characters  */
 462         char *pos;                      /* pointer to current position in buffer  */
 463         u4 left;                        /* unicode characters left                */
 464         u4 buflength;                   /* utf length in bytes of the u2 array    */
 465         utf *result;                    /* resulting utf-string                   */
 466         int i;
 467
 468         /* determine utf length in bytes and allocate memory */
 469
 470         buflength = u2_utflength(unicode_pos, unicode_length);
 471         buffer    = MNEW(char, buflength);
 472
 473         left = buflength;
 474         pos  = buffer;
 475
 476         for (i = 0; i++ < unicode_length; unicode_pos++) {
 477                 /* next unicode character */
 478                 u2 c = *unicode_pos;
 479
 480                 if ((c != 0) && (c < 0x80)) {
 481                         /* 1 character */
 482                         left--;
 483                 if ((int) left < 0) break;
 484                         /* convert classname */
 485                         if (isclassname && c == '.')
 486                                 *pos++ = '/';
 487                         else
 488                                 *pos++ = (char) c;
 489
 490                 } else if (c < 0x800) {
 491                         /* 2 characters */
 492                 unsigned char high = c >> 6;
 493                 unsigned char low  = c & 0x3F;
 494                         left = left - 2;
 495                 if ((int) left < 0) break;
 496                 *pos++ = high | 0xC0;
 497                 *pos++ = low  | 0x80;
 498
 499                 } else {
 500                 /* 3 characters */
 501                 char low  = c & 0x3f;
 502                 char mid  = (c >> 6) & 0x3F;
 503                 char high = c >> 12;
 504                         left = left - 3;
 505                 if ((int) left < 0) break;
 506                 *pos++ = high | 0xE0;
 507                 *pos++ = mid  | 0x80;
 508                 *pos++ = low  | 0x80;
 509                 }
 510         }
 511
 512         /* insert utf-string into symbol-table */
 513         result = utf_new(buffer,buflength);
 514
 515         MFREE(buffer, char, buflength);
 516
 517         return result;
 518 }
 519
 520
 521 /* utf_new_char ****************************************************************
 522
 523    Creates a new utf symbol, the text for this symbol is passed as a
 524    c-string ( = char* ).
 525
 526 *******************************************************************************/
 527
 528 utf *utf_new_char(const char *text)
 529 {
 530         return utf_new(text, strlen(text));
 531 }
 532
 533
 534 /* utf_new_char_classname ******************************************************
 535
 536    Creates a new utf symbol, the text for this symbol is passed as a
 537    c-string ( = char* ) "." characters are going to be replaced by
 538    "/". Since the above function is used often, this is a separte
 539    function, instead of an if.
 540
 541 *******************************************************************************/
 542
 543 utf *utf_new_char_classname(const char *text)
 544 {
 545         if (strchr(text, '.')) {
 546                 char *txt = strdup(text);
 547                 char *end = txt + strlen(txt);
 548                 char *c;
 549                 utf *tmpRes;
 550
 551                 for (c = txt; c < end; c++)
 552                         if (*c == '.') *c = '/';
 553
 554                 tmpRes = utf_new(txt, strlen(txt));
 555                 FREE(txt, 0);
 556
 557                 return tmpRes;
 558
 559         } else
 560                 return utf_new(text, strlen(text));
 561 }
 562
 563
 564 /* utf_nextu2 ******************************************************************
 565
 566    Read the next unicode character from the utf string and increment
 567    the utf-string pointer accordingly.
 568
 569 *******************************************************************************/
 570
 571 u2 utf_nextu2(char **utf_ptr)
 572 {
 573     /* uncompressed unicode character */
 574     u2 unicode_char = 0;
 575     /* current position in utf text */
 576     unsigned char *utf = (unsigned char *) (*utf_ptr);
 577     /* bytes representing the unicode character */
 578     unsigned char ch1, ch2, ch3;
 579     /* number of bytes used to represent the unicode character */
 580     int len = 0;
 581
 582     switch ((ch1 = utf[0]) >> 4) {
 583         default: /* 1 byte */
 584                 (*utf_ptr)++;
 585                 return (u2) ch1;
 586         case 0xC:
 587         case 0xD: /* 2 bytes */
 588                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 589                         unsigned char high = ch1 & 0x1F;
 590                         unsigned char low  = ch2 & 0x3F;
 591                         unicode_char = (high << 6) + low;
 592                         len = 2;
 593                 }
 594                 break;
 595
 596         case 0xE: /* 2 or 3 bytes */
 597                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 598                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 599                                 unsigned char low  = ch3 & 0x3f;
 600                                 unsigned char mid  = ch2 & 0x3f;
 601                                 unsigned char high = ch1 & 0x0f;
 602                                 unicode_char = (((high << 6) + mid) << 6) + low;
 603                                 len = 3;
 604                         } else
 605                                 len = 2;
 606                 }
 607                 break;
 608     }
 609
 610     /* update position in utf-text */
 611     *utf_ptr = (char *) (utf + len);
 612
 613     return unicode_char;
 614 }
 615
 616
 617 /* utf_strlen ******************************************************************
 618
 619    Determine number of unicode characters in the utf string.
 620
 621 *******************************************************************************/
 622
 623 u4 utf_strlen(utf *u)
 624 {
 625         char *endpos;                       /* points behind utf string           */
 626         char *utf_ptr;                      /* current position in utf text       */
 627         u4 len = 0;                         /* number of unicode characters       */
 628
 629         if (!u) {
 630                 *exceptionptr = new_nullpointerexception();
 631                 return 0;
 632         }
 633
 634         endpos = utf_end(u);
 635         utf_ptr = u->text;
 636
 637         while (utf_ptr < endpos) {
 638                 len++;
 639                 /* next unicode character */
 640                 utf_nextu2(&utf_ptr);
 641         }
 642
 643         if (utf_ptr != endpos)
 644                 /* string ended abruptly */
 645                 throw_cacao_exception_exit(string_java_lang_InternalError,
 646                                                                    "Illegal utf8 string");
 647
 648         return len;
 649 }
 650
 651
 652 /* u2_utflength ****************************************************************
 653
 654    Returns the utf length in bytes of a u2 array.
 655
 656 *******************************************************************************/
 657
 658 u4 u2_utflength(u2 *text, u4 u2_length)
 659 {
 660         u4 result_len = 0;                  /* utf length in bytes                */
 661         u2 ch;                              /* current unicode character          */
 662         u4 len;
 663
 664         for (len = 0; len < u2_length; len++) {
 665                 /* next unicode character */
 666                 ch = *text++;
 667
 668                 /* determine bytes required to store unicode character as utf */
 669                 if (ch && (ch < 0x80))
 670                         result_len++;
 671                 else if (ch < 0x800)
 672                         result_len += 2;
 673                 else
 674                         result_len += 3;
 675         }
 676
 677     return result_len;
 678 }
 679
 680
 681 /* utf_display *****************************************************************
 682
 683    Write utf symbol to stdout (for debugging purposes).
 684
 685 *******************************************************************************/
 686
 687 void utf_display(utf *u)
 688 {
 689         char *endpos;                       /* points behind utf string           */
 690         char *utf_ptr;                      /* current position in utf text       */
 691
 692         if (!u) {
 693                 printf("NULL");
 694                 fflush(stdout);
 695                 return;
 696         }
 697
 698         endpos = utf_end(u);
 699         utf_ptr = u->text;
 700
 701         while (utf_ptr < endpos) {
 702                 /* read next unicode character */
 703                 u2 c = utf_nextu2(&utf_ptr);
 704                 if (c >= 32 && c <= 127) printf("%c", c);
 705                 else printf("?");
 706         }
 707
 708         fflush(stdout);
 709 }
 710
 711
 712 /* utf_display_classname *******************************************************
 713
 714    Write utf symbol to stdout with `/' converted to `.' (for debugging
 715    purposes).
 716
 717 *******************************************************************************/
 718
 719 void utf_display_classname(utf *u)
 720 {
 721         char *endpos;                       /* points behind utf string           */
 722         char *utf_ptr;                      /* current position in utf text       */
 723
 724         if (!u) {
 725                 printf("NULL");
 726                 fflush(stdout);
 727                 return;
 728         }
 729
 730         endpos = utf_end(u);
 731         utf_ptr = u->text;
 732
 733         while (utf_ptr < endpos) {
 734                 /* read next unicode character */
 735                 u2 c = utf_nextu2(&utf_ptr);
 736                 if (c == '/') c = '.';
 737                 if (c >= 32 && c <= 127) printf("%c", c);
 738                 else printf("?");
 739         }
 740
 741         fflush(stdout);
 742 }
 743
 744
 745 /* utf_sprint ******************************************************************
 746
 747    Write utf symbol into c-string (for debugging purposes).
 748
 749 *******************************************************************************/
 750
 751 void utf_sprint(char *buffer, utf *u)
 752 {
 753         char *endpos;                       /* points behind utf string           */
 754         char *utf_ptr;                      /* current position in utf text       */
 755         u2 pos = 0;                         /* position in c-string               */
 756
 757         if (!u) {
 758                 strcpy(buffer, "NULL");
 759                 return;
 760         }
 761
 762         endpos = utf_end(u);
 763         utf_ptr = u->text;
 764
 765         while (utf_ptr < endpos)
 766                 /* copy next unicode character */
 767                 buffer[pos++] = utf_nextu2(&utf_ptr);
 768
 769         /* terminate string */
 770         buffer[pos] = '\0';
 771 }
 772
 773
 774 /* utf_sprint_classname ********************************************************
 775
 776    Write utf symbol into c-string with `/' converted to `.' (for debugging
 777    purposes).
 778
 779 *******************************************************************************/
 780
 781 void utf_sprint_classname(char *buffer, utf *u)
 782 {
 783         char *endpos;                       /* points behind utf string           */
 784         char *utf_ptr;                      /* current position in utf text       */
 785         u2 pos = 0;                         /* position in c-string               */
 786
 787         if (!u) {
 788                 strcpy(buffer, "NULL");
 789                 return;
 790         }
 791
 792         endpos = utf_end(u);
 793         utf_ptr = u->text;
 794
 795         while (utf_ptr < endpos) {
 796                 /* copy next unicode character */
 797                 u2 c = utf_nextu2(&utf_ptr);
 798                 if (c == '/') c = '.';
 799                 buffer[pos++] = c;
 800         }
 801
 802         /* terminate string */
 803         buffer[pos] = '\0';
 804 }
 805
 806
 807 /* utf_strcat ******************************************************************
 808
 809    Like libc strcat, but uses an utf8 string.
 810
 811 *******************************************************************************/
 812
 813 void utf_strcat(char *buffer, utf *u)
 814 {
 815         utf_sprint(buffer + strlen(buffer), u);
 816 }
 817
 818
 819 /* utf_strcat_classname ********************************************************
 820
 821    Like libc strcat, but uses an utf8 string.
 822
 823 *******************************************************************************/
 824
 825 void utf_strcat_classname(char *buffer, utf *u)
 826 {
 827         utf_sprint_classname(buffer + strlen(buffer), u);
 828 }
 829
 830
 831 /* utf_fprint ******************************************************************
 832
 833    Write utf symbol into file.
 834
 835 *******************************************************************************/
 836
 837 void utf_fprint(FILE *file, utf *u)
 838 {
 839         char *endpos;                       /* points behind utf string           */
 840         char *utf_ptr;                      /* current position in utf text       */
 841
 842         if (!u)
 843                 return;
 844
 845         endpos = utf_end(u);
 846         utf_ptr = u->text;
 847
 848         while (utf_ptr < endpos) {
 849                 /* read next unicode character */
 850                 u2 c = utf_nextu2(&utf_ptr);
 851
 852                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 853                 else fprintf(file, "?");
 854         }
 855 }
 856
 857
 858 /* utf_fprint_classname ********************************************************
 859
 860    Write utf symbol into file with `/' converted to `.'.
 861
 862 *******************************************************************************/
 863
 864 void utf_fprint_classname(FILE *file, utf *u)
 865 {
 866         char *endpos;                       /* points behind utf string           */
 867         char *utf_ptr;                      /* current position in utf text       */
 868
 869     if (!u)
 870                 return;
 871
 872         endpos = utf_end(u);
 873         utf_ptr = u->text;
 874
 875         while (utf_ptr < endpos) {
 876                 /* read next unicode character */
 877                 u2 c = utf_nextu2(&utf_ptr);
 878                 if (c == '/') c = '.';
 879
 880                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 881                 else fprintf(file, "?");
 882         }
 883 }
 884
 885
 886 /* is_valid_utf ****************************************************************
 887
 888    Return true if the given string is a valid UTF-8 string.
 889
 890    utf_ptr...points to first character
 891    end_pos...points after last character
 892
 893 *******************************************************************************/
 894
 895 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
 896
 897 bool is_valid_utf(char *utf_ptr, char *end_pos)
 898 {
 899         int bytes;
 900         int len,i;
 901         char c;
 902         unsigned long v;
 903
 904         if (end_pos < utf_ptr) return false;
 905         bytes = end_pos - utf_ptr;
 906         while (bytes--) {
 907                 c = *utf_ptr++;
 908
 909                 if (!c) return false;                     /* 0x00 is not allowed */
 910                 if ((c & 0x80) == 0) continue;            /* ASCII */
 911
 912                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
 913                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
 914                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
 915                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
 916                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
 917                 else return false;                        /* invalid leading byte */
 918
 919                 if (len > 2) return false;                /* Java limitation */
 920
 921                 v = (unsigned long)c & (0x3f >> len);
 922
 923                 if ((bytes -= len) < 0) return false;     /* missing bytes */
 924
 925                 for (i = len; i--; ) {
 926                         c = *utf_ptr++;
 927                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
 928                                 return false;
 929                         v = (v << 6) | (c & 0x3f);
 930                 }
 931
 932                 if (v == 0) {
 933                         if (len != 1) return false;           /* Java special */
 934
 935                 } else {
 936                         /* Sun Java seems to allow overlong UTF-8 encodings */
 937
 938                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
 939                                 if (!opt_liberalutf)
 940                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
 941                                 /* XXX change this to panic? */
 942                         }
 943                 }
 944
 945                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
 946                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
 947
 948                 /* even these seem to be allowed */
 949                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
 950         }
 951
 952         return true;
 953 }
 954
 955
 956 /* is_valid_name ***************************************************************
 957
 958    Return true if the given string may be used as a class/field/method
 959    name. (Currently this only disallows empty strings and control
 960    characters.)
 961
 962    NOTE: The string is assumed to have passed is_valid_utf!
 963
 964    utf_ptr...points to first character
 965    end_pos...points after last character
 966
 967 *******************************************************************************/
 968
 969 bool is_valid_name(char *utf_ptr, char *end_pos)
 970 {
 971         if (end_pos <= utf_ptr) return false; /* disallow empty names */
 972
 973         while (utf_ptr < end_pos) {
 974                 unsigned char c = *utf_ptr++;
 975
 976                 if (c < 0x20) return false; /* disallow control characters */
 977                 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80)  /* disallow zero */
 978                         return false;
 979         }
 980
 981         return true;
 982 }
 983
 984 bool is_valid_name_utf(utf *u)
 985 {
 986         return is_valid_name(u->text,utf_end(u));
 987 }
 988
 989
 990 /* utf_show ********************************************************************
 991
 992    Writes the utf symbols in the utfhash to stdout and displays the
 993    number of external hash chains grouped according to the chainlength
 994    (for debugging purposes).
 995
 996 *******************************************************************************/
 997
 998 void utf_show(void)
 999 {
1000
1001 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1002
1003         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1004         u4 max_chainlength = 0;      /* maximum length of the chains */
1005         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1006         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1007         u4 i;
1008
1009         printf ("UTF-HASH:\n");
1010
1011         /* show element of utf-hashtable */
1012         for (i=0; i<utf_hash.size; i++) {
1013                 utf *u = utf_hash.ptr[i];
1014                 if (u) {
1015                         printf ("SLOT %d: ", (int) i);
1016                         while (u) {
1017                                 printf ("'");
1018                                 utf_display (u);
1019                                 printf ("' ");
1020                                 u = u->hashlink;
1021                         }
1022                         printf ("\n");
1023                 }
1024
1025         }
1026
1027         printf ("UTF-HASH: %d slots for %d entries\n",
1028                         (int) utf_hash.size, (int) utf_hash.entries );
1029
1030
1031         if (utf_hash.entries == 0)
1032                 return;
1033
1034         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1035
1036         for (i=0;i<CHAIN_LIMIT;i++)
1037                 chain_count[i]=0;
1038
1039         /* count numbers of hashchains according to their length */
1040         for (i=0; i<utf_hash.size; i++) {
1041
1042                 utf *u = (utf*) utf_hash.ptr[i];
1043                 u4 chain_length = 0;
1044
1045                 /* determine chainlength */
1046                 while (u) {
1047                         u = u->hashlink;
1048                         chain_length++;
1049                 }
1050
1051                 /* update sum of all chainlengths */
1052                 sum_chainlength+=chain_length;
1053
1054                 /* determine the maximum length of the chains */
1055                 if (chain_length>max_chainlength)
1056                         max_chainlength = chain_length;
1057
1058                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1059                 if (chain_length>=CHAIN_LIMIT) {
1060                         beyond_limit+=chain_length;
1061                         chain_length=CHAIN_LIMIT-1;
1062                 }
1063
1064                 /* update number of hashchains of current length */
1065                 chain_count[chain_length]++;
1066         }
1067
1068         /* display results */
1069         for (i=1;i<CHAIN_LIMIT-1;i++)
1070                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1071
1072         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1073
1074
1075         printf("max. chainlength:%5d\n",max_chainlength);
1076
1077         /* avg. chainlength = sum of chainlengths / number of chains */
1078         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1079 }
1080
1081
1082 /*
1083  * These are local overrides for various environment variables in Emacs.
1084  * Please do not remove this and leave it at the end of the file, where
1085  * Emacs will automagically detect them.
1086  * ---------------------------------------------------------------------
1087  * Local variables:
1088  * mode: c
1089  * indent-tabs-mode: t
1090  * c-basic-offset: 4
1091  * tab-width: 4
1092  * End:
1093  */